Synced repo using 'sync_with_huggingface' Github Action
Browse files- iscc_sct/dev.py +22 -0
- pyproject.toml +8 -2
- space.yml +0 -34
iscc_sct/dev.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pathlib
|
2 |
+
|
3 |
+
|
4 |
+
HERE = pathlib.Path(__file__).parent.absolute()
|
5 |
+
|
6 |
+
|
7 |
+
def convert_lf(root: "pathlib.Path | str | None" = None) -> None:  # pragma: no cover
    """Convert CRLF line endings to LF in text files below a directory.

    Recursively scans every path below ``root`` and rewrites, in place and in
    binary mode, each regular file whose suffix is one of the known text
    extensions and whose content contains at least one CRLF sequence. The
    number of rewritten files is printed when the scan finishes.

    :param root: Directory to scan. Defaults to the parent directory of the
        package directory (``HERE.parent``) when omitted.
    """
    crlf = b"\r\n"
    lf = b"\n"
    extensions = {".py", ".toml", ".lock", ".txt", ".yml", ".sh", ".md"}
    # HERE is only consulted when the caller does not supply an explicit root.
    base = HERE.parent if root is None else pathlib.Path(root)
    n = 0
    for fp in base.glob("**/*"):
        # A directory can carry a matching suffix (e.g. "docs.md/"); trying to
        # open it as a file would raise and abort the whole run, so skip it.
        if fp.suffix not in extensions or not fp.is_file():
            continue
        content = fp.read_bytes()
        # Leave files that are already LF-only untouched (no rewrite, no count).
        if crlf not in content:
            continue
        fp.write_bytes(content.replace(crlf, lf))
        n += 1
    print(f"{n} files converted to LF")
|
pyproject.toml
CHANGED
@@ -84,11 +84,17 @@ line-length = 119
|
|
84 |
[tool.ruff.format]
|
85 |
line-ending = "lf"
|
86 |
|
|
|
|
|
|
|
87 |
[tool.poe.tasks]
|
88 |
format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
|
89 |
format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
92 |
|
93 |
[build-system]
|
94 |
requires = ["poetry-core>=1.0.0"]
|
|
|
84 |
[tool.ruff.format]
|
85 |
line-ending = "lf"
|
86 |
|
87 |
+
[tool.coverage.run]
|
88 |
+
omit = ["iscc_sct/dev.py", "tests/"]
|
89 |
+
|
90 |
[tool.poe.tasks]
|
91 |
format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
|
92 |
format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
|
93 |
+
convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
|
94 |
+
test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
|
95 |
+
update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
|
96 |
+
all = ["format-code", "format-markdown", "convert-lf", "test"]
|
97 |
+
update = ["update-dependencies", "all"]
|
98 |
|
99 |
[build-system]
|
100 |
requires = ["poetry-core>=1.0.0"]
|
space.yml
CHANGED
@@ -4,40 +4,6 @@ colorFrom: red
|
|
4 |
colorTo: blue
|
5 |
sdk: gradio
|
6 |
sdk_version: 4.41.0
|
7 |
-
app_file: ./iscc_sct/demo.py
|
8 |
pinned: true
|
9 |
license: CC-BY-NC-SA-4.0
|
10 |
short_description: Cross Lingual Similarity Preserving Text Simprints
|
11 |
-
description: >
|
12 |
-
# ISCC-LAB - Semantic-Code Text
|
13 |
-
|
14 |
-
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
|
15 |
-
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
|
16 |
-
short identifiers created from text documents that preserve similarity (in hamming distance)
|
17 |
-
for semantically similar cross-lingual text inputs.
|
18 |
-
|
19 |
-
## What is the ISCC
|
20 |
-
|
21 |
-
The ISCC is a combination of various similarity preserving fingerprints and an identifier for
|
22 |
-
digital media content.
|
23 |
-
|
24 |
-
ISCCs are generated algorithmically from digital content, just like cryptographic hashes. However,
|
25 |
-
instead of using a single cryptographic hash function to identify data only, the ISCC uses various
|
26 |
-
algorithms to create a composite identifier that exhibits similarity-preserving properties (soft
|
27 |
-
hash or Simprint).
|
28 |
-
|
29 |
-
The component-based structure of the ISCC identifies content at multiple levels of abstraction. Each
|
30 |
-
component is self-describing, modular, and can be used separately or with others to aid in various
|
31 |
-
content identification tasks. The algorithmic design supports content deduplication, database
|
32 |
-
synchronization, indexing, integrity verification, timestamping, versioning, data provenance,
|
33 |
-
similarity clustering, anomaly detection, usage tracking, allocation of royalties, fact-checking and
|
34 |
-
general digital asset management use-cases.
|
35 |
-
|
36 |
-
|
37 |
-
## ISCC Status
|
38 |
-
|
39 |
-
The [ISCC](https://iscc.codes) is an ISO Standard published under
|
40 |
-
[ISO 24138:2024](https://www.iso.org/standard/77899.html) - International Standard Content Code
|
41 |
-
within [ISO/TC 46/SC 9/WG 18](https://www.iso.org/committee/48836.html).
|
42 |
-
|
43 |
-
The algorithms of this `iscc-sct` are experimental and not (yet) part of the official standard.
|
|
|
4 |
colorTo: blue
|
5 |
sdk: gradio
|
6 |
sdk_version: 4.41.0
|
|
|
7 |
pinned: true
|
8 |
license: CC-BY-NC-SA-4.0
|
9 |
short_description: Cross Lingual Similarity Preserving Text Simprints
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|