titusz committed on
Commit
9f0dcde
·
verified ·
1 Parent(s): a9f5bd3

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (3) hide show
  1. iscc_sct/dev.py +22 -0
  2. pyproject.toml +8 -2
  3. space.yml +0 -34
iscc_sct/dev.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pathlib


# Directory containing this module; the conversion walks its parent directory.
HERE = pathlib.Path(__file__).parent.absolute()


def convert_lf() -> None:  # pragma: no cover
    """Convert CRLF line endings to LF in matching files, in place.

    Recursively walks ``HERE.parent`` and inspects every regular file whose
    suffix is one of the listed extensions. Files are read and written as
    bytes, so no text decoding takes place. Only files that actually contain
    a CRLF sequence are rewritten. The number of rewritten files is printed
    when the walk is finished.
    """
    crlf = b"\r\n"
    lf = b"\n"
    extensions = {".py", ".toml", ".lock", ".txt", ".yml", ".sh", ".md"}
    n = 0
    for fp in HERE.parent.glob("**/*"):
        if fp.suffix not in extensions:
            continue
        # The glob also yields directories; one whose name ends in a listed
        # suffix (e.g. ``something.py/``) cannot be opened as a file.
        if not fp.is_file():
            continue
        content = fp.read_bytes()
        if crlf not in content:
            # Leave files that are already LF-only untouched.
            continue
        fp.write_bytes(content.replace(crlf, lf))
        n += 1
    print(f"{n} files converted to LF")
pyproject.toml CHANGED
@@ -84,11 +84,17 @@ line-length = 119
84
  [tool.ruff.format]
85
  line-ending = "lf"
86
 
 
 
 
87
  [tool.poe.tasks]
88
  format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
89
  format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
90
- test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100 --cov-report=term-missing --color=yes", help = "Run tests with coverage" }
91
- all = ["format-code", "format-markdown", "test"]
 
 
 
92
 
93
  [build-system]
94
  requires = ["poetry-core>=1.0.0"]
 
84
  [tool.ruff.format]
85
  line-ending = "lf"
86
 
87
+ [tool.coverage.run]
88
+ omit = ["iscc_sct/dev.py", "tests/"]
89
+
90
  [tool.poe.tasks]
91
  format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
92
  format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
93
+ convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
94
+ test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
95
+ update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
96
+ all = ["format-code", "format-markdown", "convert-lf", "test"]
97
+ update = ["update-dependencies", "all"]
98
 
99
  [build-system]
100
  requires = ["poetry-core>=1.0.0"]
space.yml CHANGED
@@ -4,40 +4,6 @@ colorFrom: red
4
  colorTo: blue
5
  sdk: gradio
6
  sdk_version: 4.41.0
7
- app_file: ./iscc_sct/demo.py
8
  pinned: true
9
  license: CC-BY-NC-SA-4.0
10
  short_description: Cross Lingual Similarity Preserving Text Simprints
11
- description: >
12
- # ISCC-LAB - Semantic-Code Text
13
-
14
- `iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
15
- [ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
16
- short identifiers created from text documents that preserve similarity (in hamming distance)
17
- for semantically similar cross-lingual text inputs.
18
-
19
- ## What is the ISCC
20
-
21
- The ISCC is a combination of various similarity preserving fingerprints and an identifier for
22
- digital media content.
23
-
24
- ISCCs are generated algorithmically from digital content, just like cryptographic hashes. However,
25
- instead of using a single cryptographic hash function to identify data only, the ISCC uses various
26
- algorithms to create a composite identifier that exhibits similarity-preserving properties (soft
27
- hash or Simprint).
28
-
29
- The component-based structure of the ISCC identifies content at multiple levels of abstraction. Each
30
- component is self-describing, modular, and can be used separately or with others to aid in various
31
- content identification tasks. The algorithmic design supports content deduplication, database
32
- synchronization, indexing, integrity verification, timestamping, versioning, data provenance,
33
- similarity clustering, anomaly detection, usage tracking, allocation of royalties, fact-checking and
34
- general digital asset management use-cases.
35
-
36
-
37
- ## ISCC Status
38
-
39
- The [ISCC](https://iscc.codes) is an ISO Standard published under
40
- [ISO 24138:2024](https://www.iso.org/standard/77899.html) - International Standard Content Code
41
- within [ISO/TC 46/SC 9/WG 18](https://www.iso.org/committee/48836.html).
42
-
43
- The algorithms of this `iscc-sct` are experimental and not (yet) part of the official standard.
 
4
  colorTo: blue
5
  sdk: gradio
6
  sdk_version: 4.41.0
 
7
  pinned: true
8
  license: CC-BY-NC-SA-4.0
9
  short_description: Cross Lingual Similarity Preserving Text Simprints