Synced repo using 'sync_with_huggingface' Github Action
Browse files
- CHANGELOG.md +2 -0
- iscc_sct/__init__.py +1 -1
- iscc_sct/demo.py +23 -10
- poetry.lock +3 -3
- pyproject.toml +1 -1
- tests/test_iscc_sct.py +1 -1
CHANGELOG.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
# Changelog
|
2 |
|
|
|
|
|
3 |
## [0.1.2] - 2024-08-19
|
4 |
- Encode granular features with base64
|
5 |
- Refactor result format to generic ISCC data model
|
|
|
1 |
# Changelog
|
2 |
|
3 |
+
## [0.1.3] - Unreleased
|
4 |
+
|
5 |
## [0.1.2] - 2024-08-19
|
6 |
- Encode granular features with base64
|
7 |
- Refactor result format to generic ISCC data model
|
iscc_sct/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
__version__ = "0.1.2"
|
2 |
from iscc_sct.options import *
|
3 |
from iscc_sct.utils import *
|
4 |
from iscc_sct.code_semantic_text import *
|
|
|
1 |
+
__version__ = "0.1.3"
|
2 |
from iscc_sct.options import *
|
3 |
from iscc_sct.utils import *
|
4 |
from iscc_sct.code_semantic_text import *
|
iscc_sct/demo.py
CHANGED
@@ -443,9 +443,8 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
443 |
)
|
444 |
|
445 |
with gr.Row(variant="panel"):
|
446 |
-
|
447 |
-
|
448 |
-
"""
|
449 |
## Understanding ISCC Semantic Text-Codes
|
450 |
|
451 |
### What is an ISCC Semantic Text-Code?
|
@@ -476,7 +475,11 @@ document-embedding.
|
|
476 |
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
|
477 |
allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
|
478 |
or stored.
|
|
|
|
|
479 |
|
|
|
|
|
480 |
### Why is this useful?
|
481 |
- **Content creators**: Find similar content across languages.
|
482 |
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
@@ -490,20 +493,30 @@ language barriers!
|
|
490 |
The "Explore Details & Advanced Options" section provides additional tools and information:
|
491 |
|
492 |
1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
|
493 |
-
|
494 |
|
495 |
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
|
496 |
-
|
497 |
|
498 |
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
|
499 |
-
|
500 |
|
501 |
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
|
502 |
-
|
503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
504 |
"""
|
505 |
-
|
506 |
-
|
|
|
|
|
|
|
|
|
507 |
|
508 |
if __name__ == "__main__": # pragma: no cover
|
509 |
demo.launch()
|
|
|
443 |
)
|
444 |
|
445 |
with gr.Row(variant="panel"):
|
446 |
+
gr.Markdown(
|
447 |
+
"""
|
|
|
448 |
## Understanding ISCC Semantic Text-Codes
|
449 |
|
450 |
### What is an ISCC Semantic Text-Code?
|
|
|
475 |
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This
|
476 |
allows for efficient and privacy-preserving comparisons, as only the codes need to be shared
|
477 |
or stored.
|
478 |
+
"""
|
479 |
+
)
|
480 |
|
481 |
+
gr.Markdown(
|
482 |
+
"""
|
483 |
### Why is this useful?
|
484 |
- **Content creators**: Find similar content across languages.
|
485 |
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
|
|
493 |
The "Explore Details & Advanced Options" section provides additional tools and information:
|
494 |
|
495 |
1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
|
496 |
+
comparisons but may be more sensitive to minor differences.
|
497 |
|
498 |
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
|
499 |
+
for processing.
|
500 |
|
501 |
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
|
502 |
+
color-coded and labeled with its size and simprint (a similarity preserving fingerprint).
|
503 |
|
504 |
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
|
505 |
+
This table shows which specific parts of the texts are most similar (above 80%), along with their
|
506 |
+
approximate cosine similarity (scaled -100% to +100%).
|
507 |
+
|
508 |
+
For more information about the **ISCC** see:
|
509 |
+
- https://github.com/iscc
|
510 |
+
- https://iscc.codes
|
511 |
+
- https://iscc.io
|
512 |
+
- [ISO 24138:2024](https://www.iso.org/standard/77899.html)
|
513 |
"""
|
514 |
+
)
|
515 |
+
with gr.Row():
|
516 |
+
gr.Markdown(
|
517 |
+
f"iscc-sct v{sct.__version__} | Source Code: https://github.com/iscc/iscc-sct",
|
518 |
+
elem_classes="footer",
|
519 |
+
)
|
520 |
|
521 |
if __name__ == "__main__": # pragma: no cover
|
522 |
demo.launch()
|
poetry.lock
CHANGED
@@ -755,13 +755,13 @@ files = [
|
|
755 |
|
756 |
[[package]]
|
757 |
name = "importlib-metadata"
|
758 |
-
version = "8.
|
759 |
description = "Read metadata from Python packages"
|
760 |
optional = false
|
761 |
python-versions = ">=3.8"
|
762 |
files = [
|
763 |
-
{file = "importlib_metadata-8.
|
764 |
-
{file = "importlib_metadata-8.
|
765 |
]
|
766 |
|
767 |
[package.dependencies]
|
|
|
755 |
|
756 |
[[package]]
|
757 |
name = "importlib-metadata"
|
758 |
+
version = "8.3.0"
|
759 |
description = "Read metadata from Python packages"
|
760 |
optional = false
|
761 |
python-versions = ">=3.8"
|
762 |
files = [
|
763 |
+
{file = "importlib_metadata-8.3.0-py3-none-any.whl", hash = "sha256:42817a4a0be5845d22c6e212db66a94ad261e2318d80b3e0d363894a79df2b67"},
|
764 |
+
{file = "importlib_metadata-8.3.0.tar.gz", hash = "sha256:9c8fa6e8ea0f9516ad5c8db9246a731c948193c7754d3babb0114a05b27dd364"},
|
765 |
]
|
766 |
|
767 |
[package.dependencies]
|
pyproject.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
[tool.poetry]
|
2 |
name = "iscc-sct"
|
3 |
-
version = "0.1.2"
|
4 |
description = "ISCC - Semantic Code Text"
|
5 |
authors = ["Titusz <[email protected]>"]
|
6 |
license = "CC-BY-NC-SA-4.0"
|
|
|
1 |
[tool.poetry]
|
2 |
name = "iscc-sct"
|
3 |
+
version = "0.1.3"
|
4 |
description = "ISCC - Semantic Code Text"
|
5 |
authors = ["Titusz <[email protected]>"]
|
6 |
license = "CC-BY-NC-SA-4.0"
|
tests/test_iscc_sct.py
CHANGED
@@ -31,7 +31,7 @@ be matched based on lexical similarity.
|
|
31 |
|
32 |
|
33 |
def test_version():
|
34 |
-
assert sct.__version__ == "0.1.2"
|
35 |
|
36 |
|
37 |
def test_code_text_semantic_default():
|
|
|
31 |
|
32 |
|
33 |
def test_version():
|
34 |
+
assert sct.__version__ == "0.1.3"
|
35 |
|
36 |
|
37 |
def test_code_text_semantic_default():
|