Synced repo using 'sync_with_huggingface' Github Action
Browse files- iscc_sct/demo.py +149 -20
- tests/test_demo.py +48 -0
iscc_sct/demo.py
CHANGED
@@ -15,6 +15,7 @@ Text Codes for the Texts. Below the result outputs we show the similarity of the
|
|
15 |
from loguru import logger as log
|
16 |
import gradio as gr
|
17 |
import iscc_sct as sct
|
|
|
18 |
|
19 |
|
20 |
def compute_iscc_code(text1, text2, bit_length):
|
@@ -29,6 +30,10 @@ def compare_codes(code_a, code_b, bits):
|
|
29 |
return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
|
30 |
|
31 |
|
|
|
|
|
|
|
|
|
32 |
def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
|
33 |
"""Approximate the cosine similarity for a given hamming distance and dimension"""
|
34 |
result = 1 - (2 * hamming_distance) / dim
|
@@ -63,13 +68,66 @@ def generate_similarity_bar(similarity):
|
|
63 |
|
64 |
|
65 |
# Sample texts
|
66 |
-
sample_text_en = "
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
custom_css = """
|
70 |
#chunked-text span.label {
|
71 |
text-transform: none !important;
|
72 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
"""
|
74 |
|
75 |
iscc_theme = gr.themes.Default(
|
@@ -93,29 +151,41 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
93 |
minimum=64,
|
94 |
maximum=256,
|
95 |
step=32,
|
96 |
-
value=
|
97 |
)
|
98 |
with gr.Row(variant="panel"):
|
99 |
with gr.Column(variant="panel"):
|
100 |
in_text_a = gr.TextArea(
|
101 |
-
label="Text",
|
102 |
-
placeholder="
|
103 |
lines=12,
|
104 |
max_lines=12,
|
105 |
)
|
106 |
|
107 |
-
gr.Examples(
|
|
|
|
|
|
|
|
|
|
|
108 |
out_code_a = gr.Textbox(label="ISCC Code for Text A")
|
|
|
109 |
with gr.Column(variant="panel"):
|
110 |
in_text_b = gr.TextArea(
|
111 |
-
label="Text",
|
112 |
-
placeholder="
|
113 |
lines=12,
|
114 |
max_lines=12,
|
115 |
)
|
116 |
|
117 |
-
gr.Examples(
|
|
|
|
|
|
|
|
|
|
|
118 |
out_code_b = gr.Textbox(label="ISCC Code for Text B")
|
|
|
119 |
|
120 |
with gr.Row(variant="panel"):
|
121 |
with gr.Column(variant="panel"):
|
@@ -124,30 +194,89 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
124 |
def process_text(text, nbits, suffix):
|
125 |
log.debug(f"{text[:20]}")
|
126 |
if not text:
|
127 |
-
return
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
in_text_a.change(
|
134 |
-
|
135 |
inputs=[in_text_a, in_iscc_bits],
|
136 |
-
outputs=[out_code_a],
|
137 |
show_progress="full",
|
138 |
)
|
139 |
in_text_b.change(
|
140 |
-
|
141 |
inputs=[in_text_b, in_iscc_bits],
|
142 |
-
outputs=[out_code_b],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
show_progress="full",
|
144 |
)
|
145 |
|
146 |
out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
|
147 |
out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
|
148 |
-
with gr.Row():
|
149 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
if __name__ == "__main__": # pragma: no cover
|
153 |
demo.launch()
|
|
|
15 |
from loguru import logger as log
|
16 |
import gradio as gr
|
17 |
import iscc_sct as sct
|
18 |
+
import textwrap
|
19 |
|
20 |
|
21 |
def compute_iscc_code(text1, text2, bit_length):
|
|
|
30 |
return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
|
31 |
|
32 |
|
33 |
+
def truncate_text(text, max_length=70):
    """Return a one-line preview of ``text`` that fits in ``max_length`` characters.

    Whitespace runs are collapsed to single spaces. If the collapsed text is
    longer than ``max_length``, trailing words are dropped and ``"..."`` is
    appended in their place.
    """
    preview = textwrap.shorten(text, width=max_length, placeholder="...")
    return preview
|
35 |
+
|
36 |
+
|
37 |
def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
|
38 |
"""Approximate the cosine similarity for a given hamming distance and dimension"""
|
39 |
result = 1 - (2 * hamming_distance) / dim
|
|
|
68 |
|
69 |
|
70 |
# Sample texts
|
71 |
+
def _normalize_paragraphs(raw):
    """Collapse whitespace inside each blank-line separated paragraph of ``raw``.

    Leading/trailing whitespace of ``raw`` is stripped, every paragraph is
    reduced to a single line with single spaces, and the paragraphs are
    re-joined with exactly one empty line between them.
    """
    paragraphs = raw.strip().split("\n\n")
    return "\n\n".join(" ".join(paragraph.split()) for paragraph in paragraphs)


# English sample text (two paragraphs).
sample_text_en = _normalize_paragraphs(
    """
This document specifies the syntax and structure of the International Standard Content Code (ISCC),
as an identification system for digital assets (including encodings of text, images, audio, video or other content
across all media sectors). It also describes ISCC metadata and the use of ISCC in conjunction with other schemes, such
as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.

An ISCC applies to a specific digital asset and is a data-descriptor deterministically constructed from multiple hash
digests using the algorithms and rules in this document. This document does not provide information on registration of
ISCCs.
"""
)

# German sample text (two paragraphs).
sample_text_de = _normalize_paragraphs(
    """
Dieses Dokument spezifiziert die Syntax und Struktur des International Standard Content Code (ISCC) als
Identifizierungssystem für digitale Inhalte (einschließlich Kodierungen von Text, Bildern, Audio, Video oder anderen
Inhalten in allen Medienbereichen). Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung mit
anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.

Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor, der deterministisch aus mehreren
Hash-Digests unter Verwendung der Algorithmen und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält
keine Informationen über die Registrierung von ISCCs.
"""
)
|
102 |
|
103 |
custom_css = """
|
104 |
#chunked-text span.label {
|
105 |
text-transform: none !important;
|
106 |
}
|
107 |
+
|
108 |
+
.clickable-example {
|
109 |
+
cursor: pointer;
|
110 |
+
transition: all 0.3s ease;
|
111 |
+
}
|
112 |
+
|
113 |
+
.clickable-example:hover {
|
114 |
+
background-color: #f0f0f0;
|
115 |
+
transform: scale(1.02);
|
116 |
+
}
|
117 |
+
|
118 |
+
.clickable-example .label-wrap {
|
119 |
+
font-weight: bold;
|
120 |
+
color: #4a90e2;
|
121 |
+
}
|
122 |
+
|
123 |
+
.truncate-text {
|
124 |
+
white-space: nowrap;
|
125 |
+
overflow: hidden;
|
126 |
+
text-overflow: ellipsis;
|
127 |
+
max-width: 300px;
|
128 |
+
display: inline-block;
|
129 |
+
}
|
130 |
+
|
131 |
"""
|
132 |
|
133 |
iscc_theme = gr.themes.Default(
|
|
|
151 |
minimum=64,
|
152 |
maximum=256,
|
153 |
step=32,
|
154 |
+
value=128,
|
155 |
)
|
156 |
with gr.Row(variant="panel"):
|
157 |
with gr.Column(variant="panel"):
|
158 |
in_text_a = gr.TextArea(
|
159 |
+
label="Text A",
|
160 |
+
placeholder="Click the sample text below or type or paste your text.",
|
161 |
lines=12,
|
162 |
max_lines=12,
|
163 |
)
|
164 |
|
165 |
+
gr.Examples(
|
166 |
+
label="Click to use sample text",
|
167 |
+
examples=[[truncate_text(sample_text_en)]],
|
168 |
+
inputs=[in_text_a],
|
169 |
+
examples_per_page=1,
|
170 |
+
)
|
171 |
out_code_a = gr.Textbox(label="ISCC Code for Text A")
|
172 |
+
gr.ClearButton(components=[in_text_a])
|
173 |
with gr.Column(variant="panel"):
|
174 |
in_text_b = gr.TextArea(
|
175 |
+
label="Text B",
|
176 |
+
placeholder="Click the sample text below or type or paste your text.",
|
177 |
lines=12,
|
178 |
max_lines=12,
|
179 |
)
|
180 |
|
181 |
+
gr.Examples(
|
182 |
+
label="Click to use sample text",
|
183 |
+
examples=[[truncate_text(sample_text_de)]],
|
184 |
+
inputs=[in_text_b],
|
185 |
+
examples_per_page=1,
|
186 |
+
)
|
187 |
out_code_b = gr.Textbox(label="ISCC Code for Text B")
|
188 |
+
gr.ClearButton(components=[in_text_b])
|
189 |
|
190 |
with gr.Row(variant="panel"):
|
191 |
with gr.Column(variant="panel"):
|
|
|
194 |
def process_text(text, nbits, suffix=None):
    """Generate the ISCC semantic text-code for the content of a text input.

    If ``text`` equals the truncated preview of one of the bundled sample
    texts, the full sample text is substituted before the code is generated.

    :param text: Current content of the text input; may be empty or ``None``.
    :param nbits: Bit length forwarded to ``sct.gen_text_code_semantic``.
    :param suffix: Not used by this function. It is optional so the function
        can be registered directly on event listeners that supply only
        ``text`` and ``nbits``; callers passing three arguments keep working.
    :return: Tuple ``(iscc_code, text)``. ``iscc_code`` is ``None`` and ``text``
        is returned unchanged when there is nothing to process; otherwise
        ``text`` is the (possibly expanded) text the code was generated from.
    """
    # Guard before slicing for the log line: slicing ``None`` raises TypeError.
    if not text:
        return None, text
    log.debug(f"{text[:20]}")

    # The example widgets only carry a shortened preview, so map it back to
    # the full sample text before generating the code.
    if text == truncate_text(sample_text_en):
        full_text = sample_text_en
    elif text == truncate_text(sample_text_de):
        full_text = sample_text_de
    else:
        full_text = text

    iscc = sct.Metadata(**sct.gen_text_code_semantic(full_text, bits=nbits))
    return iscc.iscc, full_text
|
208 |
+
|
209 |
+
def recalculate_iscc(text_a, text_b, nbits):
    """Regenerate both text-codes and their similarity for a new bit length.

    :param text_a: Content of the first text input (may be empty).
    :param text_b: Content of the second text input (may be empty).
    :param nbits: Bit length forwarded to ``sct.gen_text_code_semantic`` and
        ``compare_codes``.
    :return: Tuple ``(textbox_a, textbox_b, similarity)``. Each textbox carries
        the generated code as its value, or is created without a value when the
        corresponding text is empty. ``similarity`` is the result of
        ``compare_codes`` when both codes exist, otherwise ``None``.
    """

    def _code_for(text):
        # Empty inputs are never sent to the code generator.
        if not text:
            return None
        return sct.gen_text_code_semantic(text, bits=nbits)["iscc"]

    def _as_textbox(code):
        if code:
            return gr.Textbox(value=code)
        return gr.Textbox()

    code_a = _code_for(text_a)
    code_b = _code_for(text_b)

    # A comparison only makes sense when both sides produced a code.
    similarity = compare_codes(code_a, code_b, nbits) if (code_a and code_b) else None

    return _as_textbox(code_a), _as_textbox(code_b), similarity
|
223 |
|
224 |
in_text_a.change(
|
225 |
+
process_text,
|
226 |
inputs=[in_text_a, in_iscc_bits],
|
227 |
+
outputs=[out_code_a, in_text_a],
|
228 |
show_progress="full",
|
229 |
)
|
230 |
in_text_b.change(
|
231 |
+
process_text,
|
232 |
inputs=[in_text_b, in_iscc_bits],
|
233 |
+
outputs=[out_code_b, in_text_b],
|
234 |
+
show_progress="full",
|
235 |
+
)
|
236 |
+
|
237 |
+
in_iscc_bits.change(
|
238 |
+
recalculate_iscc,
|
239 |
+
inputs=[in_text_a, in_text_b, in_iscc_bits],
|
240 |
+
outputs=[out_code_a, out_code_b, out_similarity],
|
241 |
show_progress="full",
|
242 |
)
|
243 |
|
244 |
out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
|
245 |
out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
|
246 |
+
with gr.Row(variant="panel"):
|
247 |
+
with gr.Column(variant="panel"):
|
248 |
+
gr.Markdown(
|
249 |
+
"""
|
250 |
+
## Understanding ISCC Semantic Text-Codes
|
251 |
+
|
252 |
+
### What is an ISCC Semantic Text-Code?
|
253 |
+
An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of the text,
|
254 |
+
not just the exact words.
|
255 |
+
|
256 |
+
### How does it work?
|
257 |
+
1. **Input**: You provide a text in any language.
|
258 |
+
2. **Processing**: Our system analyzes the meaning of the text.
|
259 |
+
3. **Output**: A unique code is generated that represents the text's content.
|
260 |
+
|
261 |
+
### What can it do?
|
262 |
+
- **Cross-language matching**: It can recognize similar content across different languages.
|
263 |
+
- **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
|
264 |
+
- **Content identification**: It can help identify texts with similar content, even if the wording is different.
|
265 |
|
266 |
+
### How to use this demo:
|
267 |
+
1. **Enter text**: Type or paste text into either or both text boxes.
|
268 |
+
2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more detailed).
|
269 |
+
3. **View results**: See the generated ISCC code for each text.
|
270 |
+
4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning.
|
271 |
+
|
272 |
+
### Why is this useful?
|
273 |
+
- **Content creators**: Find similar content across languages.
|
274 |
+
- **Researchers**: Quickly compare documents or find related texts in different languages.
|
275 |
+
- **Publishers**: Identify potential translations or similar works efficiently.
|
276 |
+
|
277 |
+
This technology opens up new possibilities for understanding and managing text content across language barriers!
|
278 |
+
"""
|
279 |
+
)
|
280 |
|
281 |
if __name__ == "__main__": # pragma: no cover
|
282 |
demo.launch()
|
tests/test_demo.py
CHANGED
@@ -3,6 +3,7 @@ from iscc_sct.demo import (
|
|
3 |
compare_codes,
|
4 |
hamming_to_cosine,
|
5 |
generate_similarity_bar,
|
|
|
6 |
)
|
7 |
|
8 |
|
@@ -78,3 +79,50 @@ def test_process_text(mock_gen_text_code):
|
|
78 |
key, value = next(iter(result.items()))
|
79 |
assert isinstance(key, gr.components.Textbox)
|
80 |
assert isinstance(value, gr.components.Textbox)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
compare_codes,
|
4 |
hamming_to_cosine,
|
5 |
generate_similarity_bar,
|
6 |
+
recalculate_iscc,
|
7 |
)
|
8 |
|
9 |
|
|
|
79 |
key, value = next(iter(result.items()))
|
80 |
assert isinstance(key, gr.components.Textbox)
|
81 |
assert isinstance(value, gr.components.Textbox)
|
82 |
+
|
83 |
+
|
84 |
+
@patch("iscc_sct.demo.sct.gen_text_code_semantic")
@patch("iscc_sct.demo.compare_codes")
def test_recalculate_iscc(mock_compare_codes, mock_gen_text_code):
    """Check recalculate_iscc for every combination of empty and non-empty inputs."""

    def fake_text_code(text, bits):
        # Deterministic stand-in: first four characters upper-cased plus the bit length.
        return {"iscc": f"ISCC:{text[:4].upper()}{bits}"}

    mock_gen_text_code.side_effect = fake_text_code
    mock_compare_codes.return_value = "<similarity_html>"

    # (text_a, text_b, nbits, expected code A, expected code B, expected similarity)
    scenarios = [
        ("Hello", "World", 64, "ISCC:HELL64", "ISCC:WORL64", "<similarity_html>"),
        ("", "World", 128, None, "ISCC:WORL128", None),
        ("Hello", "", 256, "ISCC:HELL256", None, None),
        ("", "", 64, None, None, None),
    ]

    for text_a, text_b, nbits, want_a, want_b, want_similarity in scenarios:
        result = recalculate_iscc(text_a, text_b, nbits)
        assert len(result) == 3

        for box, want_code in zip(result[:2], (want_a, want_b)):
            assert isinstance(box, gr.components.Textbox)
            if want_code is None:
                assert box.value is None
            else:
                assert box.value == want_code

        if want_similarity is None:
            assert result[2] is None
        else:
            assert result[2] == want_similarity

    # Only non-empty texts are encoded (2 + 1 + 1 + 0 calls) and only the
    # first scenario has two codes to compare.
    assert mock_gen_text_code.call_count == 4
    assert mock_compare_codes.call_count == 1
|