titusz committed on
Commit
fda8a7a
·
verified ·
1 Parent(s): 9f0dcde

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (2) hide show
  1. iscc_sct/demo.py +149 -20
  2. tests/test_demo.py +48 -0
iscc_sct/demo.py CHANGED
@@ -15,6 +15,7 @@ Text Codes for the Texts. Below the result outputs we show the similarity of the
15
  from loguru import logger as log
16
  import gradio as gr
17
  import iscc_sct as sct
 
18
 
19
 
20
  def compute_iscc_code(text1, text2, bit_length):
@@ -29,6 +30,10 @@ def compare_codes(code_a, code_b, bits):
29
  return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
30
 
31
 
 
 
 
 
32
  def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
33
  """Approximate the cosine similarity for a given hamming distance and dimension"""
34
  result = 1 - (2 * hamming_distance) / dim
@@ -63,13 +68,66 @@ def generate_similarity_bar(similarity):
63
 
64
 
65
  # Sample texts
66
- sample_text_en = "This is a sample text in English to demonstrate the ISCC-CODE generation."
67
- sample_text_de = "Dies ist ein Beispieltext auf Deutsch, um die Erzeugung von ISCC-CODES zu demonstrieren."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  custom_css = """
70
  #chunked-text span.label {
71
  text-transform: none !important;
72
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  """
74
 
75
  iscc_theme = gr.themes.Default(
@@ -93,29 +151,41 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
93
  minimum=64,
94
  maximum=256,
95
  step=32,
96
- value=64,
97
  )
98
  with gr.Row(variant="panel"):
99
  with gr.Column(variant="panel"):
100
  in_text_a = gr.TextArea(
101
- label="Text",
102
- placeholder="Paste your text here or select sample from below",
103
  lines=12,
104
  max_lines=12,
105
  )
106
 
107
- gr.Examples(label="Sample Text", examples=[sample_text_en], inputs=[in_text_a])
 
 
 
 
 
108
  out_code_a = gr.Textbox(label="ISCC Code for Text A")
 
109
  with gr.Column(variant="panel"):
110
  in_text_b = gr.TextArea(
111
- label="Text",
112
- placeholder="Paste your text here or select sample from below",
113
  lines=12,
114
  max_lines=12,
115
  )
116
 
117
- gr.Examples(label="Sample Text", examples=[sample_text_de], inputs=[in_text_b])
 
 
 
 
 
118
  out_code_b = gr.Textbox(label="ISCC Code for Text B")
 
119
 
120
  with gr.Row(variant="panel"):
121
  with gr.Column(variant="panel"):
@@ -124,30 +194,89 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
124
  def process_text(text, nbits, suffix):
125
  log.debug(f"{text[:20]}")
126
  if not text:
127
- return
128
- out_code_func = globals().get(f"out_code_{suffix}")
129
- iscc = sct.Metadata(**sct.gen_text_code_semantic(text, bits=nbits))
130
- result = {out_code_func: gr.Textbox(value=iscc.iscc)}
131
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  in_text_a.change(
134
- lambda text, nbits: process_text(text, nbits, "a"),
135
  inputs=[in_text_a, in_iscc_bits],
136
- outputs=[out_code_a],
137
  show_progress="full",
138
  )
139
  in_text_b.change(
140
- lambda text, nbits: process_text(text, nbits, "b"),
141
  inputs=[in_text_b, in_iscc_bits],
142
- outputs=[out_code_b],
 
 
 
 
 
 
 
143
  show_progress="full",
144
  )
145
 
146
  out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
147
  out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
148
- with gr.Row():
149
- gr.ClearButton(components=[in_text_a, in_text_b])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  if __name__ == "__main__": # pragma: no cover
153
  demo.launch()
 
15
  from loguru import logger as log
16
  import gradio as gr
17
  import iscc_sct as sct
18
+ import textwrap
19
 
20
 
21
  def compute_iscc_code(text1, text2, bit_length):
 
30
  return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
31
 
32
 
33
def truncate_text(text, max_length=70):
    """
    Collapse whitespace in `text` and shorten it to at most `max_length` characters.

    :param text: Text to shorten.
    :param max_length: Maximum length of the returned string.
    :return: Single-line text; "..." replaces the words that did not fit.
    """
    placeholder = "..."
    if max_length < len(placeholder):
        # textwrap.shorten raises ValueError when the placeholder itself does not fit
        # into the width, so fall back to a plain cut of the whitespace-collapsed text.
        return " ".join(text.split())[: max(max_length, 0)]
    return textwrap.shorten(text, width=max_length, placeholder=placeholder)
35
+
36
+
37
  def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
38
  """Approximate the cosine similarity for a given hamming distance and dimension"""
39
  result = 1 - (2 * hamming_distance) / dim
 
68
 
69
 
70
  # Sample texts
71
+ sample_text_en = "\n\n".join(
72
+ [
73
+ " ".join(paragraph.split())
74
+ for paragraph in """
75
+ This document specifies the syntax and structure of the International Standard Content Code (ISCC),
76
+ as an identification system for digital assets (including encodings of text, images, audio, video or other content
77
+ across all media sectors). It also describes ISCC metadata and the use of ISCC in conjunction with other schemes, such
78
+ as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
79
+
80
+ An ISCC applies to a specific digital asset and is a data-descriptor deterministically constructed from multiple hash
81
+ digests using the algorithms and rules in this document. This document does not provide information on registration of
82
+ ISCCs.
83
+ """.strip().split("\n\n")
84
+ ]
85
+ )
86
+
87
+ sample_text_de = "\n\n".join(
88
+ [
89
+ " ".join(paragraph.split())
90
+ for paragraph in """
91
+ Dieses Dokument spezifiziert die Syntax und Struktur des International Standard Content Code (ISCC) als
92
+ Identifizierungssystem für digitale Inhalte (einschließlich Kodierungen von Text, Bildern, Audio, Video oder anderen
93
+ Inhalten in allen Medienbereichen). Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung mit
94
+ anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
95
+
96
+ Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor, der deterministisch aus mehreren
97
+ Hash-Digests unter Verwendung der Algorithmen und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält
98
+ keine Informationen über die Registrierung von ISCCs.
99
+ """.strip().split("\n\n")
100
+ ]
101
+ )
102
 
103
  custom_css = """
104
  #chunked-text span.label {
105
  text-transform: none !important;
106
  }
107
+
108
+ .clickable-example {
109
+ cursor: pointer;
110
+ transition: all 0.3s ease;
111
+ }
112
+
113
+ .clickable-example:hover {
114
+ background-color: #f0f0f0;
115
+ transform: scale(1.02);
116
+ }
117
+
118
+ .clickable-example .label-wrap {
119
+ font-weight: bold;
120
+ color: #4a90e2;
121
+ }
122
+
123
+ .truncate-text {
124
+ white-space: nowrap;
125
+ overflow: hidden;
126
+ text-overflow: ellipsis;
127
+ max-width: 300px;
128
+ display: inline-block;
129
+ }
130
+
131
  """
132
 
133
  iscc_theme = gr.themes.Default(
 
151
  minimum=64,
152
  maximum=256,
153
  step=32,
154
+ value=128,
155
  )
156
  with gr.Row(variant="panel"):
157
  with gr.Column(variant="panel"):
158
  in_text_a = gr.TextArea(
159
+ label="Text A",
160
+ placeholder="Click the sample text below or type or paste your text.",
161
  lines=12,
162
  max_lines=12,
163
  )
164
 
165
+ gr.Examples(
166
+ label="Click to use sample text",
167
+ examples=[[truncate_text(sample_text_en)]],
168
+ inputs=[in_text_a],
169
+ examples_per_page=1,
170
+ )
171
  out_code_a = gr.Textbox(label="ISCC Code for Text A")
172
+ gr.ClearButton(components=[in_text_a])
173
  with gr.Column(variant="panel"):
174
  in_text_b = gr.TextArea(
175
+ label="Text B",
176
+ placeholder="Click the sample text below or type or paste your text.",
177
  lines=12,
178
  max_lines=12,
179
  )
180
 
181
+ gr.Examples(
182
+ label="Click to use sample text",
183
+ examples=[[truncate_text(sample_text_de)]],
184
+ inputs=[in_text_b],
185
+ examples_per_page=1,
186
+ )
187
  out_code_b = gr.Textbox(label="ISCC Code for Text B")
188
+ gr.ClearButton(components=[in_text_b])
189
 
190
  with gr.Row(variant="panel"):
191
  with gr.Column(variant="panel"):
 
194
  def process_text(text, nbits, suffix):
195
  log.debug(f"{text[:20]}")
196
  if not text:
197
+ return None, text
198
+ # Use the full sample text if it matches the truncated version
199
+ full_text = (
200
+ sample_text_en
201
+ if text == truncate_text(sample_text_en)
202
+ else sample_text_de
203
+ if text == truncate_text(sample_text_de)
204
+ else text
205
+ )
206
+ iscc = sct.Metadata(**sct.gen_text_code_semantic(full_text, bits=nbits))
207
+ return iscc.iscc, full_text
208
+
209
def recalculate_iscc(text_a, text_b, nbits):
    """
    Regenerate both ISCC codes and their similarity for a given bit length.

    :param text_a: Content of the first text input (may be empty).
    :param text_b: Content of the second text input (may be empty).
    :param nbits: Bit length forwarded to `sct.gen_text_code_semantic`.
    :return: Tuple of two `gr.Textbox` objects and the similarity (None unless both codes exist).
    """
    # Text A is processed before text B; an empty text yields no code.
    codes = [
        sct.gen_text_code_semantic(content, bits=nbits)["iscc"] if content else None
        for content in (text_a, text_b)
    ]
    similarity = compare_codes(codes[0], codes[1], nbits) if all(codes) else None
    box_a, box_b = (gr.Textbox(value=code) if code else gr.Textbox() for code in codes)
    return box_a, box_b, similarity
223
 
224
  in_text_a.change(
225
+ process_text,
226
  inputs=[in_text_a, in_iscc_bits],
227
+ outputs=[out_code_a, in_text_a],
228
  show_progress="full",
229
  )
230
  in_text_b.change(
231
+ process_text,
232
  inputs=[in_text_b, in_iscc_bits],
233
+ outputs=[out_code_b, in_text_b],
234
+ show_progress="full",
235
+ )
236
+
237
+ in_iscc_bits.change(
238
+ recalculate_iscc,
239
+ inputs=[in_text_a, in_text_b, in_iscc_bits],
240
+ outputs=[out_code_a, out_code_b, out_similarity],
241
  show_progress="full",
242
  )
243
 
244
  out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
245
  out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
246
+ with gr.Row(variant="panel"):
247
+ with gr.Column(variant="panel"):
248
+ gr.Markdown(
249
+ """
250
+ ## Understanding ISCC Semantic Text-Codes
251
+
252
+ ### What is an ISCC Semantic Text-Code?
253
+ An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of the text,
254
+ not just the exact words.
255
+
256
+ ### How does it work?
257
+ 1. **Input**: You provide a text in any language.
258
+ 2. **Processing**: Our system analyzes the meaning of the text.
259
+ 3. **Output**: A unique code is generated that represents the text's content.
260
+
261
+ ### What can it do?
262
+ - **Cross-language matching**: It can recognize similar content across different languages.
263
+ - **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
264
+ - **Content identification**: It can help identify texts with similar content, even if the wording is different.
265
 
266
+ ### How to use this demo:
267
+ 1. **Enter text**: Type or paste text into either or both text boxes.
268
+ 2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more detailed).
269
+ 3. **View results**: See the generated ISCC code for each text.
270
+ 4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning.
271
+
272
+ ### Why is this useful?
273
+ - **Content creators**: Find similar content across languages.
274
+ - **Researchers**: Quickly compare documents or find related texts in different languages.
275
+ - **Publishers**: Identify potential translations or similar works efficiently.
276
+
277
+ This technology opens up new possibilities for understanding and managing text content across language barriers!
278
+ """
279
+ )
280
 
281
  if __name__ == "__main__": # pragma: no cover
282
  demo.launch()
tests/test_demo.py CHANGED
@@ -3,6 +3,7 @@ from iscc_sct.demo import (
3
  compare_codes,
4
  hamming_to_cosine,
5
  generate_similarity_bar,
 
6
  )
7
 
8
 
@@ -78,3 +79,50 @@ def test_process_text(mock_gen_text_code):
78
  key, value = next(iter(result.items()))
79
  assert isinstance(key, gr.components.Textbox)
80
  assert isinstance(value, gr.components.Textbox)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  compare_codes,
4
  hamming_to_cosine,
5
  generate_similarity_bar,
6
+ recalculate_iscc,
7
  )
8
 
9
 
 
79
  key, value = next(iter(result.items()))
80
  assert isinstance(key, gr.components.Textbox)
81
  assert isinstance(value, gr.components.Textbox)
82
+
83
+
84
@patch("iscc_sct.demo.sct.gen_text_code_semantic")
@patch("iscc_sct.demo.compare_codes")
def test_recalculate_iscc(mock_compare_codes, mock_gen_text_code):
    """Check recalculate_iscc for every combination of empty and non-empty inputs."""

    def fake_code(text, bits):
        return {"iscc": f"ISCC:{text[:4].upper()}{bits}"}

    def check(actual, expected):
        # Absent values are compared by identity, everything else by equality.
        if expected is None:
            assert actual is None
        else:
            assert actual == expected

    mock_gen_text_code.side_effect = fake_code
    mock_compare_codes.return_value = "<similarity_html>"

    # (call arguments, (expected code A, expected code B, expected similarity))
    scenarios = [
        (("Hello", "World", 64), ("ISCC:HELL64", "ISCC:WORL64", "<similarity_html>")),
        (("", "World", 128), (None, "ISCC:WORL128", None)),
        (("Hello", "", 256), ("ISCC:HELL256", None, None)),
        (("", "", 64), (None, None, None)),
    ]
    for call_args, (want_a, want_b, want_similarity) in scenarios:
        result = recalculate_iscc(*call_args)
        assert len(result) == 3
        assert isinstance(result[0], gr.components.Textbox)
        check(result[0].value, want_a)
        assert isinstance(result[1], gr.components.Textbox)
        check(result[1].value, want_b)
        check(result[2], want_similarity)

    # Verify function calls
    assert mock_gen_text_code.call_count == 4
    assert mock_compare_codes.call_count == 1