|
""" |
|
Gradio demo showcasing ISCC Semantic Text Code. |
|
""" |
|
|
|
import binascii
import pathlib
import re
import textwrap

import gradio as gr
from loguru import logger as log
import yaml

import iscc_sct as sct
|
|
|
|
|
HERE = pathlib.Path(__file__).parent.absolute() |
|
|
|
|
|
custom_css = """ |
|
.simbar { |
|
background: white; |
|
min-height: 30px; |
|
} |
|
""" |
|
|
|
|
|
newline_symbols = { |
|
"\u000a": "⏎", |
|
"\u000b": "↨", |
|
"\u000c": "␌", |
|
"\u000d": "↵", |
|
"\u0085": "⤓", |
|
"\u2028": "↲", |
|
"\u2029": "¶", |
|
} |
|
|
|
|
|
def no_nl(text): |
|
"""Replace non-printable newline characters with printable symbols""" |
|
for char, symbol in newline_symbols.items(): |
|
text = text.replace(char, symbol) |
|
return text |
|
|
|
|
|
def no_nl_inner(text): |
|
"""Replace non-printable newline characters with printable symbols, ignoring leading and |
|
trailing newlines""" |
|
|
|
stripped_text = text.strip() |
|
|
|
|
|
for char, symbol in newline_symbols.items(): |
|
stripped_text = stripped_text.replace(char, symbol) |
|
|
|
|
|
leading_newlines = len(text) - len(text.lstrip()) |
|
trailing_newlines = len(text) - len(text.rstrip()) |
|
|
|
return "\n" * leading_newlines + stripped_text + "\n" * trailing_newlines |
|
|
|
|
|
def clean_chunk(chunk):
    """Strip consecutive line breaks in text to a maximum of 2."""
    return re.sub(r"\n{3,}", "\n\n", chunk)
|
|
|
|
|
def compute_iscc_code(text1, text2, bit_length):
    """Generate ISCC Semantic Text-Codes for two texts and compare them."""
    code1 = sct.gen_text_code_semantic(text1, bits=bit_length)
|
code2 = sct.gen_text_code_semantic(text2, bits=bit_length) |
|
similarity = compare_codes(code1["iscc"], code2["iscc"], bit_length) |
|
return code1["iscc"], code2["iscc"], similarity |
|
|
|
|
|
|
|
|
|
|
def compare_codes(code_a, code_b, bits): |
|
if code_a and code_b: |
|
code_a_str = code_a.value if hasattr(code_a, "value") else str(code_a) |
|
code_b_str = code_b.value if hasattr(code_b, "value") else str(code_b) |
|
if code_a_str and code_b_str: |
|
try: |
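                # Map the Hamming distance between the codes to an approximate cosine similarity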
|
distance = sct.iscc_distance(code_a_str, code_b_str) |
|
return generate_similarity_bar(hamming_to_cosine(distance, bits)) |
|
            except binascii.Error:
                # The codes could not be decoded for comparison
                return None
|
return None |
|
|
|
|
|
def truncate_text(text, max_length=70): |
|
return textwrap.shorten(text, width=max_length, placeholder="...") |
|
|
|
|
|
def hamming_to_cosine(hamming_distance: int, dim: int) -> float: |
|
"""Aproximate the cosine similarity for a given hamming distance and dimension""" |
|
result = 1 - (2 * hamming_distance) / dim |
|
return result |
|
|
|
|
|
def generate_similarity_bar(similarity): |
|
"""Generate a horizontal bar representing the similarity value, scaled to -100% to +100%.""" |
|
|
|
display_similarity = similarity * 100 |
|
|
|
|
|
    # |similarity| == 1.0 fills half the container width; the bar grows from the 50% midpoint
    bar_width = int(abs(similarity) * 50)
|
|
|
|
|
color = "green" if similarity >= 0 else "red" |
|
position = "left" if similarity >= 0 else "right" |
|
|
|
|
|
text_position = "left: 50%;" if similarity >= 0 else "right: 50%;" |
|
text_alignment = ( |
|
"transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);" |
|
) |
|
|
|
tooltip = "Similarity based on ISCC code comparison, not direct text comparison." |
|
|
|
bar_html = f""" |
|
<div title="{tooltip}" style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'> |
|
<div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'> |
|
<span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span> |
|
</div> |
|
</div> |
|
""" |
|
return bar_html |
|
|
|
|
|
def load_samples():
    """Load the sample texts (groups "a" and "b") from samples.yml."""
    with open(HERE / "samples.yml", "r", encoding="utf-8") as file:
|
return yaml.safe_load(file)["samples"] |
|
|
|
|
|
samples = load_samples() |
|
|
|
|
|
iscc_theme = gr.themes.Default( |
|
font=[gr.themes.GoogleFont("Readex Pro Light")], |
|
font_mono=[gr.themes.GoogleFont("JetBrains Mono")], |
|
text_size=gr.themes.sizes.text_lg, |
|
radius_size=gr.themes.sizes.radius_none, |
|
) |
|
|
|
with gr.Blocks(css=custom_css, theme=iscc_theme) as demo: |
|
with gr.Row(variant="panel"): |
|
gr.Markdown( |
|
""" |
|
## 🔮️ ISCC - Semantic-Code Text |
|
Demo of cross-lingual Semantic Text-Code (proof of concept) |
|
""", |
|
) |
|
with gr.Row(variant="panel"): |
|
with gr.Column(variant="panel"): |
|
sample_dropdown_a = gr.Dropdown( |
|
choices=["None"] + [lang for lang in samples["a"]], |
|
label="Select sample for Text A", |
|
value="None", |
|
) |
|
with gr.Column(variant="panel"): |
|
sample_dropdown_b = gr.Dropdown( |
|
choices=["None"] + [lang for lang in samples["b"]], |
|
label="Select sample for Text B", |
|
value="None", |
|
) |
|
|
|
with gr.Row(variant="panel"): |
|
with gr.Column(variant="panel"): |
|
in_text_a = gr.TextArea( |
|
label="Text A", |
|
placeholder="Choose sample text from the dropdown above or type or paste your text.", |
|
lines=12, |
|
max_lines=12, |
|
) |
|
out_code_a = gr.Textbox(label="ISCC-SCT for Text A") |
|
with gr.Column(variant="panel"): |
|
in_text_b = gr.TextArea( |
|
label="Text B", |
|
placeholder="Choose sample text from the dropdown above or type or paste your text.", |
|
lines=12, |
|
max_lines=12, |
|
) |
|
out_code_b = gr.Textbox(label="ISCC-SCT for Text B") |
|
|
|
with gr.Row(variant="panel"): |
|
with gr.Column(variant="panel"): |
|
out_similarity_title = gr.Markdown("### ISCC-based Semantic Similarity") |
|
with gr.Row(elem_classes="simbar"): |
|
out_similarity = gr.HTML() |
|
gr.Markdown( |
|
"**NOTE:** Similarity is calculated based on the generated ISCC-SCT, not the original text." |
|
) |
|
|
|
with gr.Row(variant="panel"): |
|
reset_button = gr.Button("Reset All") |
|
|
|
with gr.Accordion(label="🔍 Explore Details & Advanced Options", open=True): |
|
with gr.Row(variant="panel"): |
|
with gr.Column(variant="panel"): |
|
in_iscc_bits = gr.Slider( |
|
label="ISCC Bit-Length", |
|
info="NUMBER OF BITS FOR OUTPUT ISCC", |
|
minimum=64, |
|
maximum=256, |
|
step=32, |
|
value=sct.sct_opts.bits, |
|
) |
|
with gr.Column(variant="panel"): |
|
in_max_tokens = gr.Slider( |
|
label="Max Tokens", |
|
info="MAXIMUM NUMBER OF TOKENS PER CHUNK", |
|
minimum=49, |
|
maximum=sct.sct_opts.max_tokens, |
|
step=1, |
|
value=127, |
|
) |
|
|
|
with gr.Row(variant="panel"): |
|
with gr.Column(variant="panel"): |
|
out_chunks_a = gr.HighlightedText( |
|
label="Chunked Text A", |
|
interactive=False, |
|
elem_id="chunked-text-a", |
|
) |
|
with gr.Column(variant="panel"): |
|
out_chunks_b = gr.HighlightedText( |
|
label="Chunked Text B", |
|
interactive=False, |
|
elem_id="chunked-text-b", |
|
) |
|
|
|
with gr.Row(variant="panel"): |
|
with gr.Column(variant="panel"): |
|
gr.Markdown("### Granular Matches") |
|
in_granular_matches = gr.Dataframe( |
|
headers=["Chunk A", "Similarity", "Chunk B"], |
|
column_widths=["45%", "10%", "45%"], |
|
wrap=True, |
|
elem_classes="granular-matches", |
|
) |
|
|
|
def update_sample_text(choice, group): |
|
if choice == "None": |
|
return "" |
|
return samples[group][choice] |
|
|
|
sample_dropdown_a.change( |
|
lambda choice: update_sample_text(choice, "a"), |
|
inputs=[sample_dropdown_a], |
|
outputs=[in_text_a], |
|
) |
|
sample_dropdown_b.change( |
|
lambda choice: update_sample_text(choice, "b"), |
|
inputs=[sample_dropdown_b], |
|
outputs=[in_text_b], |
|
) |
|
|
|
    def process_and_calculate(text_a, text_b, nbits, max_tokens):
        """Generate codes, chunked views, similarity bar, and granular matches for both texts."""
        log.debug(f"Processing text_a: {text_a[:20]}, text_b: {text_b[:20]}")
|
|
|
def process_single_text(text, suffix): |
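            # out_code_a/b and out_chunks_a/b are module-level Gradio components; pick
            # the pair matching this suffix ("a" or "b") so results reach the right column.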
|
out_code_func = globals().get(f"out_code_{suffix}") |
|
out_chunks_func = globals().get(f"out_chunks_{suffix}") |
|
|
|
if not text: |
|
return { |
|
out_code_func: gr.Textbox(value=None), |
|
out_chunks_func: gr.HighlightedText( |
|
value=None, elem_id=f"chunked-text-{suffix}" |
|
), |
|
} |
|
|
|
result = sct.gen_text_code_semantic( |
|
text, |
|
bits=nbits, |
|
simprints=True, |
|
offsets=True, |
|
sizes=True, |
|
contents=True, |
|
max_tokens=max_tokens, |
|
) |
|
iscc = sct.Metadata(**result).to_object_format() |
|
|
|
|
|
features = iscc.features[0] |
|
highlighted_chunks = [] |
|
overlaps = iscc.get_overlaps() |
|
|
|
for i, feature in enumerate(features.simprints): |
|
feature: sct.Feature |
|
content = feature.content |
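                # Trim text shared with neighbouring chunks so each highlighted segment
                # shows only its unique portion; overlaps are appended separately below.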
|
|
|
|
|
if i > 0 and overlaps[i - 1]: |
|
content = content[len(overlaps[i - 1]) :] |
|
|
|
|
|
if i < len(overlaps) and overlaps[i]: |
|
content = content[: -len(overlaps[i])] |
|
|
|
label = f"{feature.size}:{feature.simprint}" |
|
highlighted_chunks.append((no_nl_inner(content), label)) |
|
|
|
if i < len(overlaps): |
|
overlap = overlaps[i] |
|
if overlap: |
|
highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap")) |
|
|
|
return { |
|
out_code_func: gr.Textbox(value=iscc.iscc), |
|
out_chunks_func: gr.HighlightedText( |
|
value=highlighted_chunks, elem_id=f"chunked-text-{suffix}" |
|
), |
|
"metadata": iscc, |
|
} |
|
|
|
result_a = process_single_text(text_a, "a") |
|
result_b = process_single_text(text_b, "b") |
|
|
|
code_a = result_a[out_code_a] if text_a else None |
|
code_b = result_b[out_code_b] if text_b else None |
|
|
|
similarity = compare_codes(code_a, code_b, nbits) or out_similarity |
|
|
|
granular_matches = [] |
|
if text_a and text_b: |
|
matches = sct.granular_similarity( |
|
result_a["metadata"], result_b["metadata"], threshold=80 |
|
) |
|
for match in matches: |
|
granular_matches.append( |
|
[ |
|
match[0].content, |
|
f"{match[1]}%", |
|
match[2].content, |
|
] |
|
) |
|
|
|
return ( |
|
result_a[out_code_a], |
|
result_a[out_chunks_a], |
|
result_b[out_code_b], |
|
result_b[out_chunks_b], |
|
similarity, |
|
gr.Dataframe(value=granular_matches), |
|
) |
|
|
|
in_text_a.change( |
|
process_and_calculate, |
|
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens], |
|
outputs=[ |
|
out_code_a, |
|
out_chunks_a, |
|
out_code_b, |
|
out_chunks_b, |
|
out_similarity, |
|
in_granular_matches, |
|
], |
|
show_progress="full", |
|
trigger_mode="always_last", |
|
) |
|
|
|
in_text_b.change( |
|
process_and_calculate, |
|
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens], |
|
outputs=[ |
|
out_code_a, |
|
out_chunks_a, |
|
out_code_b, |
|
out_chunks_b, |
|
out_similarity, |
|
in_granular_matches, |
|
], |
|
show_progress="full", |
|
trigger_mode="always_last", |
|
) |
|
|
|
in_iscc_bits.change( |
|
process_and_calculate, |
|
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens], |
|
outputs=[ |
|
out_code_a, |
|
out_chunks_a, |
|
out_code_b, |
|
out_chunks_b, |
|
out_similarity, |
|
in_granular_matches, |
|
], |
|
show_progress="full", |
|
) |
|
|
|
in_max_tokens.change( |
|
process_and_calculate, |
|
inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens], |
|
outputs=[ |
|
out_code_a, |
|
out_chunks_a, |
|
out_code_b, |
|
out_chunks_b, |
|
out_similarity, |
|
in_granular_matches, |
|
], |
|
show_progress="full", |
|
) |
|
|
|
out_code_a.change( |
|
compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity] |
|
) |
|
out_code_b.change( |
|
compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity] |
|
) |
|
|
|
def reset_all(): |
|
return ( |
|
            gr.Slider(value=sct.sct_opts.bits),
|
gr.Dropdown( |
|
value="None", choices=["None"] + [lang for lang in samples["a"]] |
|
), |
|
gr.Dropdown( |
|
value="None", choices=["None"] + [lang for lang in samples["b"]] |
|
), |
|
gr.TextArea(value=""), |
|
gr.TextArea(value=""), |
|
gr.Textbox(value=""), |
|
gr.Textbox(value=""), |
|
gr.HTML(value=""), |
|
gr.HighlightedText(value=[]), |
|
gr.HighlightedText(value=[]), |
|
) |
|
|
|
reset_button.click( |
|
reset_all, |
|
outputs=[ |
|
in_iscc_bits, |
|
sample_dropdown_a, |
|
sample_dropdown_b, |
|
in_text_a, |
|
in_text_b, |
|
out_code_a, |
|
out_code_b, |
|
out_similarity, |
|
out_chunks_a, |
|
out_chunks_b, |
|
], |
|
) |
|
|
|
with gr.Row(variant="panel"): |
|
gr.Markdown( |
|
""" |
|
## Understanding ISCC Semantic Text-Codes |
|
|
|
### What is an ISCC Semantic Text-Code? |
|
An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of |
|
the text, not just the exact words. Technically, it is an ISCC-encoded, binarized multilingual
|
document-embedding. |
|
|
|
### How does it work? |
|
1. **Input**: You provide a text in any language. |
|
2. **Processing**: Vector embeddings are created for individual chunks of the text. |
|
3. **Output**: A unique ISCC-UNIT is generated that represents the entire text's content. |
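
For illustration, the steps above roughly correspond to a single call to the `iscc-sct` library
(the same `gen_text_code_semantic` function this demo uses; the sample text and bit length are
placeholders):

```python
import iscc_sct as sct

# Generate a 64-bit Semantic Text-Code for a short sample text
result = sct.gen_text_code_semantic("A short sample text.", bits=64)
print(result["iscc"])  # the ISCC-UNIT for the text
```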
|
|
|
### What can it do? |
|
- **Cross-language matching**: It can recognize similar content across different languages. |
|
- **Similarity detection**: It can measure how similar two texts are in meaning, not just in words. |
|
- **Content identification**: It can help identify texts with similar content, even if the wording |
|
is different. |
|
|
|
### How to use this demo: |
|
1. **Enter text**: Type or paste text into either or both text boxes. |
|
2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more |
|
detailed). |
|
3. **View results**: See the generated ISCC code for each text. |
|
4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning, based on |
|
their ISCC codes. |
|
|
|
### Important Note: |
|
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This |
|
allows for efficient and privacy-preserving comparisons, as only the codes need to be shared |
|
or stored. |
|
""" |
|
) |
|
|
|
gr.Markdown( |
|
""" |
|
### Why is this useful? |
|
- **Content creators**: Find similar content across languages. |
|
- **Researchers**: Quickly compare documents or find related texts in different languages. |
|
- **Publishers**: Identify potential translations or similar works efficiently. |
|
|
|
This technology opens up new possibilities for understanding and managing text content across |
|
language barriers! |
|
|
|
### Explore Details & Advanced Options |
|
|
|
The "Explore Details & Advanced Options" section provides additional tools and information: |
|
|
|
1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed |
|
comparisons but may be more sensitive to minor differences. |
|
|
|
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split |
|
for processing. |
|
|
|
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is |
|
color-coded and labeled with its size and simprint (a similarity-preserving fingerprint).
|
|
|
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B. |
|
This table shows which specific parts of the texts are most similar (above 80%), along with their |
|
approximate cosine similarity (scaled -100% to +100%). |
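
As a rough sketch, the Bit-Length and Max Tokens options map onto keyword arguments of the same
`gen_text_code_semantic` call this demo makes under the hood; the extra flags request the
per-chunk simprints shown above (sample text and values are placeholders):

```python
import iscc_sct as sct

result = sct.gen_text_code_semantic(
    "A short sample text.",
    bits=128,        # ISCC Bit-Length
    max_tokens=127,  # Max Tokens per chunk
    simprints=True,  # per-chunk similarity-preserving fingerprints
    offsets=True,    # chunk offsets
    sizes=True,      # chunk sizes
    contents=True,   # include the chunk text itself
)
```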
|
|
|
For more information about the **ISCC** see: |
|
- https://github.com/iscc |
|
- https://iscc.codes |
|
- https://iscc.io |
|
- [ISO 24138:2024](https://www.iso.org/standard/77899.html) |
|
""" |
|
) |
|
with gr.Row(): |
|
gr.Markdown( |
|
f"iscc-sct v{sct.__version__} | Source Code: https://github.com/iscc/iscc-sct", |
|
elem_classes="footer", |
|
) |
|
|
|
if __name__ == "__main__": |
|
demo.launch() |
|
|