iscc-sct / tests /visualize.py
titusz's picture
Synced repo using 'sync_with_huggingface' Github Action
b31f748 verified
raw
history blame
3.07 kB
from html import escape
import iscc_sct as ict
def generate_html(fingerprint_data):
chunks = fingerprint_data["features"]
# Sort chunks by offset
chunks.sort(key=lambda x: x["offset"])
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Text Fingerprint Visualization</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-100 p-8">
<div class="max-w-4xl mx-auto bg-white p-6 rounded-lg shadow-lg">
<h1 class="text-2xl font-bold mb-4">Text Fingerprint Visualization</h1>
<div class="text-sm mb-4">
<span class="font-semibold">ISCC:</span> {fingerprint_data['iscc']}
</div>
<div class="text-sm mb-4">
<span class="font-semibold">Characters:</span> {fingerprint_data['characters']}
</div>
<div class="relative text-base leading-relaxed whitespace-pre-wrap">
"""
chunk_color = "bg-yellow-100"
overlap_color = "bg-red-100"
current_pos = 0
for i, chunk in enumerate(chunks):
start = max(chunk["offset"], current_pos)
end = chunk["offset"] + chunk["size"]
if start < end:
# Function to escape text and preserve line breaks
def escape_and_preserve_breaks(text):
return escape(text).replace("\n", "<br>")
# Non-overlapping part
html_content += f'<span class="{overlap_color}">{escape_and_preserve_breaks(chunk["text"][current_pos - chunk["offset"]:start - chunk["offset"]])}'
# Overlapping part (if any)
if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
overlap_end = chunks[i + 1]["offset"]
html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
html_content += escape_and_preserve_breaks(chunk["text"][overlap_end - chunk["offset"] :])
else:
html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
# Fingerprint badge
html_content += f'<span class="inline-block bg-gray-800 text-white text-xs px-2 py-1 rounded ml-1">{chunk["feature"]}</span>'
html_content += "</span>"
current_pos = end
html_content += """
</div>
</div>
</body>
</html>
"""
return html_content
def main():
with open("../README.md", "rb") as f:
data = f.read()
text = data.decode("utf-8")
result = ict.create(text, granular=True)
print(result.model_dump())
# Generate the HTML content
html_content = generate_html(result.model_dump())
# Write the HTML content to a file
with open("readme.html", "wt", encoding="utf-8") as f:
f.write(html_content)
if __name__ == "__main__":
main()