|
from html import escape |
|
import iscc_sct as ict |
|
|
|
|
|
def _escape_and_preserve_breaks(text):
    """Return *text* HTML-escaped, with every newline replaced by ``<br>``."""
    return escape(text).replace("\n", "<br>")


def generate_html(fingerprint_data):
    """Render granular fingerprint data as a standalone HTML page.

    The page shows the ISCC code, the character count and the chunk texts in
    offset order. Each chunk is wrapped in an overlap-colored span; the part
    of the chunk that is not shared with the following chunk is additionally
    wrapped in a chunk-colored span, and a badge with the chunk's feature
    value is appended after its text. Styling relies on the Tailwind CSS
    script loaded from its CDN.

    Args:
        fingerprint_data: Mapping with the keys ``"iscc"``, ``"characters"``
            and ``"features"``. ``"features"`` is a list of mappings, each
            providing ``"offset"``, ``"size"``, ``"text"`` and ``"feature"``.
            The list is not modified.

    Returns:
        The complete HTML document as a string.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    # Sort a copy so the caller's feature list keeps its original order.
    chunks = sorted(fingerprint_data["features"], key=lambda item: item["offset"])

    chunk_color = "bg-yellow-100"
    overlap_color = "bg-red-100"

    # Everything interpolated into the markup is escaped, metadata included.
    iscc = escape(str(fingerprint_data["iscc"]))
    characters = escape(str(fingerprint_data["characters"]))

    # The text container uses `whitespace-pre-wrap`, so the template must not
    # put any whitespace of its own between the container tags and the text.
    parts = [
        f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Text Fingerprint Visualization</title>
        <script src="https://cdn.tailwindcss.com"></script>
    </head>
    <body class="bg-gray-100 p-8">
        <div class="max-w-4xl mx-auto bg-white p-6 rounded-lg shadow-lg">
            <h1 class="text-2xl font-bold mb-4">Text Fingerprint Visualization</h1>
            <div class="text-sm mb-4">
                <span class="font-semibold">ISCC:</span> {iscc}
            </div>
            <div class="text-sm mb-4">
                <span class="font-semibold">Characters:</span> {characters}
            </div>
            <div class="relative text-base leading-relaxed whitespace-pre-wrap">"""
    ]

    current_pos = 0  # absolute position up to which text has been rendered
    for index, chunk in enumerate(chunks):
        offset = chunk["offset"]
        text = chunk["text"]
        start = max(offset, current_pos)
        end = offset + chunk["size"]

        if start < end:
            has_next = index + 1 < len(chunks)
            if has_next and end > chunks[index + 1]["offset"]:
                # Never split before `start`: text in front of it was already
                # rendered by an earlier chunk and must not be repeated.
                split = max(chunks[index + 1]["offset"], start)
                own_text = text[start - offset : split - offset]
                shared_text = text[split - offset :]
            else:
                # Nothing is shared with a following chunk.
                own_text = text[start - offset :]
                shared_text = ""

            parts.append(f'<span class="{overlap_color}">')
            if own_text:
                parts.append(
                    f'<span class="{chunk_color}">'
                    f"{_escape_and_preserve_breaks(own_text)}</span>"
                )
            parts.append(_escape_and_preserve_breaks(shared_text))
            # Feature badge, placed right after the chunk's text.
            parts.append(
                '<span class="inline-block bg-gray-800 text-white text-xs '
                f'px-2 py-1 rounded ml-1">{escape(str(chunk["feature"]))}</span>'
            )
            parts.append("</span>")

        # A chunk fully covered by earlier ones must not move the cursor back.
        current_pos = max(current_pos, end)

    parts.append(
        """</div>
        </div>
    </body>
    </html>
    """
    )
    return "".join(parts)
|
|
|
|
|
def main():
    """Fingerprint ``../README.md`` and write the visualization to ``readme.html``.

    Both paths are resolved against the current working directory. The
    fingerprint is created with ``granular=True``, its dumped form is printed
    to stdout and then rendered with ``generate_html``.
    """
    # Read bytes and decode explicitly so the text does not depend on the
    # platform's default encoding or newline translation.
    with open("../README.md", "rb") as f:
        text = f.read().decode("utf-8")

    result = ict.create(text, granular=True)

    # Dump once, so that exactly the data that is printed is also rendered.
    fingerprint_data = result.model_dump()
    print(fingerprint_data)

    html_content = generate_html(fingerprint_data)

    with open("readme.html", "wt", encoding="utf-8") as f:
        f.write(html_content)
|
|
|
|
|
# Run the visualization only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|