# iscc-sct/tests/test_iscc_sct.py
from pathlib import Path

import numpy as np
import pytest
from blake3 import blake3

import iscc_sct as sct
from iscc_sct.code_semantic_text import (
    split_text,
    tokenize_chunks,
    embed_tokens,
    embed_chunks,
    compress,
)

HERE = Path(__file__).parent.absolute()
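
# README-style project description used as a compact, stable test fixture.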
TEXT = """
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
designed to capture and represent the language agnostic semantic content of text for improved
similarity detection.
The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
engineered to be robust against a broader range of variations and translations of text that cannot
be matched based on lexical similarity.
"""


def test_version():
assert sct.__version__ == "0.1.3"


def test_code_text_semantic_default():
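    # End-to-end run on the bundled English sample: ISCC and character count must stay stable.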
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp)
assert result == {
"iscc": "ISCC:CAA636IXQD736IGJ",
"characters": 12076,
}


def test_code_text_semantic_no_chars():
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, characters=False)
assert result == {"iscc": "ISCC:CAA636IXQD736IGJ"}


def test_code_text_semantic_embedding():
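    # embedding=True attaches the raw feature vector; 384 dimensions matches the default sentence-embedding model.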
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, embedding=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert len(result["features"][0]["embedding"]) == 384


def test_code_text_semantic_features():
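    # simprints=True returns one compact similarity hash per chunk; only head and tail of the list are pinned.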
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, simprints=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert result["characters"] == 12076
assert result["features"][0]["simprints"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
assert result["features"][0]["simprints"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]


def test_code_text_semantic_offsets():
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, offsets=True)
assert result["features"][0]["offsets"][:3] == [0, 277, 612]


def test_code_text_semantic_chunks():
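    # contents=True returns the text of each chunk; the sample file splits into 39 chunks.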
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, contents=True)
assert len(result["features"][0]["contents"]) == 39
assert result["features"][0]["contents"][0].startswith("\n Thank ")
assert result["features"][0]["contents"][-1].endswith("(Applause)\n")


def test_code_text_semantic_sizes():
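    # One size per chunk (39 entries, matching the chunks test above), presumably the chunk length in characters.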
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, sizes=True)
# fmt: off
assert result["features"][0]["sizes"] == [
440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446, 442, 443, 444, 451, 485,
477, 439, 517, 430, 468, 394, 531, 448, 421, 503, 376, 403, 513, 477, 393, 375, 555, 533,
312, 455, 413
]
# fmt: on


def test_gen_text_code_semantic_empty():
with pytest.raises(ValueError) as excinfo:
sct.gen_text_code_semantic("")
assert str(excinfo.value) == "Input text cannot be empty."


def test_gen_text_code_semantic_granular():
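    # Requesting simprints, offsets and contents together yields one feature set with aligned per-chunk lists.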
result = sct.gen_text_code_semantic(
TEXT,
simprints=True,
offsets=True,
contents=True,
)
assert (
result
== {
"characters": 726,
"iscc": "ISCC:CAARISHPJHEXQAYL",
"features": [
{
"maintype": "semantic",
"subtype": "text",
"version": 0,
"simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
"offsets": [0, 297],
"contents": [
"\n"
"`iscc-sct` is a **proof of concept implementation** of a semantic "
"Text-Code for the\n"
"[ISCC](https://core.iscc.codes) (*International Standard Content "
"Code*). Semantic Text-Codes are\n"
"designed to capture and represent the language agnostic semantic "
"content of text for improved\n"
"similarity detection.\n"
"\n", # NOTE: end of first chunk (see comma :)
"\n"
"\n"
"The ISCC framework already comes with a Text-Code that is based "
"on lexical similarity and can match\n"
"near duplicates. The ISCC Semantic Text-Code is planned as a new "
"additional ISCC-UNIT focused on\n"
"capturing a more abstract and broad semantic similarity. As such "
"the Semantic Text-Code is\n"
"engineered to be robust against a broader range of variations and "
"translations of text that cannot\n"
"be matched based on lexical similarity.\n",
],
}
],
}
)


def test_gen_text_code_semantic_checks_bits():
with pytest.raises(ValueError):
sct.gen_text_code_semantic("Test", bits=99)


def test_split_text(text_en):
chunks = split_text(text_en)
assert chunks[0][1][:8] == "\n Thank "
assert chunks[-1][1][:8] == "\n (Laugh"


def test_split_text_override():
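    # split_text yields (character offset, chunk text) tuples; with overlap > 0, adjacent chunks share text.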
text = "Try some very small and granular text splitting. Use options override for it."
chunks = split_text(text, max_tokens=8, overlap=4)
assert chunks == [
(0, "Try some very small and granular text "),
(20, "and granular text splitting. "),
(49, "Use options override for it."),
]


def test_tokenize_chunks():
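    # Expected IDs assume an XLM-RoBERTa-style vocabulary (0 = <s>, 2 = </s>, 1 = <pad>);
    # shorter chunks are padded to the longest sequence in the batch.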
chunks = ["Hello World", "These are chunks"]
result = tokenize_chunks(chunks)
np.testing.assert_array_equal(
result["input_ids"],
np.array([[0, 35378, 6661, 2, 1, 1], [0, 32255, 621, 7839, 1224, 2]], dtype=np.int64),
)


def test_embed_tokens():
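    # Token-level embeddings; tolerance is loose because model inference is not bit-exact across platforms.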
chunks = ["Hello World", "These are chunks"]
tokens = tokenize_chunks(chunks)
embeddings = embed_tokens(tokens)
assert list(embeddings[0][0][:3]) == pytest.approx(
[0.05907335, 0.11408358, 0.12727071], rel=1e-2
)


def test_embed_chunks():
chunks = ["Hello World"]
expected = [0.008697219, 0.038051583, 0.043976285]
embeddings = embed_chunks(chunks)
assert list(embeddings[0][:3]) == pytest.approx(expected, rel=1e-3)


def test_gen_text_code_semantic(text_en):
result = sct.gen_text_code_semantic(text_en, embedding=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert result["features"][0]["embedding"][:3] == pytest.approx(
[0.03241169825196266, 0.022712377831339836, 0.050273094326257706],
rel=1e-3,
)


def test_cross_lingual_match(text_en, text_de):
a = sct.gen_text_code_semantic(text_en)["iscc"]
assert a == "ISCC:CAA636IXQD736IGJ"
b = sct.gen_text_code_semantic(text_de)["iscc"]
assert b == "ISCC:CAA636IXQD4TMIGL" # hamming distance for the codes is 6 bits


def test_tokenizer_integrity(text_en):
    # Guard against updates that silently change text splitting (and thus tokenizer input):
    # hash every chunk and pin the digest.
    hasher = blake3()
    for _, chunk in split_text(text_en):
hasher.update(chunk.encode("utf-8"))
checksum = hasher.hexdigest()
assert checksum == "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"


def test_soft_hash_text_semantic():
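    # The full soft hash is 48 bytes (96 hex characters).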
result = sct.soft_hash_text_semantic("Hello World")
assert (
result.hex()
== "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
)


def test_shift_resistance(text_en):
a = sct.soft_hash_text_semantic(text_en)
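    # NOTE: the typo "begginging" is kept verbatim; the expected distances below depend on this exact input.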
shifted = "Just put another sentence in the begginging of the text!\n" + text_en
b = sct.soft_hash_text_semantic(shifted)
    # TODO: improve the algorithm with more shift-resistant semantic chunking
# On 256-bit code
assert sct.hamming_distance(a, b) == 6
# On 64-bit code
assert sct.hamming_distance(b[:16], a[:16]) == 1


def test_compress():
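    # compress should round to the given number of decimals and give identical results for float64 and float32 input.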
arr1 = np.array([3.0, 15294.7789, 32977.7])
arr2 = np.array([3.0, 15294.7789, 32977.7], dtype=np.float32)
expected = [3.0, 15294.8, 32977.7]
assert compress(arr1, 1) == expected
assert compress(arr2, 1) == expected


def test_embedding_precision():
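    # precision controls rounding of exported embedding values (4 decimal places here).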
d16 = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
assert d16["features"][0]["embedding"][0] == 0.0087