|
from pathlib import Path |
|
|
|
import pytest |
|
from blake3 import blake3 |
|
|
|
import iscc_sct as sct |
|
from iscc_sct.code_semantic_text import ( |
|
split_text, |
|
tokenize_chunks, |
|
embed_tokens, |
|
embed_chunks, |
|
compress, |
|
) |
|
import numpy as np |
|
|
|
|
|
# Directory containing this test module; sample files such as "en.txt" are
# resolved relative to it.
HERE = Path(__file__).parent.absolute()

# Two-paragraph markdown sample passed to the granular code-generation test.
# That test asserts an exact character count, chunk offsets and chunk contents
# for this text, so the literal must be kept exactly as written.
TEXT = """
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
designed to capture and represent the language agnostic semantic content of text for improved
similarity detection.

The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
engineered to be robust against a broader range of variations and translations of text that cannot
be matched based on lexical similarity.
"""
|
|
|
|
|
def test_version():
    """The package must report the release this suite is pinned to."""
    pinned_release = "0.1.2"
    assert sct.__version__ == pinned_release
|
|
|
|
|
def test_code_text_semantic_default():
    """With default options the result holds exactly the ISCC and the character count."""
    sample_path = HERE / "en.txt"
    expected = dict(iscc="ISCC:CAA636IXQD736IGJ", characters=12076)
    result = sct.code_text_semantic(sample_path)
    assert result == expected
|
|
|
|
|
def test_code_text_semantic_no_chars():
    """Passing ``characters=False`` leaves only the ISCC in the result."""
    expected = {"iscc": "ISCC:CAA636IXQD736IGJ"}
    result = sct.code_text_semantic(HERE / "en.txt", characters=False)
    assert result == expected
|
|
|
|
|
def test_code_text_semantic_embedding():
    """With ``embedding=True`` the first feature set carries a 384-element embedding."""
    result = sct.code_text_semantic(HERE / "en.txt", embedding=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    embedding = result["features"][0]["embedding"]
    assert len(embedding) == 384
|
|
|
|
|
def test_code_text_semantic_features():
    """With ``simprints=True`` the first and last three simprints match pinned values."""
    result = sct.code_text_semantic(HERE / "en.txt", simprints=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    assert result["characters"] == 12076

    leading = ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
    trailing = ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]
    assert result["features"][0]["simprints"][:3] == leading
    assert result["features"][0]["simprints"][-3:] == trailing
|
|
|
|
|
def test_code_text_semantic_offsets():
    """With ``offsets=True`` the first three chunk offsets match pinned values."""
    result = sct.code_text_semantic(HERE / "en.txt", offsets=True)
    first_offsets = result["features"][0]["offsets"][:3]
    assert first_offsets == [0, 277, 612]
|
|
|
|
|
def test_code_text_semantic_chunks():
    """With ``contents=True`` the chunk texts are returned; check count and both ends."""
    result = sct.code_text_semantic(HERE / "en.txt", contents=True)
    assert len(result["features"][0]["contents"]) == 39

    opening_chunk = result["features"][0]["contents"][0]
    assert opening_chunk.startswith("\n Thank ")

    closing_chunk = result["features"][0]["contents"][-1]
    assert closing_chunk.endswith("(Applause)\n")
|
|
|
|
|
def test_code_text_semantic_sizes():
    """With ``sizes=True`` the per-chunk sizes match the pinned list of 39 values."""
    # fmt: off
    expected_sizes = [
        440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446, 442, 443, 444, 451, 485,
        477, 439, 517, 430, 468, 394, 531, 448, 421, 503, 376, 403, 513, 477, 393, 375, 555, 533,
        312, 455, 413,
    ]
    # fmt: on
    result = sct.code_text_semantic(HERE / "en.txt", sizes=True)
    assert result["features"][0]["sizes"] == expected_sizes
|
|
|
|
|
|
|
def test_gen_text_code_semantic_empty():
    """An empty input string is rejected with a ValueError carrying a fixed message."""
    expected_message = "Input text cannot be empty."
    with pytest.raises(ValueError) as excinfo:
        sct.gen_text_code_semantic("")
    # Checked after the context manager exits: code following the raising call
    # inside the ``with`` body would never run.
    assert str(excinfo.value) == expected_message
|
|
|
|
|
def test_gen_text_code_semantic_granular():
    """Requesting simprints, offsets and contents for TEXT yields the full pinned structure."""
    # The second chunk starts at offset 297, i.e. on the newline that also ends
    # the first chunk, so both expected chunks share the paragraph break.
    first_chunk = (
        "\n"
        "`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the\n"
        "[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are\n"
        "designed to capture and represent the language agnostic semantic content of text for improved\n"
        "similarity detection.\n"
        "\n"
    )
    second_chunk = (
        "\n"
        "\n"
        "The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match\n"
        "near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on\n"
        "capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is\n"
        "engineered to be robust against a broader range of variations and translations of text that cannot\n"
        "be matched based on lexical similarity.\n"
    )
    expected_feature_set = {
        "maintype": "semantic",
        "subtype": "text",
        "version": 0,
        "simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
        "offsets": [0, 297],
        "contents": [first_chunk, second_chunk],
    }
    expected = {
        "characters": 726,
        "iscc": "ISCC:CAARISHPJHEXQAYL",
        "features": [expected_feature_set],
    }

    result = sct.gen_text_code_semantic(TEXT, simprints=True, offsets=True, contents=True)
    assert result == expected
|
|
|
|
|
def test_gen_text_code_semantic_checks_bits():
    """A ``bits`` value of 99 is rejected with a ValueError."""
    rejected_bits = 99
    with pytest.raises(ValueError):
        sct.gen_text_code_semantic("Test", bits=rejected_bits)
|
|
|
|
|
def test_split_text(text_en):
    """The first and last chunks of the English sample start with the pinned prefixes."""
    pieces = split_text(text_en)
    # Each entry is indexed as (offset, text); only the text part is inspected.
    first_text = pieces[0][1]
    assert first_text[:8] == "\n Thank "
    last_text = pieces[-1][1]
    assert last_text[:8] == "\n (Laugh"
|
|
|
|
|
def test_split_text_override():
    """Overriding ``max_tokens`` and ``overlap`` yields small chunks that overlap."""
    sentence = "Try some very small and granular text splitting. Use options override for it."
    expected_chunks = [
        (0, "Try some very small and granular text "),
        (20, "and granular text splitting. "),
        (49, "Use options override for it."),
    ]
    assert split_text(sentence, max_tokens=8, overlap=4) == expected_chunks
|
|
|
|
|
def test_tokenize_chunks():
    """Tokenizing two short chunks produces the pinned ``input_ids`` matrix."""
    expected_ids = np.array(
        [
            [0, 35378, 6661, 2, 1, 1],
            [0, 32255, 621, 7839, 1224, 2],
        ],
        dtype=np.int64,
    )
    encoded = tokenize_chunks(["Hello World", "These are chunks"])
    np.testing.assert_array_equal(encoded["input_ids"], expected_ids)
|
|
|
|
|
def test_embed_tokens():
    """Token-level embedding: first three values of the first vector match within 1%."""
    encoded = tokenize_chunks(["Hello World", "These are chunks"])
    vectors = embed_tokens(encoded)
    leading_values = list(vectors[0][0][:3])
    reference = [0.05907335, 0.11408358, 0.12727071]
    assert leading_values == pytest.approx(reference, rel=1e-2)
|
|
|
|
|
def test_embed_chunks():
    """Chunk-level embedding: first three values for one chunk match within 0.1%."""
    reference = [0.008697219, 0.038051583, 0.043976285]
    vectors = embed_chunks(["Hello World"])
    leading_values = list(vectors[0][:3])
    assert leading_values == pytest.approx(reference, rel=1e-3)
|
|
|
|
|
def test_gen_text_code_semantic(text_en):
    """The English sample yields the pinned ISCC and leading embedding values."""
    result = sct.gen_text_code_semantic(text_en, embedding=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"

    reference = [0.03241169825196266, 0.022712377831339836, 0.050273094326257706]
    leading_values = result["features"][0]["embedding"][:3]
    assert leading_values == pytest.approx(reference, rel=1e-3)
|
|
|
|
|
def test_cross_lingual_match(text_en, text_de):
    """The English and German samples each produce their pinned ISCC."""
    code_en = sct.gen_text_code_semantic(text_en)["iscc"]
    assert code_en == "ISCC:CAA636IXQD736IGJ"

    code_de = sct.gen_text_code_semantic(text_de)["iscc"]
    assert code_de == "ISCC:CAA636IXQD4TMIGL"
|
|
|
|
|
def test_tokenizer_integrity(text_en):
    """The BLAKE3 digest over all chunk texts of the English sample is pinned."""
    expected_digest = "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"
    digest = blake3()
    # Only the chunk text feeds the hash; the offset is ignored.
    for _offset, chunk_text in split_text(text_en):
        digest.update(chunk_text.encode("utf-8"))
    assert digest.hexdigest() == expected_digest
|
|
|
|
|
def test_soft_hash_text_semantic():
    """The soft hash of "Hello World" matches the pinned hex digest."""
    expected_hex = "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
    digest = sct.soft_hash_text_semantic("Hello World")
    assert digest.hex() == expected_hex
|
|
|
|
|
def test_shift_resistance(text_en):
    """Prepending one sentence changes the soft hash only by the pinned Hamming distances."""
    # NOTE: the misspelling in the prefix is part of the pinned input; editing
    # it may change the distances asserted below.
    prefix = "Just put another sentence in the begginging of the text!\n"
    original_hash = sct.soft_hash_text_semantic(text_en)
    shifted_hash = sct.soft_hash_text_semantic(prefix + text_en)

    # Distance over the full hashes.
    assert sct.hamming_distance(original_hash, shifted_hash) == 6

    # Distance over the first 16 elements only.
    assert sct.hamming_distance(shifted_hash[:16], original_hash[:16]) == 1
|
|
|
|
|
def test_compress():
    """``compress`` with precision 1 gives the same list for default and float32 arrays."""
    raw_values = [3.0, 15294.7789, 32977.7]
    expected = [3.0, 15294.8, 32977.7]
    candidates = (
        np.array(raw_values),
        np.array(raw_values, dtype=np.float32),
    )
    for candidate in candidates:
        assert compress(candidate, 1) == expected
|
|
|
|
|
def test_embedding_precision():
    """With ``precision=4`` the first embedding value for "Hello World" is 0.0087."""
    result = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
    first_value = result["features"][0]["embedding"][0]
    assert first_value == 0.0087
|
|