# iscc-sct/tests/test_iscc_sct.py
from pathlib import Path

import numpy as np
import pytest
from blake3 import blake3

import iscc_sct as sct
from iscc_sct.code_semantic_text import (
    split_text,
    tokenize_chunks,
    embed_tokens,
    embed_chunks,
    compress,
)

HERE = Path(__file__).parent.absolute()
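
# README-style project description used as a compact, stable test fixture.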
TEXT = """
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
designed to capture and represent the language agnostic semantic content of text for improved
similarity detection.
The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
engineered to be robust against a broader range of variations and translations of text that cannot
be matched based on lexical similarity.
"""


def test_version():
assert sct.__version__ == "0.1.3"


def test_code_text_semantic_default():
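    # End-to-end run on the bundled English sample: ISCC and character count must stay stable.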
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp)
assert result == {
"iscc": "ISCC:CAA636IXQD736IGJ",
"characters": 12076,
}


def test_code_text_semantic_no_chars():
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, characters=False)
assert result == {"iscc": "ISCC:CAA636IXQD736IGJ"}


def test_code_text_semantic_embedding():
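    # embedding=True attaches the raw feature vector; 384 dimensions matches the default sentence-embedding model.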
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, embedding=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert len(result["features"][0]["embedding"]) == 384


def test_code_text_semantic_features():
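    # simprints=True returns one compact similarity hash per chunk; only head and tail of the list are pinned.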
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, simprints=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert result["characters"] == 12076
assert result["features"][0]["simprints"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
assert result["features"][0]["simprints"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]


def test_code_text_semantic_offsets():
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, offsets=True)
assert result["features"][0]["offsets"][:3] == [0, 277, 612]


def test_code_text_semantic_chunks():
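    # contents=True returns the text of each chunk; the sample file splits into 39 chunks.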
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, contents=True)
assert len(result["features"][0]["contents"]) == 39
assert result["features"][0]["contents"][0].startswith("\n Thank ")
assert result["features"][0]["contents"][-1].endswith("(Applause)\n")


def test_code_text_semantic_sizes():
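    # One size per chunk (39 entries, matching the chunks test above), presumably the chunk length in characters.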
fp = HERE / "en.txt"
result = sct.code_text_semantic(fp, sizes=True)
# fmt: off
assert result["features"][0]["sizes"] == [
440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446, 442, 443, 444, 451, 485,
477, 439, 517, 430, 468, 394, 531, 448, 421, 503, 376, 403, 513, 477, 393, 375, 555, 533,
312, 455, 413
]
# fmt: on


def test_gen_text_code_semantic_empty():
with pytest.raises(ValueError) as excinfo:
sct.gen_text_code_semantic("")
assert str(excinfo.value) == "Input text cannot be empty."


def test_gen_text_code_semantic_granular():
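    # Requesting simprints, offsets and contents together yields one feature set with aligned per-chunk lists.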
result = sct.gen_text_code_semantic(
TEXT,
simprints=True,
offsets=True,
contents=True,
)
assert (
result
== {
"characters": 726,
"iscc": "ISCC:CAARISHPJHEXQAYL",
"features": [
{
"maintype": "semantic",
"subtype": "text",
"version": 0,
"simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
"offsets": [0, 297],
"contents": [
"\n"
"`iscc-sct` is a **proof of concept implementation** of a semantic "
"Text-Code for the\n"
"[ISCC](https://core.iscc.codes) (*International Standard Content "
"Code*). Semantic Text-Codes are\n"
"designed to capture and represent the language agnostic semantic "
"content of text for improved\n"
"similarity detection.\n"
"\n", # NOTE: end of first chunk (see comma :)
"\n"
"\n"
"The ISCC framework already comes with a Text-Code that is based "
"on lexical similarity and can match\n"
"near duplicates. The ISCC Semantic Text-Code is planned as a new "
"additional ISCC-UNIT focused on\n"
"capturing a more abstract and broad semantic similarity. As such "
"the Semantic Text-Code is\n"
"engineered to be robust against a broader range of variations and "
"translations of text that cannot\n"
"be matched based on lexical similarity.\n",
],
}
],
}
)


def test_gen_text_code_semantic_checks_bits():
with pytest.raises(ValueError):
sct.gen_text_code_semantic("Test", bits=99)


def test_split_text(text_en):
chunks = split_text(text_en)
assert chunks[0][1][:8] == "\n Thank "
assert chunks[-1][1][:8] == "\n (Laugh"


def test_split_text_override():
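    # split_text yields (character offset, chunk text) tuples; with overlap > 0, adjacent chunks share text.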
text = "Try some very small and granular text splitting. Use options override for it."
chunks = split_text(text, max_tokens=8, overlap=4)
assert chunks == [
(0, "Try some very small and granular text "),
(20, "and granular text splitting. "),
(49, "Use options override for it."),
]


def test_tokenize_chunks():
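    # Expected IDs assume an XLM-RoBERTa-style vocabulary (0 = <s>, 2 = </s>, 1 = <pad>);
    # shorter chunks are padded to the longest sequence in the batch.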
chunks = ["Hello World", "These are chunks"]
result = tokenize_chunks(chunks)
np.testing.assert_array_equal(
result["input_ids"],
np.array([[0, 35378, 6661, 2, 1, 1], [0, 32255, 621, 7839, 1224, 2]], dtype=np.int64),
)


def test_embed_tokens():
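    # Token-level embeddings; tolerance is loose because model inference is not bit-exact across platforms.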
chunks = ["Hello World", "These are chunks"]
tokens = tokenize_chunks(chunks)
embeddings = embed_tokens(tokens)
assert list(embeddings[0][0][:3]) == pytest.approx(
[0.05907335, 0.11408358, 0.12727071], rel=1e-2
)


def test_embed_chunks():
chunks = ["Hello World"]
expected = [0.008697219, 0.038051583, 0.043976285]
embeddings = embed_chunks(chunks)
assert list(embeddings[0][:3]) == pytest.approx(expected, rel=1e-3)


def test_gen_text_code_semantic(text_en):
result = sct.gen_text_code_semantic(text_en, embedding=True)
assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
assert result["features"][0]["embedding"][:3] == pytest.approx(
[0.03241169825196266, 0.022712377831339836, 0.050273094326257706],
rel=1e-3,
)


def test_cross_lingual_match(text_en, text_de):
a = sct.gen_text_code_semantic(text_en)["iscc"]
assert a == "ISCC:CAA636IXQD736IGJ"
b = sct.gen_text_code_semantic(text_de)["iscc"]
assert b == "ISCC:CAA636IXQD4TMIGL" # hamming distance for the codes is 6 bits


def test_tokenizer_integrity(text_en):
    # Guard against updates that silently change text splitting (and thus tokenizer input):
    # hash every chunk and pin the digest.
    hasher = blake3()
    for _, chunk in split_text(text_en):
hasher.update(chunk.encode("utf-8"))
checksum = hasher.hexdigest()
assert checksum == "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"


def test_soft_hash_text_semantic():
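    # The full soft hash is 48 bytes (96 hex characters).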
result = sct.soft_hash_text_semantic("Hello World")
assert (
result.hex()
== "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
)


def test_shift_resistance(text_en):
a = sct.soft_hash_text_semantic(text_en)
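    # NOTE: the typo "begginging" is kept verbatim; the expected distances below depend on this exact input.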
shifted = "Just put another sentence in the begginging of the text!\n" + text_en
b = sct.soft_hash_text_semantic(shifted)
    # TODO: improve the algorithm with more shift-resistant semantic chunking
# On 256-bit code
assert sct.hamming_distance(a, b) == 6
# On 64-bit code
assert sct.hamming_distance(b[:16], a[:16]) == 1


def test_compress():
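    # compress should round to the given number of decimals and give identical results for float64 and float32 input.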
arr1 = np.array([3.0, 15294.7789, 32977.7])
arr2 = np.array([3.0, 15294.7789, 32977.7], dtype=np.float32)
expected = [3.0, 15294.8, 32977.7]
assert compress(arr1, 1) == expected
assert compress(arr2, 1) == expected


def test_embedding_precision():
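    # precision controls rounding of exported embedding values (4 decimal places here).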
d16 = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
assert d16["features"][0]["embedding"][0] == 0.0087