File size: 8,659 Bytes
b31f748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
from pathlib import Path

import pytest
from blake3 import blake3

import iscc_sct as sct
from iscc_sct.code_semantic_text import (
    split_text,
    tokenize_chunks,
    embed_tokens,
    embed_chunks,
    compress,
)
import numpy as np


# Directory of this test module; used to resolve sample-text fixture files (e.g. en.txt).
HERE = Path(__file__).parent.absolute()

# Sample input used by the granular code-generation tests below.
# NOTE: the exact characters (including leading/trailing newlines) matter —
# the expected ISCC codes, offsets and chunk contents are derived from them.
TEXT = """
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
designed to capture and represent the language agnostic semantic content of text for improved
similarity detection.

The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
engineered to be robust against a broader range of variations and translations of text that cannot
be matched based on lexical similarity.
"""


def test_version():
    """Pin the published package version so accidental bumps fail CI."""
    expected = "0.1.2"
    assert sct.__version__ == expected


def test_code_text_semantic_default():
    """Default call returns the ISCC code plus the character count."""
    sample = HERE / "en.txt"
    expected = {
        "iscc": "ISCC:CAA636IXQD736IGJ",
        "characters": 12076,
    }
    assert sct.code_text_semantic(sample) == expected


def test_code_text_semantic_no_chars():
    """With characters=False the character count is omitted from the result."""
    sample = HERE / "en.txt"
    expected = {"iscc": "ISCC:CAA636IXQD736IGJ"}
    assert sct.code_text_semantic(sample, characters=False) == expected


def test_code_text_semantic_embedding():
    """embedding=True attaches a 384-dimensional vector to the first feature set."""
    result = sct.code_text_semantic(HERE / "en.txt", embedding=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    vector = result["features"][0]["embedding"]
    assert len(vector) == 384


def test_code_text_semantic_features():
    """simprints=True exposes per-chunk similarity fingerprints (spot-check ends)."""
    result = sct.code_text_semantic(HERE / "en.txt", simprints=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    assert result["characters"] == 12076
    prints = result["features"][0]["simprints"]
    assert prints[:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
    assert prints[-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]


def test_code_text_semantic_offsets():
    """offsets=True reports the character start position of each chunk."""
    result = sct.code_text_semantic(HERE / "en.txt", offsets=True)
    first_offsets = result["features"][0]["offsets"][:3]
    assert first_offsets == [0, 277, 612]


def test_code_text_semantic_chunks():
    """contents=True returns the raw text of every chunk (spot-check ends)."""
    result = sct.code_text_semantic(HERE / "en.txt", contents=True)
    chunks = result["features"][0]["contents"]
    assert len(chunks) == 39
    assert chunks[0].startswith("\n Thank ")
    assert chunks[-1].endswith("(Applause)\n")


def test_code_text_semantic_sizes():
    """sizes=True reports the character length of each of the 39 chunks."""
    result = sct.code_text_semantic(HERE / "en.txt", sizes=True)
    expected_sizes = [
        440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446,
        442, 443, 444, 451, 485, 477, 439, 517, 430, 468, 394, 531, 448,
        421, 503, 376, 403, 513, 477, 393, 375, 555, 533, 312, 455, 413,
    ]
    assert result["features"][0]["sizes"] == expected_sizes


def test_gen_text_code_semantic_empty():
    """Empty input is rejected with a descriptive ValueError."""
    with pytest.raises(ValueError) as excinfo:
        sct.gen_text_code_semantic("")
    message = str(excinfo.value)
    assert message == "Input text cannot be empty."


def test_gen_text_code_semantic_granular():
    """Granular output for TEXT: two chunks with simprints, offsets and contents.

    The expected contents below are written as adjacent string literals that the
    parser concatenates; they must match the chunker's output byte-for-byte.
    """
    result = sct.gen_text_code_semantic(
        TEXT,
        simprints=True,
        offsets=True,
        contents=True,
    )
    assert (
        result
        == {
            "characters": 726,
            "iscc": "ISCC:CAARISHPJHEXQAYL",
            "features": [
                {
                    "maintype": "semantic",
                    "subtype": "text",
                    "version": 0,
                    "simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
                    "offsets": [0, 297],
                    "contents": [
                        "\n"
                        "`iscc-sct` is a **proof of concept implementation** of a semantic "
                        "Text-Code for the\n"
                        "[ISCC](https://core.iscc.codes) (*International Standard Content "
                        "Code*). Semantic Text-Codes are\n"
                        "designed to capture and represent the language agnostic semantic "
                        "content of text for improved\n"
                        "similarity detection.\n"
                        "\n",  # NOTE: end of first chunk (see comma :)
                        "\n"
                        "\n"
                        "The ISCC framework already comes with a Text-Code that is based "
                        "on lexical similarity and can match\n"
                        "near duplicates. The ISCC Semantic Text-Code is planned as a new "
                        "additional ISCC-UNIT focused on\n"
                        "capturing a more abstract and broad semantic similarity. As such "
                        "the Semantic Text-Code is\n"
                        "engineered to be robust against a broader range of variations and "
                        "translations of text that cannot\n"
                        "be matched based on lexical similarity.\n",
                    ],
                }
            ],
        }
    )


def test_gen_text_code_semantic_checks_bits():
    """An unsupported bit length must be rejected with ValueError."""
    bad_bits = 99
    with pytest.raises(ValueError):
        sct.gen_text_code_semantic("Test", bits=bad_bits)


def test_split_text(text_en):
    """Spot-check first/last chunk boundaries on the English sample fixture."""
    parts = split_text(text_en)
    _, first_chunk = parts[0]
    _, last_chunk = parts[-1]
    assert first_chunk[:8] == "\n Thank "
    assert last_chunk[:8] == "\n (Laugh"


def test_split_text_override():
    """max_tokens/overlap overrides yield small chunks with overlapping text."""
    sample = "Try some very small and granular text splitting. Use options override for it."
    expected = [
        (0, "Try some very small and granular text "),
        (20, "and granular text splitting. "),
        (49, "Use options override for it."),
    ]
    assert split_text(sample, max_tokens=8, overlap=4) == expected


def test_tokenize_chunks():
    """Tokenizer turns chunks into a padded int64 id matrix with stable ids."""
    token_data = tokenize_chunks(["Hello World", "These are chunks"])
    expected_ids = np.array(
        [[0, 35378, 6661, 2, 1, 1], [0, 32255, 621, 7839, 1224, 2]], dtype=np.int64
    )
    np.testing.assert_array_equal(token_data["input_ids"], expected_ids)


def test_embed_tokens():
    """Token-level embeddings stay numerically stable (spot-check 3 values)."""
    tokens = tokenize_chunks(["Hello World", "These are chunks"])
    vectors = embed_tokens(tokens)
    head = list(vectors[0][0][:3])
    assert head == pytest.approx([0.05907335, 0.11408358, 0.12727071], rel=1e-2)


def test_embed_chunks():
    """Chunk-level embeddings stay numerically stable (spot-check 3 values)."""
    vectors = embed_chunks(["Hello World"])
    head = list(vectors[0][:3])
    assert head == pytest.approx([0.008697219, 0.038051583, 0.043976285], rel=1e-3)


def test_gen_text_code_semantic(text_en):
    """Full pipeline on the English fixture: stable code and embedding head."""
    result = sct.gen_text_code_semantic(text_en, embedding=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    head = result["features"][0]["embedding"][:3]
    expected = [0.03241169825196266, 0.022712377831339836, 0.050273094326257706]
    assert head == pytest.approx(expected, rel=1e-3)


def test_cross_lingual_match(text_en, text_de):
    """English and German versions of the same text yield nearby codes."""
    code_en = sct.gen_text_code_semantic(text_en)["iscc"]
    assert code_en == "ISCC:CAA636IXQD736IGJ"
    code_de = sct.gen_text_code_semantic(text_de)["iscc"]
    # hamming distance for the codes is 6 bits
    assert code_de == "ISCC:CAA636IXQD4TMIGL"


def test_tokenizer_integrity(text_en):
    """Guard against silent changes in text splitting output.

    Hashes every chunk produced for the English fixture so that any update
    breaking tokenizer/splitter compatibility changes the checksum.
    """
    digest = blake3()
    for _, chunk in split_text(text_en):
        digest.update(chunk.encode("utf-8"))
    assert digest.hexdigest() == "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"


def test_soft_hash_text_semantic():
    """The raw semantic soft-hash for a known input is byte-stable."""
    digest = sct.soft_hash_text_semantic("Hello World")
    expected_hex = "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
    assert digest.hex() == expected_hex


def test_shift_resistance(text_en):
    # Prepending a sentence should change the semantic hash only slightly.
    a = sct.soft_hash_text_semantic(text_en)
    shifted = "Just put another sentence in the begginging of the text!\n" + text_en
    b = sct.soft_hash_text_semantic(shifted)
    # TODO improve algorithm with more shift resistant semantic chunking
    # Full digest. NOTE(review): original comment said "256-bit", but
    # test_soft_hash_text_semantic shows a 48-byte (384-bit) digest — confirm.
    assert sct.hamming_distance(a, b) == 6
    # 16-byte prefix. NOTE(review): original comment said "64-bit", but
    # 16 bytes is 128 bits — confirm the intended prefix length.
    assert sct.hamming_distance(b[:16], a[:16]) == 1


def test_compress():
    """compress rounds identically for float64 and float32 input arrays."""
    values = [3.0, 15294.7789, 32977.7]
    rounded = [3.0, 15294.8, 32977.7]
    assert compress(np.array(values), 1) == rounded
    assert compress(np.array(values, dtype=np.float32), 1) == rounded


def test_embedding_precision():
    """precision controls the rounding of exported embedding values."""
    result = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
    first_value = result["features"][0]["embedding"][0]
    assert first_value == 0.0087