File size: 8,659 Bytes
b31f748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
from pathlib import Path

import pytest
from blake3 import blake3

import iscc_sct as sct
from iscc_sct.code_semantic_text import (
    split_text,
    tokenize_chunks,
    embed_tokens,
    embed_chunks,
    compress,
)
import numpy as np


# Directory of this test module; used to resolve sample-text fixture files (e.g. en.txt).
HERE = Path(__file__).parent.absolute()

# Sample input used by the granular code-generation tests below.
# NOTE: the exact characters (including leading/trailing newlines) matter —
# the expected ISCC codes, offsets and chunk contents are derived from them.
TEXT = """
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
designed to capture and represent the language agnostic semantic content of text for improved
similarity detection.

The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
engineered to be robust against a broader range of variations and translations of text that cannot
be matched based on lexical similarity.
"""


def test_version():
    """Pin the published package version so accidental bumps fail CI."""
    expected = "0.1.2"
    assert sct.__version__ == expected


def test_code_text_semantic_default():
    """Default call returns the ISCC code plus the character count."""
    sample = HERE / "en.txt"
    expected = {
        "iscc": "ISCC:CAA636IXQD736IGJ",
        "characters": 12076,
    }
    assert sct.code_text_semantic(sample) == expected


def test_code_text_semantic_no_chars():
    """With characters=False the character count is omitted from the result."""
    sample = HERE / "en.txt"
    expected = {"iscc": "ISCC:CAA636IXQD736IGJ"}
    assert sct.code_text_semantic(sample, characters=False) == expected


def test_code_text_semantic_embedding():
    """embedding=True attaches a 384-dimensional vector to the first feature set."""
    result = sct.code_text_semantic(HERE / "en.txt", embedding=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    vector = result["features"][0]["embedding"]
    assert len(vector) == 384


def test_code_text_semantic_features():
    """simprints=True exposes per-chunk similarity fingerprints (spot-check ends)."""
    result = sct.code_text_semantic(HERE / "en.txt", simprints=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    assert result["characters"] == 12076
    prints = result["features"][0]["simprints"]
    assert prints[:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
    assert prints[-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]


def test_code_text_semantic_offsets():
    """offsets=True reports the character start position of each chunk."""
    result = sct.code_text_semantic(HERE / "en.txt", offsets=True)
    first_offsets = result["features"][0]["offsets"][:3]
    assert first_offsets == [0, 277, 612]


def test_code_text_semantic_chunks():
    """contents=True returns the raw text of every chunk (spot-check ends)."""
    result = sct.code_text_semantic(HERE / "en.txt", contents=True)
    chunks = result["features"][0]["contents"]
    assert len(chunks) == 39
    assert chunks[0].startswith("\n Thank ")
    assert chunks[-1].endswith("(Applause)\n")


def test_code_text_semantic_sizes():
    """sizes=True reports the character length of each of the 39 chunks."""
    result = sct.code_text_semantic(HERE / "en.txt", sizes=True)
    expected_sizes = [
        440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446,
        442, 443, 444, 451, 485, 477, 439, 517, 430, 468, 394, 531, 448,
        421, 503, 376, 403, 513, 477, 393, 375, 555, 533, 312, 455, 413,
    ]
    assert result["features"][0]["sizes"] == expected_sizes


def test_gen_text_code_semantic_empty():
    """Empty input is rejected with a descriptive ValueError."""
    with pytest.raises(ValueError) as excinfo:
        sct.gen_text_code_semantic("")
    message = str(excinfo.value)
    assert message == "Input text cannot be empty."


def test_gen_text_code_semantic_granular():
    """Granular output for TEXT: two chunks with simprints, offsets and contents.

    The expected contents below are written as adjacent string literals that the
    parser concatenates; they must match the chunker's output byte-for-byte.
    """
    result = sct.gen_text_code_semantic(
        TEXT,
        simprints=True,
        offsets=True,
        contents=True,
    )
    assert (
        result
        == {
            "characters": 726,
            "iscc": "ISCC:CAARISHPJHEXQAYL",
            "features": [
                {
                    "maintype": "semantic",
                    "subtype": "text",
                    "version": 0,
                    "simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
                    "offsets": [0, 297],
                    "contents": [
                        "\n"
                        "`iscc-sct` is a **proof of concept implementation** of a semantic "
                        "Text-Code for the\n"
                        "[ISCC](https://core.iscc.codes) (*International Standard Content "
                        "Code*). Semantic Text-Codes are\n"
                        "designed to capture and represent the language agnostic semantic "
                        "content of text for improved\n"
                        "similarity detection.\n"
                        "\n",  # NOTE: end of first chunk (see comma :)
                        "\n"
                        "\n"
                        "The ISCC framework already comes with a Text-Code that is based "
                        "on lexical similarity and can match\n"
                        "near duplicates. The ISCC Semantic Text-Code is planned as a new "
                        "additional ISCC-UNIT focused on\n"
                        "capturing a more abstract and broad semantic similarity. As such "
                        "the Semantic Text-Code is\n"
                        "engineered to be robust against a broader range of variations and "
                        "translations of text that cannot\n"
                        "be matched based on lexical similarity.\n",
                    ],
                }
            ],
        }
    )


def test_gen_text_code_semantic_checks_bits():
    """An unsupported bit length must be rejected with ValueError."""
    bad_bits = 99
    with pytest.raises(ValueError):
        sct.gen_text_code_semantic("Test", bits=bad_bits)


def test_split_text(text_en):
    """Spot-check first/last chunk boundaries on the English sample fixture."""
    parts = split_text(text_en)
    _, first_chunk = parts[0]
    _, last_chunk = parts[-1]
    assert first_chunk[:8] == "\n Thank "
    assert last_chunk[:8] == "\n (Laugh"


def test_split_text_override():
    """max_tokens/overlap overrides yield small chunks with overlapping text."""
    sample = "Try some very small and granular text splitting. Use options override for it."
    expected = [
        (0, "Try some very small and granular text "),
        (20, "and granular text splitting. "),
        (49, "Use options override for it."),
    ]
    assert split_text(sample, max_tokens=8, overlap=4) == expected


def test_tokenize_chunks():
    """Tokenizer turns chunks into a padded int64 id matrix with stable ids."""
    token_data = tokenize_chunks(["Hello World", "These are chunks"])
    expected_ids = np.array(
        [[0, 35378, 6661, 2, 1, 1], [0, 32255, 621, 7839, 1224, 2]], dtype=np.int64
    )
    np.testing.assert_array_equal(token_data["input_ids"], expected_ids)


def test_embed_tokens():
    """Token-level embeddings stay numerically stable (spot-check 3 values)."""
    tokens = tokenize_chunks(["Hello World", "These are chunks"])
    vectors = embed_tokens(tokens)
    head = list(vectors[0][0][:3])
    assert head == pytest.approx([0.05907335, 0.11408358, 0.12727071], rel=1e-2)


def test_embed_chunks():
    """Chunk-level embeddings stay numerically stable (spot-check 3 values)."""
    vectors = embed_chunks(["Hello World"])
    head = list(vectors[0][:3])
    assert head == pytest.approx([0.008697219, 0.038051583, 0.043976285], rel=1e-3)


def test_gen_text_code_semantic(text_en):
    """Full pipeline on the English fixture: stable code and embedding head."""
    result = sct.gen_text_code_semantic(text_en, embedding=True)
    assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
    head = result["features"][0]["embedding"][:3]
    expected = [0.03241169825196266, 0.022712377831339836, 0.050273094326257706]
    assert head == pytest.approx(expected, rel=1e-3)


def test_cross_lingual_match(text_en, text_de):
    """English and German versions of the same text yield nearby codes."""
    code_en = sct.gen_text_code_semantic(text_en)["iscc"]
    assert code_en == "ISCC:CAA636IXQD736IGJ"
    code_de = sct.gen_text_code_semantic(text_de)["iscc"]
    # hamming distance for the codes is 6 bits
    assert code_de == "ISCC:CAA636IXQD4TMIGL"


def test_tokenizer_integrity(text_en):
    """Guard against silent changes in text splitting output.

    Hashes every chunk produced for the English fixture so that any update
    breaking tokenizer/splitter compatibility changes the checksum.
    """
    digest = blake3()
    for _, chunk in split_text(text_en):
        digest.update(chunk.encode("utf-8"))
    assert digest.hexdigest() == "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"


def test_soft_hash_text_semantic():
    """The raw semantic soft-hash for a known input is byte-stable."""
    digest = sct.soft_hash_text_semantic("Hello World")
    expected_hex = "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
    assert digest.hex() == expected_hex


def test_shift_resistance(text_en):
    # Prepending a sentence should change the semantic hash only slightly.
    a = sct.soft_hash_text_semantic(text_en)
    shifted = "Just put another sentence in the begginging of the text!\n" + text_en
    b = sct.soft_hash_text_semantic(shifted)
    # TODO improve algorithm with more shift resistant semantic chunking
    # Full digest. NOTE(review): original comment said "256-bit", but
    # test_soft_hash_text_semantic shows a 48-byte (384-bit) digest — confirm.
    assert sct.hamming_distance(a, b) == 6
    # 16-byte prefix. NOTE(review): original comment said "64-bit", but
    # 16 bytes is 128 bits — confirm the intended prefix length.
    assert sct.hamming_distance(b[:16], a[:16]) == 1


def test_compress():
    """compress rounds identically for float64 and float32 input arrays."""
    values = [3.0, 15294.7789, 32977.7]
    rounded = [3.0, 15294.8, 32977.7]
    assert compress(np.array(values), 1) == rounded
    assert compress(np.array(values, dtype=np.float32), 1) == rounded


def test_embedding_precision():
    """precision controls the rounding of exported embedding values."""
    result = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
    first_value = result["features"][0]["embedding"][0]
    assert first_value == 0.0087