File size: 6,624 Bytes
b31f748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c51bed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import pytest
import iscc_sct as sct
from iscc_sct import utils
from blake3 import blake3


def test_check_integrity(tmp_path):
    # Create a temporary file with known content
    file_path = tmp_path / "testfile.txt"
    content = "This is a test file."
    with open(file_path, "w") as f:
        f.write(content)

    # Generate a correct checksum and then alter it to simulate failure
    hasher = blake3()
    hasher.update(content.encode())
    correct_checksum = hasher.hexdigest()
    assert utils.check_integrity(file_path, correct_checksum) == file_path

    wrong_checksum = correct_checksum + "wrong"  # Deliberately incorrect checksum

    # Test the function with the wrong checksum
    with pytest.raises(RuntimeError) as exc_info:
        utils.check_integrity(file_path, wrong_checksum)

    # Check that the exception message contains expected text
    assert "Failed integrity check" in str(exc_info.value)


def test_hamming_distance_identical():
    a = b"abc"
    b = b"abc"
    assert utils.hamming_distance(a, b) == 0


def test_hamming_distance_different():
    a = b"abc"
    b = b"abd"
    assert utils.hamming_distance(a, b) == 3


def test_hamming_distance_completely_different():
    a = b"\x00"
    b = b"\xff"
    assert utils.hamming_distance(a, b) == 8


def test_hamming_distance_raises_value_error():
    a = b"abc"
    b = b"abcd"
    with pytest.raises(ValueError):
        utils.hamming_distance(a, b)


def test_encode_decode_base32():
    original = b"Hello, World!"
    encoded = utils.encode_base32(original)
    assert isinstance(encoded, str)
    assert encoded == "JBSWY3DPFQQFO33SNRSCC"
    decoded = utils.decode_base32(encoded)
    assert isinstance(decoded, bytes)
    assert decoded == original


def test_encode_decode_base64():
    original = b"Hello, World!"
    encoded = utils.encode_base64(original)
    assert isinstance(encoded, str)
    assert encoded == "SGVsbG8sIFdvcmxkIQ"
    decoded = utils.decode_base64(encoded)
    assert isinstance(decoded, bytes)
    assert decoded == original


def test_encode_decode_edge_cases():
    # Test empty input
    assert utils.encode_base32(b"") == ""
    assert utils.decode_base32("") == b""
    assert utils.encode_base64(b"") == ""
    assert utils.decode_base64("") == b""

    # Test input with padding
    original = b"a"
    assert utils.decode_base32(utils.encode_base32(original)) == original
    assert utils.decode_base64(utils.encode_base64(original)) == original


def test_iscc_distance_different_lengths():
    iscc1 = sct.create("Hello", bits=64).iscc
    iscc2 = sct.create("Hello", bits=96).iscc
    with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
        utils.iscc_distance(iscc1, iscc2)


def test_cosine_similarity_identical():
    a = b"\x00\x00\x00\x00"
    b = b"\x00\x00\x00\x00"
    assert utils.cosine_similarity(a, b) == 100


def test_cosine_similarity_opposite():
    a = b"\x00\x00\x00\x00"
    b = b"\xff\xff\xff\xff"
    assert utils.cosine_similarity(a, b) == -100


def test_cosine_similarity_half_similar():
    a = b"\x00\x00\xff\xff"
    b = b"\x00\x00\x00\x00"
    assert utils.cosine_similarity(a, b) == 0


def test_cosine_similarity_quarter_similar():
    a = b"\x00\xff\xff\xff"
    b = b"\x00\x00\x00\x00"
    assert utils.cosine_similarity(a, b) == -50


def test_cosine_similarity_three_quarter_similar():
    a = b"\x00\x00\x00\xff"
    b = b"\x00\x00\x00\x00"
    assert utils.cosine_similarity(a, b) == 50


def test_cosine_similarity_different_lengths():
    a = b"\x00\x00\x00"
    b = b"\x00\x00\x00\x00"
    with pytest.raises(ValueError, match="The lengths of the two bytes objects must be the same"):
        utils.cosine_similarity(a, b)


def test_granular_similarity():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    # Create two Metadata objects with some matching and non-matching simprints
    metadata_a = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[
                    Feature(simprint="AAECAwQFBgc"),  # Will match
                    Feature(simprint="CAkKCwwNDg8"),  # Will not match
                ]
            )
        ],
    )

    metadata_b = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[
                    Feature(simprint="AAECAwQFBgc"),  # Will match
                    Feature(simprint="EBESExQVFhc"),  # Will not match
                ]
            )
        ],
    )

    # Test with default threshold
    matches = utils.granular_similarity(metadata_a, metadata_b)
    assert len(matches) == 1
    assert matches[0][0].simprint == "AAECAwQFBgc"
    assert matches[0][1] == 100
    assert matches[0][2].simprint == "AAECAwQFBgc"

    # Test with lower threshold
    matches = utils.granular_similarity(metadata_a, metadata_b, threshold=0)
    assert len(matches) == 2  # All combinations should match

    # Test with higher threshold
    matches = utils.granular_similarity(metadata_a, metadata_b, threshold=101)
    assert len(matches) == 0  # No matches should be found


def test_granular_similarity_no_matches():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    metadata_a = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[FeatureSet(simprints=[Feature(simprint="AAECAwQFBgc")])],
    )

    metadata_b = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[FeatureSet(simprints=[Feature(simprint="CAkKCwwNDg8")])],
    )

    matches = utils.granular_similarity(metadata_a, metadata_b)
    assert len(matches) == 0


def test_granular_similarity_multiple_matches():
    from iscc_sct.models import Metadata, FeatureSet, Feature

    metadata_a = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="CAkKCwwNDg8")]
            ),
            FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
        ],
    )

    metadata_b = Metadata(
        iscc="ISCC:KACYPXW563EDNM",
        features=[
            FeatureSet(
                simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="GBkaGxwdHh8")]
            ),
            FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
        ],
    )

    matches = utils.granular_similarity(metadata_a, metadata_b)
    assert len(matches) == 2
    assert {(match[0].simprint, match[2].simprint) for match in matches} == {
        ("AAECAwQFBgc", "AAECAwQFBgc"),
        ("EBESExQVFhc", "EBESExQVFhc"),
    }