# iscc-sct / tests /test_models.py
# titusz's picture
# Synced repo using 'sync_with_huggingface' Github Action
# 73ab668 verified
# raw
# history blame
# 7.11 kB
import pytest
from pydantic import ValidationError
from iscc_sct.models import Metadata, Feature, FeatureSet
import iscc_sct as sct
def test_feature_initialization():
    """Check Feature construction: no arguments fails, given fields read back unchanged."""
    # Constructing without any arguments must be rejected by validation.
    with pytest.raises(ValidationError):
        Feature()

    # Only the simprint supplied: offset and content read back as None.
    minimal = Feature(simprint="XZjeSfdyVi0")
    assert minimal.simprint == "XZjeSfdyVi0"
    assert minimal.offset is None
    assert minimal.content is None

    # Simprint, offset and content all supplied explicitly.
    populated = Feature(simprint="feature", offset=5, content="example text")
    assert populated.simprint == "feature"
    assert populated.offset == 5
    assert populated.content == "example text"
def test_feature_set_initialization():
    """Check the dump of a default FeatureSet when None-valued fields are excluded."""
    expected_dump = {
        "maintype": "semantic",
        "subtype": "text",
        "version": 0,
    }
    default_set = FeatureSet()
    dumped = default_set.model_dump(exclude_none=True)
    assert dumped == expected_dump
def test_sct_meta_initialization():
    """Check Metadata construction with only an ISCC and with all fields populated."""
    # Only the ISCC given: characters and features read back as None.
    bare = Metadata(iscc="ISCC1234567890")
    assert bare.iscc == "ISCC1234567890"
    assert bare.characters is None
    assert bare.features is None

    # ISCC, character count and one feature set (with an embedding) given.
    simprint = Feature(simprint="feature1", offset=0, content="text1")
    feature_sets = [FeatureSet(simprints=[simprint], embedding=[0.1, 0.2])]
    complete = Metadata(
        iscc="ISCC1234567890",
        characters=1000,
        features=feature_sets,
    )
    assert complete.iscc == "ISCC1234567890"
    assert complete.characters == 1000
    assert complete.features == feature_sets
    assert complete.features[0].embedding == [0.1, 0.2]
def test_metadata_to_index_format():
    """Check that to_index_format() turns Feature objects into parallel lists."""
    # Start from Object-Format: simprints are Feature instances.
    object_style_set = FeatureSet(
        simprints=[
            Feature(simprint="feature1", offset=0, size=5, content="text1"),
            Feature(simprint="feature2", offset=5, size=5, content="text2"),
        ]
    )
    source = Metadata(iscc="ISCC1234567890", features=[object_style_set])

    converted = source.to_index_format()
    first_set = converted.features[0]
    assert isinstance(first_set.simprints[0], str)
    assert first_set.simprints == ["feature1", "feature2"]
    assert first_set.offsets == [0, 5]
    assert first_set.sizes == [5, 5]
    assert first_set.contents == ["text1", "text2"]

    # Converting data that is already in Index-Format must not change it.
    reconverted = converted.to_index_format()
    assert reconverted.model_dump() == converted.model_dump()
def test_metadata_to_object_format():
    """Check that to_object_format() turns parallel lists into Feature objects."""
    # Start from Index-Format: simprints, offsets, sizes and contents are parallel lists.
    index_style_set = FeatureSet(
        simprints=["feature1", "feature2"],
        offsets=[0, 5],
        sizes=[5, 5],
        contents=["text1", "text2"],
    )
    source = Metadata(iscc="ISCC1234567890", features=[index_style_set])

    converted = source.to_object_format()
    first_set = converted.features[0]
    leading = first_set.simprints[0]
    assert isinstance(leading, Feature)
    assert leading.simprint == "feature1"
    assert leading.offset == 0
    assert leading.size == 5
    assert leading.content == "text1"
    # The parallel lists are expected to be cleared after conversion.
    assert first_set.offsets is None
    assert first_set.sizes is None
    assert first_set.contents is None

    # Converting data that is already in Object-Format must not change it.
    reconverted = converted.to_object_format()
    assert reconverted.model_dump() == converted.model_dump()
def test_metadata_to_index_format_with_none_simprints():
    """Check to_index_format() on a feature set whose simprints are None."""
    embedding_only = FeatureSet(simprints=None, embedding=[0.1, 0.2])
    source = Metadata(iscc="ISCC1234567890", features=[embedding_only])

    converted = source.to_index_format()
    first_set = converted.features[0]
    assert first_set.simprints is None
    assert first_set.embedding == [0.1, 0.2]
    # The converted model must dump identically to the source.
    assert converted.model_dump() == source.model_dump()
def test_metadata_format_conversion_with_no_features():
    """Check that both conversions leave Metadata without features unchanged."""
    source = Metadata(iscc="ISCC1234567890")
    as_index = source.to_index_format()
    as_object = source.to_object_format()

    # Dump the source after both conversions, matching the original evaluation order.
    baseline = source.model_dump()
    assert as_index.model_dump() == baseline
    assert as_object.model_dump() == baseline
def test_metadata_get_content(text_en):
    """Check that get_content() on a granular result returns the original input text.

    ``text_en`` is supplied from outside this module (presumably a pytest
    fixture with sample text -- confirm in the test configuration).
    """
    granular_meta = sct.create(text_en, granular=True)
    recovered = granular_meta.get_content()
    assert recovered == text_en
def test_metadata_get_content_no_fetures():
    """Check that get_content() returns None when Metadata has no features."""
    featureless = Metadata(iscc="ISCC1234567890")
    recovered = featureless.get_content()
    assert recovered is None
def test_metadata_get_content_index_format():
    """Check that get_content() also works after conversion to Index-Format."""
    granular_meta = sct.create("Hello World", granular=True)
    index_meta = granular_meta.to_index_format()
    assert index_meta.get_content() == "Hello World"
def test_metadata_get_content_no_content():
    """Check that get_content() returns None when created with contents=False."""
    contentless = sct.create("Hello World", granular=True, contents=False)
    recovered = contentless.get_content()
    assert recovered is None
def test_metadata_get_overlaps():
    """Exercise Metadata.get_overlaps() on empty, object-format and index-format data."""
    # No features at all.
    empty_meta = Metadata(iscc="ISCC1234567890")
    assert empty_meta.get_overlaps() == []

    # A feature set is present but carries no simprints.
    no_simprints_meta = Metadata(iscc="ISCC1234567890", features=[FeatureSet()])
    assert no_simprints_meta.get_overlaps() == []

    # Adjacent chunks that do not overlap: one empty-string overlap is expected.
    adjacent_set = FeatureSet(
        simprints=[
            Feature(simprint="feature1", offset=0, content="Hello"),
            Feature(simprint="feature2", offset=5, content="World"),
        ]
    )
    adjacent_meta = Metadata(iscc="ISCC1234567890", features=[adjacent_set])
    assert adjacent_meta.get_overlaps() == [""]

    # Two chunks where the first extends into the second.
    overlapping_set = FeatureSet(
        simprints=[
            Feature(simprint="feature1", offset=0, content="Hello W"),
            Feature(simprint="feature2", offset=5, content="World"),
        ]
    )
    overlapping_meta = Metadata(iscc="ISCC1234567890", features=[overlapping_set])
    assert overlapping_meta.get_overlaps() == [" W"]

    # Three chunks: one overlap per consecutive pair.
    chained_set = FeatureSet(
        simprints=[
            Feature(simprint="feature1", offset=0, content="Hello W"),
            Feature(simprint="feature2", offset=5, content="World!"),
            Feature(simprint="feature3", offset=10, content="! How are you?"),
        ]
    )
    chained_meta = Metadata(iscc="ISCC1234567890", features=[chained_set])
    assert chained_meta.get_overlaps() == [" W", "!"]

    # The same three chunks expressed in Index-Format (parallel lists).
    indexed_set = FeatureSet(
        simprints=["feature1", "feature2", "feature3"],
        offsets=[0, 5, 10],
        contents=["Hello W", "World!", "! How are you?"],
    )
    indexed_meta = Metadata(iscc="ISCC1234567890", features=[indexed_set])
    assert indexed_meta.get_overlaps() == [" W", "!"]

    # Features lacking an offset or a content: no overlaps are reported.
    incomplete_set = FeatureSet(
        simprints=[
            Feature(simprint="feature1", offset=0, content="Hello"),
            Feature(simprint="feature2", content="World"),
            Feature(simprint="feature3", offset=10),
        ]
    )
    incomplete_meta = Metadata(iscc="ISCC1234567890", features=[incomplete_set])
    assert incomplete_meta.get_overlaps() == []