# iscc-sct / iscc_sct/options.py
# titusz's picture
# Synced repo using 'sync_with_huggingface' Github Action
# 73ab668 verified
from dotenv import load_dotenv
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
# Public names of this module.
__all__ = ["SctOptions", "sct_opts"]

# Read a `.env` file (if any) before the settings below are evaluated.
load_dotenv()
class SctOptions(BaseSettings):
    # Options controlling Semantic Text-Code generation.
    #
    # Each field can be supplied through an environment variable carrying the
    # `ISCC_SCT_` prefix or through a UTF-8 encoded `.env` file. Unknown entries
    # are ignored and values are validated again on every attribute assignment
    # (see `model_config` below).

    bits: int = Field(
        default=64,
        ge=32,
        le=256,
        multiple_of=32,
        description="ISCC_SCT_BITS - Default bit-length of generated Semantic Text-Code in bits",
    )

    bits_granular: int = Field(
        default=64,
        ge=32,
        le=256,
        multiple_of=32,
        description="ISCC_SCT_BITS_GRANULAR - Default bit-length of granular features",
    )

    characters: bool = Field(
        default=True,
        description="ISCC_SCT_CHARACTERS - Include document character count",
    )

    embedding: bool = Field(
        default=False,
        description="ISCC_SCT_EMBEDDING - Include global document embedding",
    )

    precision: int = Field(
        default=8,
        description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)",
    )

    simprints: bool = Field(
        default=False,
        description="ISCC_SCT_SIMPRINTS - Include granular feature simprints",
    )

    offsets: bool = Field(
        default=False,
        description="ISCC_SCT_OFFSETS - Include offsets of granular features",
    )

    sizes: bool = Field(
        default=False,
        description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)",
    )

    contents: bool = Field(
        default=False,
        description="ISCC_SCT_CONTENTS - Include granular text chunks",
    )

    max_tokens: int = Field(
        default=127,
        le=127,
        description="ISCC_SCT_MAX_TOKENS - Max tokens per chunk (Default 127)",
    )

    overlap: int = Field(
        default=48,
        description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
    )

    trim: bool = Field(
        default=False,
        description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)",
    )

    model_config = SettingsConfigDict(
        env_prefix="ISCC_SCT_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
        validate_assignment=True,
    )

    def override(self, update=None):
        # type: (dict|None) -> SctOptions
        """
        Build a validated deep copy of these settings with `update` applied.

        :param update: Mapping of field names to new values. A falsy value (such as
            None or an empty dict) produces an unmodified copy.
        :return: The new settings instance; the current instance is not modified.
        """
        clone = self.model_copy(deep=True)
        if update:
            # Assign field by field so that every new value passes through
            # assignment validation (`validate_assignment=True`).
            for name, new_value in update.items():
                setattr(clone, name, new_value)
        return clone
# Module-level default options instance, created once when this module is imported.
sct_opts = SctOptions()