titusz's picture
Synced repo using 'sync_with_huggingface' Github Action
b31f748 verified
raw
history blame
2.03 kB
import argparse
import glob
from pathlib import Path
from loguru import logger
from iscc_sct.main import create
from charset_normalizer import from_bytes
def main():
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
args = parser.parse_args()
if args.path is None:
parser.print_help()
return
if not args.debug:
logger.remove()
for path in glob.glob(args.path):
path = Path(path)
if path.is_file():
logger.debug(f"Processing {path.name}")
with path.open("rb") as file:
data = file.read()
try:
text = data.decode("utf-8")
if not text.strip():
logger.warning(f"SKIPPED empty: {path}")
continue
except UnicodeDecodeError:
logger.debug(f"Could not decode {path.name} as UTF-8.")
charset_match = from_bytes(data).best()
if not charset_match: # pragma: no cover
logger.error(f"SKIPPING {path.name} - failed to detect text encoding")
continue
logger.debug(f"Decode {path.name} with {charset_match.encoding}.")
text = str(charset_match)
sct_meta = create(text, granular=args.granular, bits=args.bits)
if args.granular:
print(repr(sct_meta))
else:
print(sct_meta.iscc)
if __name__ == "__main__": # pragma: no cover
main()