Spaces:

iscc
/

iscc-sct

Running

App Files Files Community

iscc-sct / iscc_sct /cli.py

titusz

Synced repo using 'sync_with_huggingface' Github Action

b31f748 verified 5 months ago

raw

history blame

2.03 kB

	import argparse
	import glob
	from pathlib import Path
	from loguru import logger
	from iscc_sct.main import create
	from charset_normalizer import from_bytes


	def main(argv=None):
	    """
	    Command line entry point: print a Semantic Text-Code for each matching text file.

	    The positional ``path`` argument is expanded with ``glob.glob``; every match
	    that is a regular file is read as bytes, decoded to text, passed to
	    ``create`` and the result is printed to stdout (``repr`` of the result in
	    granular mode, otherwise its ``iscc`` attribute). Files that cannot be
	    decoded or that contain only whitespace are skipped.

	    Unless ``--debug`` is given, ``logger.remove()`` is called before
	    processing, so the skip messages logged here are normally not shown.

	    :param argv: Optional list of command line arguments. ``None`` (default)
	        lets argparse read ``sys.argv[1:]``, which is what a console script needs.
	    :return: None
	    """
	    parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
	    parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
	    parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
	    parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
	    parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
	    args = parser.parse_args(argv)

	    # Without a path there is nothing to process: show usage instead of failing.
	    if args.path is None:
	        parser.print_help()
	        return

	    if not args.debug:
	        logger.remove()

	    for match in glob.glob(args.path):
	        path = Path(match)
	        if not path.is_file():
	            continue
	        logger.debug(f"Processing {path.name}")
	        # read_bytes() closes the file before the (potentially slow) code generation.
	        text = _decode_text(path.read_bytes(), path.name)
	        if text is None:
	            continue
	        # Checked after decoding so that files recovered through the charset
	        # detection fallback are also skipped when they hold no text.
	        if not text.strip():
	            logger.warning(f"SKIPPED empty: {path}")
	            continue
	        sct_meta = create(text, granular=args.granular, bits=args.bits)
	        print(repr(sct_meta) if args.granular else sct_meta.iscc)


	def _decode_text(data, name):
	    """
	    Decode raw file content to text, trying UTF-8 first and charset detection second.

	    :param data: Raw bytes read from the file.
	    :param name: File name, used only in log messages.
	    :return: The decoded text, or None if no encoding could be detected.
	    """
	    try:
	        return data.decode("utf-8")
	    except UnicodeDecodeError:
	        logger.debug(f"Could not decode {name} as UTF-8.")
	    charset_match = from_bytes(data).best()
	    if not charset_match:  # pragma: no cover
	        logger.error(f"SKIPPING {name} - failed to detect text encoding")
	        return None
	    logger.debug(f"Decode {name} with {charset_match.encoding}.")
	    return str(charset_match)


	# Run the command line interface only when this module is executed as a script.
	if __name__ == "__main__":  # pragma: no cover
	    main()