File size: 2,032 Bytes
b31f748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import argparse
import glob
from pathlib import Path
from loguru import logger
from iscc_sct.main import create
from charset_normalizer import from_bytes


def main():
    """Command-line entry point: print a Semantic Text-Code for each matching text file.

    Parses the command line for a path (glob patterns allowed) and the ``--bits``,
    ``--granular`` and ``--debug`` options. Without a path the usage help is printed and
    the function returns. Otherwise every regular file matched by the pattern is read as
    bytes, decoded to text and passed to ``create``. The result is printed to stdout:
    the ``repr`` of the returned object in granular mode, its ``iscc`` attribute otherwise.

    Files whose decoded text is empty or whitespace-only are skipped regardless of the
    encoding they were decoded with, as are files whose encoding cannot be detected.
    """
    parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
    parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
    parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
    parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
    parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
    args = parser.parse_args()

    if args.path is None:
        parser.print_help()
        return

    if not args.debug:
        # Silence log output unless debugging was explicitly requested.
        logger.remove()

    for match in glob.glob(args.path):
        path = Path(match)
        if not path.is_file():
            continue

        logger.debug(f"Processing {path.name}")
        # Read everything up front so the file handle is closed before processing starts.
        data = path.read_bytes()

        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to charset detection.
            logger.debug(f"Could not decode {path.name} as UTF-8.")
            charset_match = from_bytes(data).best()
            if not charset_match:  # pragma: no cover
                logger.error(f"SKIPPING {path.name} - failed to detect text encoding")
                continue
            logger.debug(f"Decode {path.name} with {charset_match.encoding}.")
            text = str(charset_match)

        # Checked after decoding so that blank files in any encoding are skipped,
        # not only those that happen to be valid UTF-8.
        if not text.strip():
            logger.warning(f"SKIPPED empty: {path}")
            continue

        sct_meta = create(text, granular=args.granular, bits=args.bits)
        if args.granular:
            print(repr(sct_meta))
        else:
            print(sct_meta.iscc)


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":  # pragma: no cover
    main()