iscc-sct / tests /freeze_tokenizer.py
titusz's picture
Synced repo using 'sync_with_huggingface' Github Action
b31f748 verified
raw
history blame
432 Bytes
"""Helper script do dump/freeze the current tokenizer"""
from tokenizers import Tokenizer
from pathlib import Path
# Absolute path of the directory containing this script; used as the anchor
# for locating sibling directories regardless of the current working directory.
HERE = Path(__file__).absolute().parent
def main():
    """Load the pretrained tokenizer and save it as ``iscc_sct/tokenizer.json``.

    The tokenizer is obtained via ``Tokenizer.from_pretrained`` and serialized
    without pretty-printing to a path resolved relative to the parent of the
    directory containing this script.
    """
    pretrained = Tokenizer.from_pretrained(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    # Resolve the destination relative to this script so the result does not
    # depend on the current working directory.
    target = HERE.parent / "iscc_sct/tokenizer.json"
    pretrained.save(target.as_posix(), pretty=False)
# Run the freeze only when executed as a script, not when imported.
if __name__ == "__main__":
    main()