File size: 432 Bytes
b31f748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
"""Helper script to dump/freeze the current tokenizer."""

from tokenizers import Tokenizer
from pathlib import Path


# Absolute path of the directory that contains this script.
HERE = Path(__file__).parent.absolute()


def main():
    """Load the pretrained tokenizer and write it out as a JSON file.

    The tokenizer is looked up by its model identifier and saved, without
    pretty-printing, to ``iscc_sct/tokenizer.json`` under the parent of the
    directory that contains this script.
    """
    frozen = Tokenizer.from_pretrained(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    target = HERE.parent / "iscc_sct/tokenizer.json"
    # ``save`` is handed a plain string path with forward slashes.
    frozen.save(target.as_posix(), pretty=False)


# Run the dump only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()