dhigurashi committed on
Commit
b0f6037
·
1 Parent(s): 14a911c

support transformers==4.34.0

Browse files
Files changed (1) hide show
  1. tokenization_plamo.py +6 -14
tokenization_plamo.py CHANGED
@@ -5,7 +5,6 @@ from shutil import copyfile
5
  from typing import Any, Dict, List, Optional, Tuple
6
 
7
  import sentencepiece as spm
8
- import transformers
9
  from transformers.tokenization_utils import PreTrainedTokenizer
10
  from transformers.utils import logging
11
 
@@ -35,6 +34,12 @@ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
35
  kwargs["add_bos_token"] = False
36
  if "add_eos_token" not in kwargs:
37
  kwargs["add_eos_token"] = False
 
 
 
 
 
 
38
 
39
  super().__init__(
40
  vocab_file=vocab_file,
@@ -50,15 +55,6 @@ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
50
  **kwargs,
51
  )
52
 
53
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
54
- self.vocab_file = vocab_file
55
- self.add_bos_token = kwargs["add_bos_token"]
56
- self.add_eos_token = kwargs["add_eos_token"]
57
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
58
- self.sp_model.Load(vocab_file)
59
-
60
- self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
61
-
62
  # the functions below are copied from hf transformers LlamaTokenizer's implementation to fix the behaviour of the tokenizer
63
  # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/llama/tokenization_llama.py
64
 
@@ -155,7 +151,3 @@ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
155
  fi.write(content_spiece_model)
156
 
157
  return (out_vocab_file,)
158
-
159
-
160
- class PlamoConfig(transformers.LlamaConfig): # type: ignore
161
- model_type = "plamo"
 
5
  from typing import Any, Dict, List, Optional, Tuple
6
 
7
  import sentencepiece as spm
 
8
  from transformers.tokenization_utils import PreTrainedTokenizer
9
  from transformers.utils import logging
10
 
 
34
  kwargs["add_bos_token"] = False
35
  if "add_eos_token" not in kwargs:
36
  kwargs["add_eos_token"] = False
37
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
38
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
39
+ self.sp_model.Load(vocab_file)
40
+ self.vocab_file = vocab_file
41
+ self.add_bos_token = kwargs["add_bos_token"]
42
+ self.add_eos_token = kwargs["add_eos_token"]
43
 
44
  super().__init__(
45
  vocab_file=vocab_file,
 
55
  **kwargs,
56
  )
57
 
 
 
 
 
 
 
 
 
 
58
  # the functions below are copied from hf transformers LlamaTokenizer's implementation to fix the behaviour of the tokenizer
59
  # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/llama/tokenization_llama.py
60
 
 
151
  fi.write(content_spiece_model)
152
 
153
  return (out_vocab_file,)