KoichiYasuoka committed on
Commit
35645d0
·
1 Parent(s): c6f55b9

juman separated

Browse files
Files changed (3) hide show
  1. juman.py +49 -0
  2. tokenizer_config.json +1 -1
  3. ud.py +1 -48
juman.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from transformers import DebertaV2TokenizerFast
3
+ from transformers.models.bert_japanese.tokenization_bert_japanese import MecabTokenizer
4
try:
    from transformers.utils import cached_file
except ImportError:
    # `transformers.utils.cached_file` is not importable here (presumably an
    # older transformers release), so build an equivalent from the helpers in
    # `transformers.file_utils`.
    from transformers.file_utils import cached_path, hf_bucket_url

    def cached_file(x, y):
        """Return a local path for file ``y`` of model ``x``.

        If ``x`` is an existing directory the file is looked up inside it;
        otherwise ``x`` is passed with ``y`` to ``hf_bucket_url`` and the
        resulting URL is resolved through ``cached_path``.
        """
        if os.path.isdir(x):
            return os.path.join(x, y)
        return cached_path(hf_bucket_url(x, y))
9
+
10
class MecabPreTokenizer(MecabTokenizer):
    """MecabTokenizer exposing a ``pre_tokenize`` hook that splits on MeCab tokens."""

    def mecab_split(self, i, normalized_string):
        """Cut ``normalized_string`` into the pieces found by ``self.tokenize``.

        ``i`` is accepted but not used. Each token is searched for in the
        string form of ``normalized_string`` starting from the end of the
        previous match; tokens that cannot be found are skipped.
        Returns the list of slices of ``normalized_string`` (presumably a
        ``tokenizers.NormalizedString`` -- TODO confirm), in order.
        """
        text = str(normalized_string)
        spans = []
        cursor = 0
        for token in self.tokenize(text):
            start = text.find(token, cursor)
            if start < 0:
                # Token not present after the cursor: drop it, keep the cursor.
                continue
            cursor = start + len(token)
            # A span ending at offset 0 (empty match at the very start) is dropped.
            if cursor > 0:
                spans.append((start, cursor))
        return [normalized_string[start:end] for start, end in spans]

    def pre_tokenize(self, pretok):
        """Split ``pretok`` in place using :meth:`mecab_split` as the callback."""
        pretok.split(self.mecab_split)
22
+
23
class JumanDebertaV2TokenizerFast(DebertaV2TokenizerFast):
    """DebertaV2TokenizerFast that pre-tokenizes with MeCab (Juman dictionary) and then Metaspace."""

    def __init__(self, **kwargs):
        """Build the tokenizer and install the MeCab + Metaspace pre-tokenizer.

        Uses the dictionary at ``/var/lib/mecab/dic/juman-utf8`` with
        ``/etc/mecabrc`` when both exist. Otherwise it extracts
        ``mecab-jumandic-utf8.zip`` (resolved via ``cached_file`` from
        ``self.name_or_path``) into a temporary directory and writes a
        ``mecabrc`` file there.
        """
        from tokenizers.pre_tokenizers import PreTokenizer, Metaspace, Sequence
        super().__init__(**kwargs)
        d, r = "/var/lib/mecab/dic/juman-utf8", "/etc/mecabrc"
        if not (os.path.isdir(d) and os.path.isfile(r)):
            import zipfile
            import tempfile
            # Keep the TemporaryDirectory object on self so the extracted
            # dictionary is not cleaned up while this tokenizer is alive.
            self.dicdir = tempfile.TemporaryDirectory()
            d = self.dicdir.name
            with zipfile.ZipFile(cached_file(self.name_or_path, "mecab-jumandic-utf8.zip")) as z:
                z.extractall(d)
            r = os.path.join(d, "mecabrc")
            with open(r, "w", encoding="utf-8") as w:
                print("dicdir =", d, file=w)
        mecab = MecabPreTokenizer(mecab_dic=None, mecab_option="-d " + d + " -r " + r)
        self.custom_pre_tokenizer = Sequence([PreTokenizer.custom(mecab), Metaspace()])
        self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer

    def save_pretrained(self, save_directory, **kwargs):
        """Save the tokenizer plus ``juman.py`` and the Juman dictionary zip.

        The custom pre-tokenizer is swapped for a plain ``Metaspace`` while
        the parent class saves (presumably because a Python-defined custom
        pre-tokenizer cannot be serialized -- TODO confirm) and is always
        restored afterwards, even if saving fails.

        Returns whatever the parent ``save_pretrained`` returns.
        """
        import shutil
        from tokenizers.pre_tokenizers import Metaspace
        self._auto_map = {"AutoTokenizer": [None, "juman.JumanDebertaV2TokenizerFast"]}
        self._tokenizer.pre_tokenizer = Metaspace()
        try:
            saved = super().save_pretrained(save_directory, **kwargs)
        finally:
            # Never leave the live tokenizer without its MeCab pre-tokenizer.
            self._tokenizer.pre_tokenizer = self.custom_pre_tokenizer
        sources = (
            (os.path.abspath(__file__), "juman.py"),
            (cached_file(self.name_or_path, "mecab-jumandic-utf8.zip"), "mecab-jumandic-utf8.zip"),
        )
        for source, name in sources:
            try:
                shutil.copy(source, os.path.join(save_directory, name))
            except shutil.SameFileError:
                # Saving into the directory the file already lives in: nothing to copy.
                pass
        return saved
49
+
tokenizer_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "auto_map": {"AutoTokenizer":[null,"ud.JumanDebertaV2TokenizerFast"]},
3
  "bos_token": "[CLS]",
4
  "cls_token": "[CLS]",
5
  "do_lower_case": false,
 
1
  {
2
+ "auto_map": {"AutoTokenizer":[null,"juman.JumanDebertaV2TokenizerFast"]},
3
  "bos_token": "[CLS]",
4
  "cls_token": "[CLS]",
5
  "do_lower_case": false,
ud.py CHANGED
@@ -1,11 +1,4 @@
1
- import os
2
- from transformers import TokenClassificationPipeline,DebertaV2TokenizerFast
3
- from transformers.models.bert_japanese.tokenization_bert_japanese import MecabTokenizer
4
- try:
5
- from transformers.utils import cached_file
6
- except:
7
- from transformers.file_utils import cached_path,hf_bucket_url
8
- cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
9
 
10
  class UniversalDependenciesPipeline(TokenClassificationPipeline):
11
  def _forward(self,model_inputs):
@@ -67,43 +60,3 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
67
  i=y[numpy.nanargmax(z[x[k[-1]],y] if k[-1]<len(x) else z[y,y])]
68
  h[i]=x[k[-1]] if k[-1]<len(x) else i
69
  return h
70
-
71
- class MecabPreTokenizer(MecabTokenizer):
72
- def mecab_split(self,i,normalized_string):
73
- t=str(normalized_string)
74
- z=[]
75
- e=0
76
- for c in self.tokenize(t):
77
- s=t.find(c,e)
78
- e=e if s<0 else s+len(c)
79
- z.append((0,0) if s<0 else (s,e))
80
- return [normalized_string[s:e] for s,e in z if e>0]
81
- def pre_tokenize(self,pretok):
82
- pretok.split(self.mecab_split)
83
-
84
- class JumanDebertaV2TokenizerFast(DebertaV2TokenizerFast):
85
- def __init__(self,**kwargs):
86
- from tokenizers.pre_tokenizers import PreTokenizer,Metaspace,Sequence
87
- super().__init__(**kwargs)
88
- d,r="/var/lib/mecab/dic/juman-utf8","/etc/mecabrc"
89
- if not (os.path.isdir(d) and os.path.isfile(r)):
90
- import zipfile
91
- import tempfile
92
- self.dicdir=tempfile.TemporaryDirectory()
93
- d=self.dicdir.name
94
- with zipfile.ZipFile(cached_file(self.name_or_path,"mecab-jumandic-utf8.zip")) as z:
95
- z.extractall(d)
96
- r=os.path.join(d,"mecabrc")
97
- with open(r,"w",encoding="utf-8") as w:
98
- print("dicdir =",d,file=w)
99
- self.custom_pre_tokenizer=Sequence([PreTokenizer.custom(MecabPreTokenizer(mecab_dic=None,mecab_option="-d "+d+" -r "+r)),Metaspace()])
100
- self._tokenizer.pre_tokenizer=self.custom_pre_tokenizer
101
- def save_pretrained(self,save_directory,**kwargs):
102
- import shutil
103
- from tokenizers.pre_tokenizers import Metaspace
104
- self._auto_map={"AutoTokenizer":[None,"ud.JumanDebertaV2TokenizerFast"]}
105
- self._tokenizer.pre_tokenizer=Metaspace()
106
- super().save_pretrained(save_directory,**kwargs)
107
- self._tokenizer.pre_tokenizer=self.custom_pre_tokenizer
108
- shutil.copy(os.path.abspath(__file__),os.path.join(save_directory,"ud.py"))
109
- shutil.copy(cached_file(self.name_or_path,"mecab-jumandic-utf8.zip"),os.path.join(save_directory,"mecab-jumandic-utf8.zip"))
 
1
+ from transformers import TokenClassificationPipeline
 
 
 
 
 
 
 
2
 
3
  class UniversalDependenciesPipeline(TokenClassificationPipeline):
4
  def _forward(self,model_inputs):
 
60
  i=y[numpy.nanargmax(z[x[k[-1]],y] if k[-1]<len(x) else z[y,y])]
61
  h[i]=x[k[-1]] if k[-1]<len(x) else i
62
  return h