KoichiYasuoka committed on
Commit
1a1a985
·
1 Parent(s): 03cac51

exclude pytextspan

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. ud.py +10 -3
README.md CHANGED
@@ -28,4 +28,4 @@ nlp=pipeline("universal-dependencies","KoichiYasuoka/deberta-base-japanese-juman
28
  print(nlp("全学年にわたって小学校の国語の教科書に挿し絵が用いられている"))
29
  ```
30
 
31
- [fugashi](https://pypi.org/project/fugashi) and [pytextspan](https://pypi.org/project/pytextspan) are required.
 
28
  print(nlp("全学年にわたって小学校の国語の教科書に挿し絵が用いられている"))
29
  ```
30
 
31
+ [fugashi](https://pypi.org/project/fugashi) is required.
ud.py CHANGED
@@ -68,10 +68,17 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
68
 
69
  class MecabPreTokenizer(MecabTokenizer):
70
  def mecab_split(self,i,normalized_string):
71
- import textspan
72
  t=str(normalized_string)
73
- k=self.tokenize(t)
74
- return [normalized_string[s:e] for c in textspan.get_original_spans(k,t) for s,e in c]
 
 
 
 
 
 
 
 
75
  def pre_tokenize(self,pretok):
76
  pretok.split(self.mecab_split)
77
 
 
68
 
69
  class MecabPreTokenizer(MecabTokenizer):
70
  def mecab_split(self,i,normalized_string):
 
71
  t=str(normalized_string)
72
+ z=[]
73
+ e=0
74
+ for c in self.tokenize(t):
75
+ s=t.find(c,e)
76
+ if s<0:
77
+ z.append((0,0))
78
+ else:
79
+ e=s+len(c)
80
+ z.append((s,e))
81
+ return [normalized_string[s:e] for s,e in z]
82
  def pre_tokenize(self,pretok):
83
  pretok.split(self.mecab_split)
84