KoichiYasuoka committed on
Commit
d8ebfde
·
1 Parent(s): a31ff2e

initial release

Browse files
Files changed (9) hide show
  1. README.md +58 -0
  2. config.json +0 -0
  3. maker.py +59 -0
  4. merges.txt +0 -0
  5. pytorch_model.bin +3 -0
  6. special_tokens_map.json +15 -0
  7. tokenizer.json +0 -0
  8. tokenizer_config.json +15 -0
  9. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - "la"
4
+ tags:
5
+ - "latin"
6
+ - "token-classification"
7
+ - "pos"
8
+ - "dependency-parsing"
9
+ datasets:
10
+ - "universal_dependencies"
11
+ license: "cc-by-sa-4.0"
12
+ pipeline_tag: "token-classification"
13
+ widget:
14
+ - text: "deus videt te non sentientem"
15
+ ---
16
+
17
+ # roberta-base-latin-ud-goeswith
18
+
19
+ ## Model Description
20
+
21
+ This is a RoBERTa model pre-trained on CC-100 Latin texts for POS-tagging and dependency-parsing (using `goeswith` for subwords), derived from [roberta-base-latin-v2](https://huggingface.co/ClassCat/roberta-base-latin-v2).
22
+
23
+ ## How to Use
24
+
25
+ ```py
26
class UDgoeswith(object):
    """POS-tag and dependency-parse raw text with a token-classification model.

    Calling an instance with a string returns the analysis as CoNLL-U text,
    one line per subword token produced by the tokenizer.
    """

    def __init__(self, bert):
        """Load the tokenizer and model identified by ``bert``.

        ``bert`` is passed unchanged to ``from_pretrained`` of both
        ``AutoTokenizer`` and ``AutoModelForTokenClassification``.
        """
        from transformers import AutoTokenizer, AutoModelForTokenClassification
        self.tokenizer = AutoTokenizer.from_pretrained(bert)
        self.model = AutoModelForTokenClassification.from_pretrained(bert)

    def __call__(self, text):
        """Return the CoNLL-U analysis of ``text`` as a string.

        The result starts with a ``# text = ...`` comment line and ends with
        an empty line.  Each token line has ten tab-separated columns; the
        predicted label ``A|B...|C`` is split on ``|`` so that ``A`` fills
        column 4, the middle parts fill column 6 and ``C`` fills column 8.
        When the tokenizer yields no tokens besides the two special ones
        (e.g. for an empty string) only the comment line is returned.
        """
        import numpy
        import torch
        import ufal.chu_liu_edmonds

        encoded = self.tokenizer(text, return_offsets_mapping=True)
        ids = encoded["input_ids"]
        n = len(ids) - 1
        header = "# text = " + text + "\n"
        if n < 2:
            # Nothing to parse: an empty batch must not be sent to the model.
            return header + "\n"

        # One input row per token: that token is replaced by the mask token
        # and its original id is appended at the end of the row.
        mask = self.tokenizer.mask_token_id
        batch = [ids[0:i] + [mask] + ids[i + 1:] + [ids[i]] for i in range(1, n)]
        with torch.no_grad():
            output = self.model(input_ids=torch.tensor(batch))
        logits = output.logits.numpy()[:, 1:n, :]
        # Label id 0 never takes part in the maximisation below
        # (presumably it is the placeholder "no relation" label - confirm
        # against the model's id2label).
        logits[:, :, 0] = numpy.nan

        # m[d, h]: best score for token d when token h was the masked one;
        # p[d, h]: the label id achieving that score.
        m = numpy.full((n, n), numpy.nan)
        m[1:, 1:] = numpy.nanmax(logits, axis=2).transpose()
        p = numpy.zeros((n, n), dtype=int)  # label ids are used as dict keys
        p[1:, 1:] = numpy.nanargmax(logits, axis=2).transpose()
        for i in range(1, n):
            # The diagonal (token scored while itself masked) is moved to
            # column 0, and the diagonal is then disabled.
            m[i, 0], m[i, i], p[i, 0] = m[i, i], numpy.nan, p[i, i]
        heads = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]

        # Character spans of the real tokens (special tokens have empty spans).
        spans = [(s, e) for s, e in encoded["offset_mapping"] if s < e]
        id2label = self.model.config.id2label
        lines = [header]
        for i, (s, e) in enumerate(spans, 1):
            q = id2label[int(p[i, heads[i]])].split("|")
            # "_" only when a gap separates this token from the next one;
            # the last token always gets "SpaceAfter=No".
            misc = "_" if i < len(spans) and e < spans[i][0] else "SpaceAfter=No"
            lines.append("\t".join([
                str(i), text[s:e], "_", q[0], "_", "|".join(q[1:-1]),
                str(heads[i]), q[-1], "_", misc,
            ]) + "\n")
        return "".join(lines) + "\n"


nlp = UDgoeswith("KoichiYasuoka/roberta-base-latin-ud-goeswith")
print(nlp("deus videt te non sentientem"))
56
+ ```
57
+
58
+ [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/) is required.
config.json ADDED
The diff for this file is too large to render. See raw diff
 
maker.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#! /usr/bin/python3
# Fetch the UD Latin treebanks and concatenate their splits into
# train.conllu / dev.conllu / test.conllu in the current directory.
import os

src = "ClassCat/roberta-base-latin-v2"
tgt = "KoichiYasuoka/roberta-base-latin-ud-goeswith"

for url in (
    "https://github.com/UniversalDependencies/UD_Latin-ITTB",
    "https://github.com/UniversalDependencies/UD_Latin-LLCT",
):
    # Shallow-clone each treebank unless its directory already exists.
    os.system(f"test -d {os.path.basename(url)} || git clone --depth=1 {url}")

# Merge the same-named split of every cloned treebank into one file per split.
os.system("for F in train dev test ; do cat UD_Latin-*/*-$F.conllu > $F.conllu ; done")
10
class UDgoeswithDataset(object):
    """Token-classification samples built from a CoNLL-U file.

    Every sentence contributes one plain sample (all tokens labelled
    ``UPOS|FEATS|DEPREL``) plus one sample per subword in which that subword
    is replaced by the mask token, its id is appended after the separator,
    and only the tokens whose head is the masked subword keep their label.
    Extra subwords of a word are labelled ``X|_|goeswith`` and attached to
    the word's first subword.

    Attributes:
        ids: list of input-id lists, one per sample.
        tags: list of label-string lists, one per sample.
        label2id: mapping from label string to integer id.
    """

    # Placeholder label for special tokens and for tokens that are not
    # dependents of the masked subword.
    _DEP = "-|_|dep"

    def __init__(self, conllu, tokenizer):
        """Read ``conllu`` and build the samples with ``tokenizer``.

        ``tokenizer`` must expose ``cls_token_id``, ``sep_token_id``,
        ``mask_token_id`` and ``model_max_length`` and must be callable on a
        list of words with ``add_special_tokens=False``, returning a mapping
        whose ``"input_ids"`` entry holds one id list per word.
        """
        self.ids, self.tags, label = [], [], set()
        with open(conllu, "r", encoding="utf-8") as r:
            c = []
            for s in r:
                t = s.split("\t")
                # Only plain word lines are kept: ten columns and a purely
                # decimal ID (this skips comments, ranges and empty nodes).
                if len(t) == 10 and t[0].isdecimal():
                    c.append(t)
                elif c:
                    label.update(self._add_sentence(c, tokenizer))
                    c = []
            if c:
                # The file ended without a blank line after the last
                # sentence; do not drop that sentence.
                label.update(self._add_sentence(c, tokenizer))
        self.label2id = {l: i for i, l in enumerate(sorted(label))}

    def _add_sentence(self, rows, tokenizer):
        """Append the samples of one sentence; return the labels it uses.

        Returns an empty set when the sentence is skipped for being too long.
        """
        cls, sep, msk = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.mask_token_id
        dep = self._DEP
        pieces = tokenizer([t[1] for t in rows], add_special_tokens=False)["input_ids"]
        # Expand to one row per subword: each additional subword becomes a
        # "goeswith" row that shares the word's ID and points at it as head.
        c = []
        for t, w in zip(rows, pieces):
            c.append(t)
            c.extend([t[0], "_", "_", "X", "_", "_", t[0], "goeswith", "_", "_"]
                     for _ in range(1, len(w)))
        # Translate HEAD ids into 1-based subword positions (first subword of
        # the head word); a root row points at itself.
        y = ["0"] + [t[0] for t in c]
        h = [i if t[6] == "0" else y.index(t[6]) for i, t in enumerate(c, 1)]
        p = [t[3] + "|" + t[5] + "|" + t[7] for t in c]
        v = [k for w in pieces for k in w]
        if len(v) >= tokenizer.model_max_length - 3:
            return set()
        self.ids.append([cls] + v + [sep])
        self.tags.append([dep] + p + [dep])
        for i, k in enumerate(v):
            self.ids.append([cls] + v[0:i] + [msk] + v[i + 1:] + [sep, k])
            self.tags.append([dep] + [t if h[j] == i + 1 else dep for j, t in enumerate(p)] + [dep, dep])
        return set(p) | {dep}

    def __call__(*args):
        """Give all datasets in ``args`` one shared label mapping and return it.

        Invoked as ``first(second, third)``, so ``args`` holds the instance
        itself followed by the other datasets.
        """
        label = set()
        for t in args:
            label.update(t.label2id)
        lid = {l: i for i, l in enumerate(sorted(label))}
        for t in args:
            t.label2id = lid
        return lid

    def __len__(self):
        """Return the number of samples."""
        return len(self.ids)

    def __getitem__(self, i):
        """Return sample ``i`` as ``{"input_ids": [...], "labels": [...]}``."""
        return {"input_ids": self.ids[i], "labels": [self.label2id[t] for t in self.tags[i]]}
45
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from tokenizers.pre_tokenizers import ByteLevel

# Tokenizer of the base model, with its pre-tokenizer replaced by a
# byte-level one that adds a prefix space (presumably so that words
# tokenized one at a time get the same pieces as in running text).
tkz = AutoTokenizer.from_pretrained(src)
tkz.backend_tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
# NOTE(review): 514 presumably mirrors the base model's position-embedding
# size; confirm that the longest sample the dataset admits actually fits.
tkz.model_max_length = 514

trainDS = UDgoeswithDataset("train.conllu", tkz)
devDS = UDgoeswithDataset("dev.conllu", tkz)
testDS = UDgoeswithDataset("test.conllu", tkz)
# Calling one dataset with the others gives all three a shared label mapping.
lid = trainDS(devDS, testDS)

cfg = AutoConfig.from_pretrained(
    src,
    num_labels=len(lid),
    label2id=lid,
    id2label={index: name for name, index in lid.items()},
)
arg = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=32,
    output_dir="/tmp",
    overwrite_output_dir=True,
    save_total_limit=2,
    evaluation_strategy="epoch",
    learning_rate=5e-05,
    warmup_ratio=0.1,
)
trn = Trainer(
    args=arg,
    data_collator=DataCollatorForTokenClassification(tkz),
    model=AutoModelForTokenClassification.from_pretrained(src, config=cfg),
    train_dataset=trainDS,
    eval_dataset=devDS,
)
trn.train()
# Write the fine-tuned model and the tokenizer to the target directory.
trn.save_model(tgt)
tkz.save_pretrained(tgt)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b13f895f382b674d4cb369aee50ff34804ecafddf30f676f675355a3102acf
3
+ size 529322353
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "keep_accents": true,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 514,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "RobertaTokenizerFast",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff