PereLluis13
committed on
Commit
·
1501279
1
Parent(s):
cbf1f0e
Update README.md
Browse files
README.md
CHANGED
@@ -32,23 +32,25 @@ from transformers import pipeline
|
|
32 |
|
33 |
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
|
34 |
# We need to use the tokenizer manually since we need special tokens.
|
35 |
-
extracted_text = triplet_extractor.tokenizer.
|
36 |
-
print(extracted_text)
|
37 |
# Function to parse the generated text and extract the triplets
|
38 |
def extract_triplets(text):
|
39 |
triplets = []
|
40 |
-
relation = ''
|
41 |
-
|
|
|
|
|
42 |
if token == "<triplet>":
|
43 |
current = 't'
|
44 |
if relation != '':
|
45 |
-
triplets.append((
|
46 |
relation = ''
|
47 |
subject = ''
|
48 |
elif token == "<subj>":
|
49 |
current = 's'
|
50 |
if relation != '':
|
51 |
-
triplets.append((
|
52 |
object_ = ''
|
53 |
elif token == "<obj>":
|
54 |
current = 'o'
|
@@ -60,32 +62,34 @@ def extract_triplets(text):
|
|
60 |
object_ += ' ' + token
|
61 |
elif current == 'o':
|
62 |
relation += ' ' + token
|
63 |
-
|
|
|
64 |
return triplets
|
65 |
-
extracted_triplets = extract_triplets(extracted_text)
|
66 |
print(extracted_triplets)
|
67 |
```
|
68 |
|
69 |
## Model and Tokenizer using transformers
|
70 |
|
71 |
```python3
|
72 |
-
|
73 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
74 |
|
75 |
def extract_triplets(text):
|
76 |
triplets = []
|
77 |
-
relation = ''
|
78 |
-
|
|
|
|
|
79 |
if token == "<triplet>":
|
80 |
current = 't'
|
81 |
if relation != '':
|
82 |
-
triplets.append((
|
83 |
relation = ''
|
84 |
subject = ''
|
85 |
elif token == "<subj>":
|
86 |
current = 's'
|
87 |
if relation != '':
|
88 |
-
triplets.append((
|
89 |
object_ = ''
|
90 |
elif token == "<obj>":
|
91 |
current = 'o'
|
@@ -97,7 +101,8 @@ def extract_triplets(text):
|
|
97 |
object_ += ' ' + token
|
98 |
elif current == 'o':
|
99 |
relation += ' ' + token
|
100 |
-
|
|
|
101 |
return triplets
|
102 |
|
103 |
# Load model and tokenizer
|
|
|
32 |
|
33 |
triplet_extractor = pipeline('text2text-generation', model='Babelscape/rebel-large', tokenizer='Babelscape/rebel-large')
|
34 |
# We need to use the tokenizer manually since we need special tokens.
|
35 |
+
extracted_text = triplet_extractor.tokenizer.batch_decode(triplet_extractor("Punta Cana is a resort town in the municipality of Higuey, in La Altagracia Province, the eastern most province of the Dominican Republic", return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"])
|
36 |
+
print(extracted_text[0])
|
37 |
# Function to parse the generated text and extract the triplets
|
38 |
def extract_triplets(text):
|
39 |
triplets = []
|
40 |
+
relation, subject, relation, object_ = '', '', '', ''
|
41 |
+
text = text.strip()
|
42 |
+
current = 'x'
|
43 |
+
for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
|
44 |
if token == "<triplet>":
|
45 |
current = 't'
|
46 |
if relation != '':
|
47 |
+
triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
|
48 |
relation = ''
|
49 |
subject = ''
|
50 |
elif token == "<subj>":
|
51 |
current = 's'
|
52 |
if relation != '':
|
53 |
+
triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
|
54 |
object_ = ''
|
55 |
elif token == "<obj>":
|
56 |
current = 'o'
|
|
|
62 |
object_ += ' ' + token
|
63 |
elif current == 'o':
|
64 |
relation += ' ' + token
|
65 |
+
if subject != '' and relation != '' and object_ != '':
|
66 |
+
triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
|
67 |
return triplets
|
68 |
+
extracted_triplets = extract_triplets(extracted_text[0])
|
69 |
print(extracted_triplets)
|
70 |
```
|
71 |
|
72 |
## Model and Tokenizer using transformers
|
73 |
|
74 |
```python3
|
|
|
75 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
76 |
|
77 |
def extract_triplets(text):
|
78 |
triplets = []
|
79 |
+
relation, subject, relation, object_ = '', '', '', ''
|
80 |
+
text = text.strip()
|
81 |
+
current = 'x'
|
82 |
+
for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
|
83 |
if token == "<triplet>":
|
84 |
current = 't'
|
85 |
if relation != '':
|
86 |
+
triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
|
87 |
relation = ''
|
88 |
subject = ''
|
89 |
elif token == "<subj>":
|
90 |
current = 's'
|
91 |
if relation != '':
|
92 |
+
triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
|
93 |
object_ = ''
|
94 |
elif token == "<obj>":
|
95 |
current = 'o'
|
|
|
101 |
object_ += ' ' + token
|
102 |
elif current == 'o':
|
103 |
relation += ' ' + token
|
104 |
+
if subject != '' and relation != '' and object_ != '':
|
105 |
+
triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
|
106 |
return triplets
|
107 |
|
108 |
# Load model and tokenizer
|