updated model
Browse files- .gitignore +5 -1
- demo.py +1 -1
- pipeline/__pycache__/custom_functions.cpython-39.pyc +0 -0
- pipeline/__pycache__/post_processors.cpython-39.pyc +0 -0
- pipeline/post_processors.py +134 -3
- resources/__pycache__/colors.cpython-39.pyc +0 -0
- resources/__pycache__/template_list.cpython-39.pyc +0 -0
- resources/__pycache__/text_list.cpython-39.pyc +0 -0
- resources/colors.py +13 -0
- resources/template_list.py +48 -0
- resources/text_list.py +0 -0
- utils/__pycache__/util.cpython-39.pyc +0 -0
- utils/util.py +1 -0
.gitignore
CHANGED
@@ -1,2 +1,6 @@
|
|
1 |
test_run.py
|
2 |
-
.DS_Store
|
|
|
|
|
|
|
|
|
|
1 |
test_run.py
|
2 |
+
.DS_Store
|
3 |
+
analyzer.py
|
4 |
+
main.py
|
5 |
+
results/*
|
6 |
+
inputtexts/*
|
demo.py
CHANGED
@@ -32,7 +32,7 @@ st.set_page_config(page_title="ENGAGEMENT analyzer (beta ver 0.3)",
|
|
32 |
@st.cache(allow_output_mutation=True)
|
33 |
def load_model():
|
34 |
# nlp = spacy.load("en_engagement_RoBERTa_context_flz")
|
35 |
-
nlp = spacy.load("
|
36 |
return (nlp)
|
37 |
|
38 |
|
|
|
32 |
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the ``en_engagement_LSTM`` spaCy pipeline and return it.

    The function is wrapped in ``st.cache`` so the loaded pipeline is reused
    instead of being rebuilt on every call; ``allow_output_mutation=True`` is
    passed because the returned object is not treated as immutable.
    """
    # Previously used pipeline, kept for reference:
    # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
    pipeline = spacy.load("en_engagement_LSTM")
    return pipeline
|
37 |
|
38 |
|
pipeline/__pycache__/custom_functions.cpython-39.pyc
ADDED
Binary file (3.61 kB). View file
|
|
pipeline/__pycache__/post_processors.cpython-39.pyc
CHANGED
Binary files a/pipeline/__pycache__/post_processors.cpython-39.pyc and b/pipeline/__pycache__/post_processors.cpython-39.pyc differ
|
|
pipeline/post_processors.py
CHANGED
@@ -19,12 +19,138 @@ def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
|
19 |
return data, columns
|
20 |
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
23 |
spans_key: str = "sc",
|
24 |
attrs: List[str] = SPAN_ATTRS):
|
25 |
-
columns = attrs + ["Conf. score", 'span dep',
|
26 |
-
"POS", "POS sequence", "head"]
|
27 |
data = []
|
|
|
|
|
28 |
|
29 |
for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):
|
30 |
|
@@ -32,11 +158,16 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
|
32 |
span_info.extend([str(getattr(span, attr)) for attr in attrs])
|
33 |
|
34 |
span_info.append(score)
|
|
|
|
|
35 |
span_info.append(span.root.dep_)
|
|
|
36 |
span_info.append(span.root.tag_)
|
|
|
37 |
span_info.append("_".join([t.tag_ for t in span]))
|
38 |
span_info.append(span.root.head.norm_)
|
39 |
-
|
|
|
40 |
data.append(span_info)
|
41 |
|
42 |
return data, columns
|
|
|
19 |
return data, columns
|
20 |
|
21 |
|
22 |
+
# def span_info_aggregator()
|
23 |
+
|
24 |
+
# Morphological feature string tested for "-ing" participles; it drives the
# "Gerund" and "Non-finite adv clause" labels below.
_PRESENT_PARTICIPLE = "Aspect=Prog|Tense=Pres|VerbForm=Part"


def _climb_nesting(token):
    """Follow ``head`` links past a run of 'conj' tokens, then a run of 'poss' tokens."""
    while token.dep_ == "conj":
        token = token.head
    while token.dep_ == "poss":
        token = token.head
    return token


def construction_classifier(doc, span):
    """Return a label for the grammatical construction realised by ``span``.

    The label starts out as the dependency relation of the span's root token
    and is then overwritten by a sequence of rules.  The rules are applied in
    a fixed order and a later rule wins over an earlier one, so the order of
    the sections below must not be changed.

    Args:
        doc: Unused; kept so existing callers that pass the document keep
            working.
        span: The span to classify.  It must be iterable over tokens and
            expose ``root`` and ``label_``; tokens must expose ``norm_``,
            ``dep_``, ``tag_``, ``pos_``, ``morph``, ``head`` and
            ``children``.

    Returns:
        str: A construction name such as ``"Subject"`` or ``"Main verb"``,
        or the raw dependency label of ``span.root`` when no rule applies.

    NOTE(review): the reviewed source had lost its indentation; the nesting
    of the ``else`` branches was reconstructed from statement order and
    should be confirmed against the repository.
    """
    own_root = span.root
    category = own_root.dep_

    # Per-token views of the span.
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]
    span_token = [t.norm_ for t in span]
    span_t_dep_ = [f"{norm}_{dep}" for norm, dep in zip(span_token, span_dep)]

    # Children of the span's *own* root, collected before the climb below.
    children = list(own_root.children)
    c_dep = [c.dep_ for c in children]
    c_pos = [c.pos_ for c in children]

    # Coordinated and possessive roots are classified by the token they
    # ultimately attach to.
    spanroot = _climb_nesting(own_root)
    dep = spanroot.dep_

    # --- Rules checked before the modal test (mutually exclusive on dep) ---
    if dep == "pcomp" and str(spanroot.morph) == _PRESENT_PARTICIPLE:
        category = "Gerund"
    elif dep in ("pobj", "dobj", "obj", "iobj"):
        category = "Object"
    elif dep in ("nsubj", "nsubjpass"):
        category = "Subject"
    elif dep == "cc":
        category = "Coordinating conjunction"
    elif dep in ("ROOT", "advcl"):
        has_it_subject = "it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_
        passive_frame = "ccomp" in c_dep and "auxpass" in c_dep
        predicative_frame = "nsubj" in c_dep and ("acomp" in c_dep or "oprd" in c_dep)
        if has_it_subject and (passive_frame or predicative_frame):
            category = "It is X that-clause"
        elif "nsubj" in c_dep and "it" in span_token and spanroot.pos_ == "VERB":
            category = "It VERB that-clause"
        elif "expl" in c_dep and "NOUN" in c_pos:
            category = "There is/are NOUN"
        elif spanroot.pos_ in ("AUX", "VERB"):
            category = "Main verb"
        else:
            category = dep
    elif dep == "attr":
        head_child_deps = [c.dep_ for c in spanroot.head.children]
        if "expl" in head_child_deps and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"

    # --- Modal verbs: depends on the tag, so it can override the rules above ---
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    # --- Rules checked after the modal test (mutually exclusive on dep) ---
    if dep == "prep":
        category = "Prepositional Phrase"
    elif dep == "advmod":
        category = "Adverbial modifier"
    elif dep == "acomp":
        category = "Adjectival complement"
    elif dep == "neg":
        category = "Negative particle"
    elif dep == "preconj":
        category = "Conjunction"
    elif dep in ("advcl", "mark", "acl"):
        # 'advcl' was already handled once above; these two checks may
        # overwrite that result, and the participle check wins over 'mark'.
        if "mark" in span_dep:
            category = "Finite adverbial clause"
        if str(spanroot.morph) == _PRESENT_PARTICIPLE and "aux" not in c_dep:
            category = "Non-finite adv clause"
    elif dep in ("relcl", "ccomp"):
        if ";" in [t.norm_ for t in spanroot.head.children]:
            category = "Main verb"
        elif "nsubj" not in span_dep:
            category = "Dependent verb"
    elif dep == "dep":
        head = spanroot.head
        verbal = ("AUX", "VERB")
        if head.dep_ in ("ROOT", "ccomp") and head.pos_ in verbal and spanroot.pos_ in verbal:
            if spanroot.morph == head.morph:
                category = "Main verb"
            else:
                category = "Dependent verb"

    # --- Citations: applied last, only when the span contains a proper noun ---
    if span.label_ == "CITATION" and ("NNP" in span_tag or "NNPS" in span_tag):
        if span_dep[0] == "punct" and span_dep[-1] == "punct":
            category = "Parenthetical Citation"
        elif span_tag[0] in ("NNP", "NNPS"):
            category = "Narrative Citation"
        else:
            category = "Other Citation"

    return category
|
144 |
+
|
145 |
+
|
146 |
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
|
147 |
spans_key: str = "sc",
|
148 |
attrs: List[str] = SPAN_ATTRS):
|
149 |
+
columns = attrs + ["Conf. score", "sent no.", "grammatical realization", 'span dep', "ner",
|
150 |
+
"POS", 'span dep seq', "POS sequence", "head", "children", "morphology", ]
|
151 |
data = []
|
152 |
+
# data = span_info_aggregator(doc, columns)
|
153 |
+
sentences = {s: i for i, s in enumerate(doc.sents)}
|
154 |
|
155 |
for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):
|
156 |
|
|
|
158 |
span_info.extend([str(getattr(span, attr)) for attr in attrs])
|
159 |
|
160 |
span_info.append(score)
|
161 |
+
span_info.append(sentences[span.sent])
|
162 |
+
span_info.append(construction_classifier(doc, span))
|
163 |
span_info.append(span.root.dep_)
|
164 |
+
span_info.append(span.root.ent_type_)
|
165 |
span_info.append(span.root.tag_)
|
166 |
+
span_info.append("_".join([t.dep_ for t in span]))
|
167 |
span_info.append("_".join([t.tag_ for t in span]))
|
168 |
span_info.append(span.root.head.norm_)
|
169 |
+
span_info.append("_".join([c.dep_ for c in span.root.children]))
|
170 |
+
span_info.append(span.root.morph)
|
171 |
data.append(span_info)
|
172 |
|
173 |
return data, columns
|
resources/__pycache__/colors.cpython-39.pyc
ADDED
Binary file (442 Bytes). View file
|
|
resources/__pycache__/template_list.cpython-39.pyc
ADDED
Binary file (2.35 kB). View file
|
|
resources/__pycache__/text_list.cpython-39.pyc
ADDED
Binary file (121 kB). View file
|
|
resources/colors.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Hex colour codes keyed by label name (presumably the span labels produced
# by the pipeline -- confirm against the visualiser that consumes this map).
# Some labels deliberately share a colour: ATTRIBUTE/ATTRIBUTION and
# PRONOUNCE/PROCLAIM map to the same value.
COLORS_1 = dict(
    ENTERTAIN="#82b74b",
    DENY="#c94c4c",
    COUNTER="#eea29a",
    PRONOUNCE="#92a8d1",
    ENDORSE="#034f84",
    CITATION="#b2b2b2",
    MONOGLOSS="#3e4444",
    ATTRIBUTE="#f7786b",
    ATTRIBUTION="#f7786b",
    PROCLAIM="#92a8d1",
)
|
13 |
+
|
resources/template_list.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TPL_ENT = """
|
2 |
+
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
3 |
+
{text}
|
4 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}</span>
|
5 |
+
</mark>
|
6 |
+
"""
|
7 |
+
|
8 |
+
TPL_SPANS = """
|
9 |
+
<div class="spans" style="line-height: 4.5;">
|
10 |
+
{text}
|
11 |
+
{span_slices}
|
12 |
+
{span_starts}
|
13 |
+
</div>
|
14 |
+
"""
|
15 |
+
|
16 |
+
TPL_SPAN = """
|
17 |
+
<span style="font-weight: bold; display: inline-block; line-height: 3; padding-bottom: 12px;position: relative;">
|
18 |
+
{text}
|
19 |
+
{span_slices}
|
20 |
+
{span_starts}
|
21 |
+
</span>
|
22 |
+
"""
|
23 |
+
|
24 |
+
TPL_SPAN_SLICE = """
|
25 |
+
<span style="background: {bg}; top: {top_offset}px; display: inline-block; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
26 |
+
</span>
|
27 |
+
"""
|
28 |
+
|
29 |
+
TPL_SPAN_START = """
|
30 |
+
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
31 |
+
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
32 |
+
|
33 |
+
{label}{kb_link}
|
34 |
+
</span>
|
35 |
+
</span>
|
36 |
+
|
37 |
+
"""
|
38 |
+
|
39 |
+
TPL_SPAN_START_RTL = """
|
40 |
+
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
41 |
+
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
42 |
+
{label}{kb_link}
|
43 |
+
</span>
|
44 |
+
</span>
|
45 |
+
"""
|
46 |
+
|
47 |
+
DEFAULT_TEXT = """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia."""
|
48 |
+
|
resources/text_list.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
utils/__pycache__/util.cpython-39.pyc
CHANGED
Binary files a/utils/__pycache__/util.cpython-39.pyc and b/utils/__pycache__/util.cpython-39.pyc differ
|
|
utils/util.py
CHANGED
@@ -8,6 +8,7 @@ def preprocess(text):
|
|
8 |
text = re.sub('\n', ' ', text)
|
9 |
text = re.sub('\s+', " ", text)
|
10 |
text = re.sub('&&&&&&&&#&#&#&#&', '\n\n', text)
|
|
|
11 |
return text
|
12 |
|
13 |
|
|
|
8 |
text = re.sub('\n', ' ', text)
|
9 |
text = re.sub('\s+', " ", text)
|
10 |
text = re.sub('&&&&&&&&#&#&#&#&', '\n\n', text)
|
11 |
+
text = re.sub("--- Para SEP ---", '\n', text)
|
12 |
return text
|
13 |
|
14 |
|