new UI

Files changed:
- demo.py +30 -73
- pipeline/__pycache__/post_processors.cpython-39.pyc +0 -0
- pipeline/custom_functions.py +190 -0
- pipeline/post_processors.py +51 -0
- utils/__pycache__/util.cpython-39.pyc +0 -0
- utils/__pycache__/visualize.cpython-39.pyc +0 -0
- utils/util.py +63 -0
- utils/visualize.py +128 -0
demo.py
CHANGED

@@ -4,10 +4,13 @@ from collections import Counter
 
 import spacy
 from spacy.tokens import Doc
-from spacy_streamlit import visualize_spans
+# from spacy_streamlit import visualize_spans
 
 import streamlit as st
 
+from utils.util import delete_overlapping_span
+from utils.visualize import visualize_spans
+
 # nlp = spacy.load(
 #     "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
 # )

@@ -27,13 +30,13 @@ st.set_page_config(page_title="ENGAGEMENT analyzer (beta ver 0.2)",
 
 
 @st.cache(allow_output_mutation=True)
-def load_model(
+def load_model():
     # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
     nlp = spacy.load("en_engagement_spl_RoBERTa_acad")
     return (nlp)
 
 
-nlp = load_model(
+nlp = load_model()
 
 doc = nlp(
     'Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here.'

 [the two removed lines in the hunk above appear truncated in the rendered diff view]

@@ -140,54 +143,6 @@ def delete_span(span_sc: dict):
         del span_sc[idx]
 
 
-def delete_overlapping_span(span_sc: dict):
-    start_token_list = [spn.start for spn in span_sc]
-    dict_ = Counter(start_token_list)
-    overlap = {k: v for k, v in dict_.items() if v > 1}
-
-    id_del = []
-    id_comp = {}
-
-    info = {}
-    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
-                                     start=1):
-        res = {
-            'score': score,
-            'spn': spn,
-            'label': spn.label_,
-            'start': spn.start,
-            'compare': spn.start in overlap,
-            "sents": len(list(spn.sents))
-        }
-        # print(res)
-        info[n] = res
-
-        if res['compare']:
-            if spn.start not in id_comp:
-                id_comp[spn.start] = n
-            else:
-                update = res['score'] > info[id_comp[spn.start]]['score']
-                if update:
-                    id_del.append(id_comp[spn.start])
-                    id_comp[spn.start] = n
-                else:
-                    id_del.append(n)
-                print(update)
-
-        # delete span beyond sentences
-        if len(list(spn.sents)) > 1:
-            id_del.append(n)
-
-    # print(id_comp)
-
-    for n, idx in enumerate(id_del):
-        # print(idx)
-        try:
-            del span_sc[idx - n]
-        except IndexError:
-            continue
-
-
 # st.markdown('''
 # <style>
 # .sidebar .sidebar-content {{

@@ -308,28 +263,30 @@ with st.form("my_form"):
 
     delete_overlapping_span(doc.spans['sc'])
 
-    visualize_spans(
     [the remaining 21 removed lines of the old visualize_spans(...) call are not rendered in the diff view]
+    visualize_spans(
+        doc,
+        spans_key="sc",
+        displacy_options={
+            'template': {
+                "span": TPL_SPAN,
+                'slice': TPL_SPAN_SLICE,
+                'start': TPL_SPAN_START,
+            },
+            "colors": {
+                "ENTERTAIN": "#73C6B6",
+                "DENY": '#CD6155',
+                "COUNTER": "#D35400",
+                "PRONOUNCE": "#2ECC71",
+                "ENDORSE": "#A569BD",
+                "CONCUR": "#F39C12",
+                "CITATION": "#F8C471",
+                "SOURCES": "#F7DC6F",
+                "MONOGLOSS": "#85929E",
+                "ATTRIBUTE": "#85C1E9",
+                "JUSTIFYING": "#2ECC71",
+            },
+        },
+    )
 
     st.subheader("Bibliography")
     st.markdown("""
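Note: the new visualize_spans(...) call references TPL_SPAN, TPL_SPAN_SLICE and TPL_SPAN_START, which are not defined in the hunks shown; they are presumably imported or defined elsewhere in demo.py. As a point of reference only (an assumption, not part of this commit), spaCy 3.3+ ships default span templates under these exact names, so a minimal way to obtain working values would be:

# Not part of this commit: one plausible source for the span templates used above.
from spacy.displacy.templates import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START

print(TPL_SPAN[:60])  # the templates are plain HTML format strings filled in by displaCy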
pipeline/__pycache__/post_processors.cpython-39.pyc
ADDED
Binary file (2.15 kB)
pipeline/custom_functions.py
ADDED

@@ -0,0 +1,190 @@
from functools import partial
from pathlib import Path
from typing import Iterable, Callable
import spacy
from spacy.training import Example
from spacy.tokens import DocBin, Doc

# make the factory work
# from scripts.rel_pipe import make_relation_extractor

# make the config work
# from scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors
# from scripts.custom_comps.SpanCat_extention import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3, build_mean_max_reducer4

from typing import List, Tuple, cast
from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init, PyTorchLSTM
from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
from thinc.types import Ragged, Floats2d

from spacy.util import registry
from spacy.tokens import Doc
from spacy.ml.extract_spans import extract_spans

# @registry.layers("spacy.LinearLogistic.v1")
# def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
#     """An output layer for multi-label classification. It uses a linear layer
#     followed by a logistic activation.
#     """
#     return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())


@registry.layers("mean_max_reducer.v1.5")
def build_mean_max_reducer1(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ),
        Maxout(nO=hidden_size, normalize=True, dropout=dropout),
    )


@registry.layers("mean_max_reducer.v2")
def build_mean_max_reducer2(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ), Maxout(nO=hidden_size, normalize=True, dropout=dropout),
        Maxout(nO=hidden_size, normalize=True, dropout=dropout))


# @registry.layers("mean_max_reducer.v2")
# def build_mean_max_reducer2(hidden_size: int,
#                             depth: int) -> Model[Ragged, Floats2d]:
#     """Reduce sequences by concatenating their mean and max pooled vectors,
#     and then combine the concatenated vectors with a hidden layer.
#     """
#     return chain(
#         concatenate(
#             cast(Model[Ragged, Floats2d], reduce_last()),
#             cast(Model[Ragged, Floats2d], reduce_first()),
#             reduce_mean(),
#             reduce_max(),
#         ), Maxout(nO=hidden_size, normalize=True, dropout=0.0),
#         PyTorchLSTM(nO=64, nI=hidden_size, bi=True, depth=depth, dropout=0.2))


@registry.layers("mean_max_reducer.v3")
def build_mean_max_reducer3(hidden_size: int,
                            maxout_pieces: int = 3,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    hidden_size2 = int(hidden_size / 2)
    hidden_size3 = int(hidden_size / 2)
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ),
        Maxout(nO=hidden_size,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
        Maxout(nO=hidden_size2,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
        Maxout(nO=hidden_size3,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout))


@registry.layers("mean_max_reducer.v3.3")
def build_mean_max_reducer4(hidden_size: int,
                            depth: int) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    hidden_size2 = int(hidden_size / 2)
    hidden_size3 = int(hidden_size / 2)
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ), Maxout(nO=hidden_size, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=hidden_size2, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=hidden_size3, nP=3, normalize=True, dropout=0.0))


@registry.architectures("CustomSpanCategorizer.v2")
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Build a span categorizer model, given a token-to-vector model, a
    reducer model to map the sequence of vectors for each span down to a single
    vector, and a scorer model to map the vectors to probabilities.
    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    """
    model = chain(
        cast(
            Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
            with_getitem(
                0,
                chain(tok2vec,
                      cast(Model[List[Floats2d], Ragged], list2ragged()))),
        ),
        extract_spans(),
        reducer,
        scorer,
    )
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("reducer", reducer)
    model.set_ref("scorer", scorer)
    return model


# @registry.architectures("spacy.SpanCategorizer.v1")
# def build_spancat_model(
#     tok2vec: Model[List[Doc], List[Floats2d]],
#     reducer: Model[Ragged, Floats2d],
#     scorer: Model[Floats2d, Floats2d],
# ) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
#     """Build a span categorizer model, given a token-to-vector model, a
#     reducer model to map the sequence of vectors for each span down to a single
#     vector, and a scorer model to map the vectors to probabilities.
#     tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
#     reducer (Model[Ragged, Floats2d]): The reducer model.
#     scorer (Model[Floats2d, Floats2d]): The scorer model.
#     """
#     model = chain(
#         cast(
#             Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
#             with_getitem(
#                 0,
#                 chain(tok2vec,
#                       cast(Model[List[Floats2d], Ragged], list2ragged()))),
#         ),
#         extract_spans(),
#         reducer,
#         scorer,
#     )
#     model.set_ref("tok2vec", tok2vec)
#     model.set_ref("reducer", reducer)
#     model.set_ref("scorer", scorer)
#     return model
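Note: importing this module is what makes the registered names available to a spaCy config (e.g. a spancat reducer block pointing at "mean_max_reducer.v1.5"). A minimal sketch, not part of the commit, of how one of the registered layers could be resolved and built directly; hidden_size and dropout values here are illustrative assumptions:

# Sketch only: resolve the custom reducer registered above and build the Thinc layer.
import pipeline.custom_functions  # noqa: F401  (importing the module runs the @registry decorators)
from spacy.util import registry

make_reducer = registry.layers.get("mean_max_reducer.v1.5")
reducer = make_reducer(hidden_size=128, dropout=0.1)  # illustrative sizes
print(reducer.name)  # a concatenate(...) >> Maxout chain over last/first/mean/max pooling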
pipeline/post_processors.py
ADDED

@@ -0,0 +1,51 @@

from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language

SPAN_ATTRS = ["text", "label_", "start", "end"]


def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                 spans_key: str = "sc",
                 attrs: List[str] = SPAN_ATTRS):
    columns = attrs + ["Conf. score"]
    data = [
        [str(getattr(span, attr))
         for attr in attrs] + [score]  # [f'{score:.5f}']
        for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
    ]
    return data, columns


def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                spans_key: str = "sc",
                attrs: List[str] = SPAN_ATTRS):
    columns = attrs + ["Conf. score", 'span dep',
                       "POS", "POS sequence", "head"]
    data = []

    for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):

        span_info = []
        span_info.extend([str(getattr(span, attr)) for attr in attrs])

        span_info.append(score)
        span_info.append(span.root.dep_)
        span_info.append(span.root.tag_)
        span_info.append("_".join([t.tag_ for t in span]))
        span_info.append(span.root.head.norm_)
        # span_info.append(span.root.head.dep_ == "ROOT")
        data.append(span_info)

    return data, columns


def ngrammar(seq: list, n=2):
    result = []
    n_item = len(seq)
    for idx, item in enumerate(seq):
        if idx + n <= n_item:
            result.append(seq[idx: idx + n])
    return result
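A short usage sketch for these helpers (not part of the commit; it assumes the en_engagement_spl_RoBERTa_acad package from demo.py is installed and stores per-span scores under doc.spans["sc"].attrs["scores"]):

# Sketch only: turn span predictions into a DataFrame and inspect label bigrams.
import pandas as pd
import spacy
from pipeline.post_processors import const_table, ngrammar

nlp = spacy.load("en_engagement_spl_RoBERTa_acad")  # assumed to be installed
doc = nlp("This could arguably be read as a hedged claim.")

data, columns = const_table(doc, spans_key="sc")
print(pd.DataFrame(data, columns=columns))

# bigrams over the predicted span labels
print(ngrammar([span.label_ for span in doc.spans["sc"]], n=2))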
utils/__pycache__/util.cpython-39.pyc
ADDED
Binary file (1.68 kB)

utils/__pycache__/visualize.cpython-39.pyc
ADDED
Binary file (3.45 kB)
utils/util.py
ADDED

@@ -0,0 +1,63 @@
import re
from collections import Counter


def preprocess(text):
    text = re.sub("\n\n", ' &&&&&&&&#&#&#&#&', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\s+', " ", text)
    text = re.sub('&&&&&&&&#&#&#&#&', '\n\n', text)
    return text


def delete_overlapping_span(span_sc: dict):
    # print(span_sc)
    start_token_list = [spn.start for spn in span_sc]
    dict_ = Counter(start_token_list)
    overlap = {k: v for k, v in dict_.items() if v > 1}

    id_del = []
    id_comp = {}

    info = {}
    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
                                     start=1):
        res = {
            'score': score,
            'spn': spn,
            'label': spn.label_,
            'start': spn.start,
            'end': spn.end,
            'compare': spn.start in overlap,
            "sents": len(list(spn.sents))
        }
        # print(res)
        info[n] = res

        if res['compare']:
            if spn.start not in id_comp:
                id_comp[spn.start] = n
            else:
                same_lbl = res['label'] == info[id_comp[spn.start]]['label']
                update = res['score'] > info[id_comp[spn.start]]['score']
                if update and same_lbl:
                    print(res['label'], info[id_comp[spn.start]]['label'])
                    print(same_lbl)
                    id_del.append(id_comp[spn.start])
                    id_comp[spn.start] = n
                else:
                    id_del.append(n)
                # print(update)

        # delete span beyond sentences
        if len(list(spn.sents)) > 1:
            id_del.append(n)

    # print(id_comp)

    for n, idx in enumerate(id_del):
        # print(idx)
        try:
            del span_sc[idx - n]
        except IndexError:
            continue
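A sketch of the intended call order for these utilities (not part of the commit; the model name is taken from demo.py, and whether overlapping spans actually occur depends on the spancat suggester used by that pipeline):

# Sketch only: normalize whitespace, run the pipeline, then prune overlapping spans in place.
import spacy
from utils.util import preprocess, delete_overlapping_span

nlp = spacy.load("en_engagement_spl_RoBERTa_acad")  # assumed to be installed
text = preprocess("A first paragraph.\n\nA second\nparagraph with stray line breaks.")
doc = nlp(text)

# When two spans share a start token, the lower-scoring one (with the same label)
# is dropped; spans crossing a sentence boundary are dropped as well.
delete_overlapping_span(doc.spans["sc"])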
utils/visualize.py
ADDED

@@ -0,0 +1,128 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#
# This code is adapted from spacy-streamlit package by explosion
# https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
#

from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import streamlit as st
import spacy
from spacy.language import Language
from spacy import displacy
import pandas as pd

import streamlit as st
from spacy_streamlit import visualize_spans
from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO

from pipeline.post_processors import simple_table, const_table, ngrammar
# from skbio import diversity as dv

SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))

# fmt: off
# SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]


def visualize_spans(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    *,
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
    show_table: bool = True,
    title: Optional[str] = "Spans",
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
    simple: bool = True,
):
    """
    Visualizer for spans.
    doc (Doc, Dict): The document to visualize.
    spans_key (str): Which spans key to render spans from. Default is "sc".
    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
    argument is True.
    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
    title (str): The title displayed at the top of the Spans visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
    See https://spacy.io/api/top-level#displacy_options-span
    """
    if SPACY_VERSION < (3, 3, 0):
        raise ValueError(
            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
        )
    if not displacy_options:
        displacy_options = dict()
    displacy_options["spans_key"] = spans_key

    if title:
        st.header(title)

    if manual:
        if show_table:
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
            )
        if not isinstance(doc, dict):
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
            )
    html = displacy.render(
        doc,
        style="span",
        options=displacy_options,
        manual=manual,
    )
    st.write(f"{get_html(html)}", unsafe_allow_html=True)

    if show_table:
        # data = [
        #     [str(getattr(span, attr)) for attr in attrs] + [str(score)]
        #     for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
        # ]
        if simple:
            data, cols = simple_table(doc, spans_key='sc', attrs=attrs)
        else:
            data, cols = const_table(doc, spans_key='sc', attrs=attrs)

        seq = [s for s in doc.spans[spans_key]]

        span_ngrams = ngrammar(seq=seq, n=3)
        # st.code(span_ngrams)

        if data:
            df = pd.DataFrame(data, columns=cols)

            st.subheader("Span information")
            st.dataframe(
                df.style.highlight_between(subset='Conf. score', right=.7))

            st.subheader("Label counts & Diagnostic confidence score summary")
            counts = df['label_'].value_counts()
            label_counts = df.groupby('label_').agg({
                "label_":
                'count',
                "Conf. score": ['median', 'min', 'max']
            }).round(4)

            st.dataframe(label_counts)

            # st.subheader("Engagement label by grammatical function")
            # label_dep = pd.crosstab(df['span dep'], df['label_'])
            # st.dataframe(label_dep)

            # st.subheader('Quantitative results')
            # st.markdown(
            #     f"Shannon's index: {dv.alpha.shannon(counts, base=2): .3f}")
            # st.markdown(
            #     f"Simpson's e index: {dv.alpha.simpson_e(counts): .3f}")
            # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
            # print(dv.get_alpha_diversity_metrics())