egumasa committed on
Commit
a937724
·
1 Parent(s): 65d7587
demo.py CHANGED
@@ -4,10 +4,13 @@ from collections import Counter
4
 
5
  import spacy
6
  from spacy.tokens import Doc
7
- from spacy_streamlit import visualize_spans
8
 
9
  import streamlit as st
10
 
 
 
 
11
  # nlp = spacy.load(
12
  # "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
13
  # )
@@ -27,13 +30,13 @@ st.set_page_config(page_title="ENGAGEMENT analyzer (beta ver 0.2)",
27
 
28
 
29
  @st.cache(allow_output_mutation=True)
30
- def load_model(spacy_model):
31
  # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
32
  nlp = spacy.load("en_engagement_spl_RoBERTa_acad")
33
  return (nlp)
34
 
35
 
36
- nlp = load_model("en_engagement_RoBERTa_context_flz")
37
 
38
  doc = nlp(
39
  'Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here.'
@@ -140,54 +143,6 @@ def delete_span(span_sc: dict):
140
  del span_sc[idx]
141
 
142
 
143
- def delete_overlapping_span(span_sc: dict):
144
- start_token_list = [spn.start for spn in span_sc]
145
- dict_ = Counter(start_token_list)
146
- overlap = {k: v for k, v in dict_.items() if v > 1}
147
-
148
- id_del = []
149
- id_comp = {}
150
-
151
- info = {}
152
- for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
153
- start=1):
154
- res = {
155
- 'score': score,
156
- 'spn': spn,
157
- 'label': spn.label_,
158
- 'start': spn.start,
159
- 'compare': spn.start in overlap,
160
- "sents": len(list(spn.sents))
161
- }
162
- # print(res)
163
- info[n] = res
164
-
165
- if res['compare']:
166
- if spn.start not in id_comp:
167
- id_comp[spn.start] = n
168
- else:
169
- update = res['score'] > info[id_comp[spn.start]]['score']
170
- if update:
171
- id_del.append(id_comp[spn.start])
172
- id_comp[spn.start] = n
173
- else:
174
- id_del.append(n)
175
- print(update)
176
-
177
- # delete span beyond sentences
178
- if len(list(spn.sents)) > 1:
179
- id_del.append(n)
180
-
181
- # print(id_comp)
182
-
183
- for n, idx in enumerate(id_del):
184
- # print(idx)
185
- try:
186
- del span_sc[idx - n]
187
- except IndexError:
188
- continue
189
-
190
-
191
  # st.markdown('''
192
  # <style>
193
  # .sidebar .sidebar-content {{
@@ -308,28 +263,30 @@ with st.form("my_form"):
308
 
309
  delete_overlapping_span(doc.spans['sc'])
310
 
311
- visualize_spans(doc,
312
- spans_key="sc",
313
- displacy_options={
314
- 'template': {
315
- "span": TPL_SPAN,
316
- 'slice': TPL_SPAN_SLICE,
317
- 'start': TPL_SPAN_START,
318
- },
319
- "colors": {
320
- "ENTERTAIN": "#73C6B6",
321
- "DENY": '#CD6155',
322
- "COUNTER": "#D35400",
323
- "PRONOUNCE": "#2ECC71",
324
- "ENDORSE": "#A569BD",
325
- "CONCUR": "#F39C12",
326
- "CITATION": "#F8C471",
327
- "SOURCES": "#F7DC6F",
328
- "MONOGLOSS": "#85929E",
329
- "ATTRIBUTE": "#85C1E9",
330
- "JUSTIFYING": "#2ECC71",
331
- },
332
- })
 
 
333
 
334
  st.subheader("Bibliography")
335
  st.markdown("""
 
4
 
5
  import spacy
6
  from spacy.tokens import Doc
7
+ # from spacy_streamlit import visualize_spans
8
 
9
  import streamlit as st
10
 
11
+ from utils.util import delete_overlapping_span
12
+ from utils.visualize import visualize_spans
13
+
14
  # nlp = spacy.load(
15
  # "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
16
  # )
 
30
 
31
 
32
  @st.cache(allow_output_mutation=True)
33
+ def load_model():
34
  # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
35
  nlp = spacy.load("en_engagement_spl_RoBERTa_acad")
36
  return (nlp)
37
 
38
 
39
+ nlp = load_model()
40
 
41
  doc = nlp(
42
  'Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here.'
 
143
  del span_sc[idx]
144
 
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  # st.markdown('''
147
  # <style>
148
  # .sidebar .sidebar-content {{
 
263
 
264
  delete_overlapping_span(doc.spans['sc'])
265
 
266
+ visualize_spans(
267
+ doc,
268
+ spans_key="sc",
269
+ displacy_options={
270
+ 'template': {
271
+ "span": TPL_SPAN,
272
+ 'slice': TPL_SPAN_SLICE,
273
+ 'start': TPL_SPAN_START,
274
+ },
275
+ "colors": {
276
+ "ENTERTAIN": "#73C6B6",
277
+ "DENY": '#CD6155',
278
+ "COUNTER": "#D35400",
279
+ "PRONOUNCE": "#2ECC71",
280
+ "ENDORSE": "#A569BD",
281
+ "CONCUR": "#F39C12",
282
+ "CITATION": "#F8C471",
283
+ "SOURCES": "#F7DC6F",
284
+ "MONOGLOSS": "#85929E",
285
+ "ATTRIBUTE": "#85C1E9",
286
+ "JUSTIFYING": "#2ECC71",
287
+ },
288
+ },
289
+ )
290
 
291
  st.subheader("Bibliography")
292
  st.markdown("""
pipeline/__pycache__/post_processors.cpython-39.pyc ADDED
Binary file (2.15 kB). View file
 
pipeline/custom_functions.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from pathlib import Path
3
+ from typing import Iterable, Callable
4
+ import spacy
5
+ from spacy.training import Example
6
+ from spacy.tokens import DocBin, Doc
7
+
8
+ # make the factory work
9
+ # from scripts.rel_pipe import make_relation_extractor
10
+
11
+ # make the config work
12
+ # from scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors
13
+ # from scripts.custom_comps.SpanCat_extention import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3, build_mean_max_reducer4
14
+
15
+ from typing import List, Tuple, cast
16
+ from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
17
+ from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init, PyTorchLSTM
18
+ from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
19
+ from thinc.types import Ragged, Floats2d
20
+
21
+ from spacy.util import registry
22
+ from spacy.tokens import Doc
23
+ from spacy.ml.extract_spans import extract_spans
24
+
25
+ # @registry.layers("spacy.LinearLogistic.v1")
26
+ # def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
27
+ # """An output layer for multi-label classification. It uses a linear layer
28
+ # followed by a logistic activation.
29
+ # """
30
+ # return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
31
+
32
+
33
@registry.layers("mean_max_reducer.v1.5")
def build_mean_max_reducer1(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce each sequence to one vector and mix it with one Maxout layer.

    Despite the registered name, four pooled views of every sequence are
    concatenated, in this order: last item, first item, mean and max. The
    concatenated vector is then passed through a single normalized Maxout
    layer.

    hidden_size (int): Output width (``nO``) of the Maxout layer.
    dropout (float): Dropout rate of the Maxout layer.
    RETURNS (Model[Ragged, Floats2d]): The reducer model.
    """
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    return chain(
        pooled,
        Maxout(nO=hidden_size, normalize=True, dropout=dropout),
    )
48
+
49
+
50
@registry.layers("mean_max_reducer.v2")
def build_mean_max_reducer2(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce each sequence to one vector and mix it with two Maxout layers.

    Despite the registered name, four pooled views of every sequence are
    concatenated, in this order: last item, first item, mean and max. The
    concatenated vector is then passed through two stacked, normalized
    Maxout layers of identical width.

    hidden_size (int): Output width (``nO``) of both Maxout layers.
    dropout (float): Dropout rate applied in both Maxout layers.
    RETURNS (Model[Ragged, Floats2d]): The reducer model.
    """
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    return chain(
        pooled,
        Maxout(nO=hidden_size, normalize=True, dropout=dropout),
        Maxout(nO=hidden_size, normalize=True, dropout=dropout),
    )
64
+
65
+
66
+ # @registry.layers("mean_max_reducer.v2")
67
+ # def build_mean_max_reducer2(hidden_size: int,
68
+ # depth: int) -> Model[Ragged, Floats2d]:
69
+ # """Reduce sequences by concatenating their mean and max pooled vectors,
70
+ # and then combine the concatenated vectors with a hidden layer.
71
+ # """
72
+ # return chain(
73
+ # concatenate(
74
+ # cast(Model[Ragged, Floats2d], reduce_last()),
75
+ # cast(Model[Ragged, Floats2d], reduce_first()),
76
+ # reduce_mean(),
77
+ # reduce_max(),
78
+ # ), Maxout(nO=hidden_size, normalize=True, dropout=0.0),
79
+ # PyTorchLSTM(nO=64, nI=hidden_size, bi=True, depth=depth, dropout=0.2))
80
+
81
+
82
@registry.layers("mean_max_reducer.v3")
def build_mean_max_reducer3(hidden_size: int,
                            maxout_pieces: int = 3,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce each sequence to one vector and mix it with three Maxout layers.

    Despite the registered name, four pooled views of every sequence are
    concatenated, in this order: last item, first item, mean and max. The
    concatenated vector is passed through three stacked, normalized Maxout
    layers: the first has width ``hidden_size``, the second and third have
    half of that.

    hidden_size (int): Output width (``nO``) of the first Maxout layer.
    maxout_pieces (int): Number of Maxout pieces (``nP``) in every layer.
    dropout (float): Dropout rate applied in every Maxout layer.
    RETURNS (Model[Ragged, Floats2d]): The reducer model.
    """
    # Integer division keeps the width an int without a float round-trip.
    # NOTE(review): the second and third layers both use half the hidden
    # size, exactly as before -- confirm the third was not meant to shrink
    # further.
    half_size = hidden_size // 2
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    return chain(
        pooled,
        Maxout(nO=hidden_size,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
        Maxout(nO=half_size,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
        Maxout(nO=half_size,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
    )
110
+
111
+
112
@registry.layers("mean_max_reducer.v3.3")
def build_mean_max_reducer4(hidden_size: int,
                            depth: int) -> Model[Ragged, Floats2d]:
    """Reduce each sequence to one vector and mix it with three Maxout layers.

    Despite the registered name, four pooled views of every sequence are
    concatenated, in this order: last item, first item, mean and max. The
    concatenated vector is passed through three stacked, normalized Maxout
    layers with 3 pieces each and no dropout: the first has width
    ``hidden_size``, the second and third have half of that.

    hidden_size (int): Output width (``nO``) of the first Maxout layer.
    depth (int): Accepted but not used by this function; the number of
        layers is fixed at three.
    RETURNS (Model[Ragged, Floats2d]): The reducer model.
    """
    # Integer division keeps the width an int without a float round-trip.
    half_size = hidden_size // 2
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    return chain(
        pooled,
        Maxout(nO=hidden_size, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=half_size, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=half_size, nP=3, normalize=True, dropout=0.0),
    )
129
+
130
+
131
@registry.architectures("CustomSpanCategorizer.v2")
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Assemble a span categorizer from a tok2vec, a reducer and a scorer.

    The input is a ``(docs, spans)`` tuple. The docs are embedded with
    ``tok2vec`` and packed into a ragged array, the span vectors are
    extracted, ``reducer`` maps each span's vectors to a single vector and
    ``scorer`` maps those vectors to scores.

    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    RETURNS (Model[Tuple[List[Doc], Ragged], Floats2d]): The chained model,
        with the three components registered as refs under their own names.
    """
    # Embed the docs and pack the per-doc arrays into one ragged array.
    embed_docs = chain(
        tok2vec,
        cast(Model[List[Floats2d], Ragged], list2ragged()),
    )
    # Only item 0 of the input tuple (the docs) goes through the embedding.
    embed_first_item = cast(
        Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
        with_getitem(0, embed_docs),
    )
    model = chain(embed_first_item, extract_spans(), reducer, scorer)

    components = (
        ("tok2vec", tok2vec),
        ("reducer", reducer),
        ("scorer", scorer),
    )
    for ref_name, layer in components:
        model.set_ref(ref_name, layer)
    return model
160
+
161
+
162
+ # @registry.architectures("spacy.SpanCategorizer.v1")
163
+ # def build_spancat_model(
164
+ # tok2vec: Model[List[Doc], List[Floats2d]],
165
+ # reducer: Model[Ragged, Floats2d],
166
+ # scorer: Model[Floats2d, Floats2d],
167
+ # ) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
168
+ # """Build a span categorizer model, given a token-to-vector model, a
169
+ # reducer model to map the sequence of vectors for each span down to a single
170
+ # vector, and a scorer model to map the vectors to probabilities.
171
+ # tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
172
+ # reducer (Model[Ragged, Floats2d]): The reducer model.
173
+ # scorer (Model[Floats2d, Floats2d]): The scorer model.
174
+ # """
175
+ # model = chain(
176
+ # cast(
177
+ # Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
178
+ # with_getitem(
179
+ # 0,
180
+ # chain(tok2vec,
181
+ # cast(Model[List[Floats2d], Ragged], list2ragged()))),
182
+ # ),
183
+ # extract_spans(),
184
+ # reducer,
185
+ # scorer,
186
+ # )
187
+ # model.set_ref("tok2vec", tok2vec)
188
+ # model.set_ref("reducer", reducer)
189
+ # model.set_ref("scorer", scorer)
190
+ # return model
pipeline/post_processors.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
3
+ import pandas as pd
4
+ import spacy
5
+ from spacy.language import Language
6
+
7
+ SPAN_ATTRS = ["text", "label_", "start", "end"]
8
+
9
+
10
def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                 spans_key: str = "sc",
                 attrs: List[str] = SPAN_ATTRS):
    """Build table rows describing the spans stored under ``spans_key``.

    Each row holds the stringified value of every attribute in ``attrs``
    for one span, followed by that span's score taken from the span group's
    ``attrs['scores']``. Spans and scores are paired by position.

    doc: Object exposing ``doc.spans[spans_key]``.
    spans_key (str): Key of the span group to tabulate.
    attrs (List[str]): Span attribute names to include, in column order.
    RETURNS: A ``(rows, column_names)`` tuple.
    """
    header = attrs + ["Conf. score"]
    span_group = doc.spans[spans_key]

    rows = []
    for span, score in zip(span_group, span_group.attrs['scores']):
        row = [str(getattr(span, name)) for name in attrs]
        row.append(score)  # kept as a number rather than a formatted string
        rows.append(row)
    return rows, header
20
+
21
+
22
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                spans_key: str = "sc",
                attrs: List[str] = SPAN_ATTRS):
    """Build table rows with span attributes plus syntactic information.

    Each row holds the stringified value of every attribute in ``attrs``
    for one span, then the span's score (from the span group's
    ``attrs['scores']``), the dependency label and tag of the span root,
    the tags of all tokens in the span joined with ``_``, and the
    normalized form of the root's head. Spans and scores are paired by
    position.

    doc: Object exposing ``doc.spans[spans_key]``.
    spans_key (str): Key of the span group to tabulate.
    attrs (List[str]): Span attribute names to include, in column order.
    RETURNS: A ``(rows, column_names)`` tuple.
    """
    header = attrs + ["Conf. score", 'span dep',
                      "POS", "POS sequence", "head"]
    span_group = doc.spans[spans_key]

    rows = [
        [str(getattr(span, name)) for name in attrs] + [
            score,
            span.root.dep_,
            span.root.tag_,
            "_".join([tok.tag_ for tok in span]),
            span.root.head.norm_,
        ]
        for span, score in zip(span_group, span_group.attrs['scores'])
    ]
    return rows, header
43
+
44
+
45
def ngrammar(seq: list, n=2):
    """Return every contiguous window of ``n`` items from ``seq``, in order.

    Args:
        seq: Any sliceable sequence (list, tuple, str, ...).
        n: Window size; must be at least 1.

    Returns:
        A list of slices of ``seq``, each of length ``n``. The list is empty
        when ``seq`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Such sizes previously
            produced meaningless (empty or truncated) slices.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # The last valid window starts at len(seq) - n; a negative stop yields
    # an empty range, so short inputs naturally give no windows.
    return [seq[start:start + n] for start in range(len(seq) - n + 1)]
utils/__pycache__/util.cpython-39.pyc ADDED
Binary file (1.68 kB). View file
 
utils/__pycache__/visualize.cpython-39.pyc ADDED
Binary file (3.45 kB). View file
 
utils/util.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from collections import Counter
3
+
4
+
5
def preprocess(text):
    """Collapse whitespace in ``text`` while keeping paragraph breaks.

    Every pair of consecutive newlines is treated as a paragraph break and
    survives as ``'\\n\\n'`` (preceded by a single space). All other runs
    of whitespace, including single newlines, become one space.

    Note: the literal sequence ``&&&&&&&&#&#&#&#&`` is used internally as a
    stand-in for paragraph breaks, so any occurrence of it in the input is
    also turned into ``'\\n\\n'``.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    placeholder = '&&&&&&&&#&#&#&#&'
    # Protect paragraph breaks from the whitespace collapse below.
    text = text.replace("\n\n", ' ' + placeholder)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Single newlines are whitespace too, so they are collapsed here as well.
    text = re.sub(r'\s+', " ", text)
    return text.replace(placeholder, '\n\n')
11
+
12
+
13
def delete_overlapping_span(span_sc):
    """Remove competing and multi-sentence spans from a span group, in place.

    Spans are visited in order together with their scores from
    ``span_sc.attrs['scores']``. Among spans that share a start token, the
    first one seen is kept unless a later span has the same label and a
    strictly higher score, in which case the later one replaces it. A later
    span with a different label is dropped regardless of its score. Any span
    that covers more than one sentence is dropped as well.

    The matching entries of ``span_sc.attrs['scores']`` are removed too, so
    spans and scores stay aligned by position afterwards.

    Args:
        span_sc: Mutable span container (presumably a spaCy ``SpanGroup`` --
            verify against caller) supporting iteration, ``del x[i]`` and an
            ``attrs`` dict with a ``'scores'`` sequence. Each span needs
            ``start``, ``label_`` and ``sents``.

    Returns:
        None. ``span_sc`` is modified in place.
    """
    scores = span_sc.attrs['scores']
    start_counts = Counter(spn.start for spn in span_sc)
    shared_starts = {
        start for start, count in start_counts.items() if count > 1
    }

    doomed = set()  # 0-based positions to delete (a set avoids duplicates)
    kept_at = {}  # start token -> position of the span currently kept
    seen = []  # (label, score) of every visited span, by position

    for pos, (spn, score) in enumerate(zip(span_sc, scores)):
        seen.append((spn.label_, score))

        if spn.start in shared_starts:
            if spn.start not in kept_at:
                kept_at[spn.start] = pos
            else:
                rival_pos = kept_at[spn.start]
                rival_label, rival_score = seen[rival_pos]
                if spn.label_ == rival_label and score > rival_score:
                    # Same label, better score: the newcomer wins.
                    doomed.add(rival_pos)
                    kept_at[spn.start] = pos
                else:
                    doomed.add(pos)

        # Delete spans that run beyond a single sentence.
        if len(list(spn.sents)) > 1:
            doomed.add(pos)

    if not doomed:
        return

    # Delete from the back so earlier positions remain valid.
    for pos in sorted(doomed, reverse=True):
        del span_sc[pos]

    # Keep scores aligned with the surviving spans (stored as a plain list).
    span_sc.attrs['scores'] = [
        score for pos, score in enumerate(scores) if pos not in doomed
    ]
utils/visualize.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ #
5
+ # This code is adapted from spacy-streamlit package by explosion
6
+ # https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
7
+ #
8
+
9
+ from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
10
+ import streamlit as st
11
+ import spacy
12
+ from spacy.language import Language
13
+ from spacy import displacy
14
+ import pandas as pd
15
+
16
+ import streamlit as st
17
+ from spacy_streamlit import visualize_spans
18
+ from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
19
+
20
+ from pipeline.post_processors import simple_table, const_table, ngrammar
21
+ # from skbio import diversity as dv
22
+
23
+ SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
24
+
25
+ # fmt: off
26
+ # SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
27
+ SPAN_ATTRS = [
28
+ "text",
29
+ "label_",
30
+ "start",
31
+ "end",
32
+ ]
33
+
34
+
35
def visualize_spans(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    *,
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
    show_table: bool = True,
    title: Optional[str] = "Spans",
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
    simple: bool = True,
):
    """
    Visualizer for spans.
    doc (Doc, Dict): The document to visualize.
    spans_key (str): Which spans key to render spans from. Default is "sc".
        The same key is used for the rendered HTML and for the tables.
    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
        argument is True.
    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
        Ignored (after a warning) when manual is True, because a dict has no spans to tabulate.
    title (str): The title displayed at the top of the Spans visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
        The dictionary passed in is not modified.
        See https://spacy.io/api/top-level#displacy_options-span
    simple (bool): If True the table is built with simple_table, otherwise with const_table.
    """
    if SPACY_VERSION < (3, 3, 0):
        raise ValueError(
            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
        )
    # Work on a copy so injecting spans_key does not alter the caller's dict.
    displacy_options = dict(displacy_options) if displacy_options else {}
    displacy_options["spans_key"] = spans_key

    if title:
        st.header(title)

    if manual:
        if show_table:
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
            )
        if not isinstance(doc, dict):
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
            )
    html = displacy.render(
        doc,
        style="span",
        options=displacy_options,
        manual=manual,
    )
    st.write(f"{get_html(html)}", unsafe_allow_html=True)

    # The tables read doc.spans, which only exists on a real Doc.
    if show_table and not manual:
        build_table = simple_table if simple else const_table
        data, cols = build_table(doc, spans_key=spans_key, attrs=attrs)

        if data:
            df = pd.DataFrame(data, columns=cols)

            st.subheader("Span information")
            # Highlight confidence scores at or below 0.7.
            st.dataframe(
                df.style.highlight_between(subset='Conf. score', right=.7))

            st.subheader("Label counts & Diagnostic confidence score summary")
            label_counts = df.groupby('label_').agg({
                "label_":
                'count',
                "Conf. score": ['median', 'min', 'max']
            }).round(4)

            st.dataframe(label_counts)
117
+
118
+ # st.subheader("Engagement label by grammatical function")
119
+ # label_dep = pd.crosstab(df['span dep'], df['label_'])
120
+ # st.dataframe(label_dep)
121
+
122
+ # st.subheader('Quantitative results')
123
+ # st.markdown(
124
+ # f"Shannon's index: {dv.alpha.shannon(counts, base=2): .3f}")
125
+ # st.markdown(
126
+ # f"Simpson's e index: {dv.alpha.simpson_e(counts): .3f}")
127
+ # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
128
+ # print(dv.get_alpha_diversity_metrics())