Spaces:
Sleeping
Sleeping
pushes source files
Browse files- # Specify latent true scores +46 -0
- .gitignore +8 -0
- analyzer.py +147 -0
- demo.py +307 -0
- main.py +217 -0
- pipeline/__pycache__/custom_functions.cpython-39.pyc +0 -0
- pipeline/__pycache__/post_processors.cpython-310.pyc +0 -0
- pipeline/__pycache__/post_processors.cpython-39.pyc +0 -0
- pipeline/custom_functions.py +190 -0
- pipeline/post_processors.py +889 -0
- requirements.txt +25 -0
- resources/__pycache__/colors.cpython-39.pyc +0 -0
- resources/__pycache__/template_list.cpython-39.pyc +0 -0
- resources/__pycache__/text_list.cpython-39.pyc +0 -0
- resources/__pycache__/text_list_BAWE.cpython-39.pyc +0 -0
- resources/colors.py +16 -0
- resources/template_list.py +48 -0
- resources/text_list.py +0 -0
- resources/text_list_BAWE.py +0 -0
- utils/__pycache__/util.cpython-39.pyc +0 -0
- utils/__pycache__/utility.cpython-310.pyc +0 -0
- utils/__pycache__/utility.cpython-39.pyc +0 -0
- utils/__pycache__/visualize.cpython-310.pyc +0 -0
- utils/__pycache__/visualize.cpython-39.pyc +0 -0
- utils/utility.py +141 -0
- utils/visualize.py +151 -0
# Specify latent true scores
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Specify latent true scores
|
2 |
+
lx1 =~ 1 * x1
|
3 |
+
lx2 =~ 1 * x2
|
4 |
+
lx3 =~ 1 * x3
|
5 |
+
|
6 |
+
# Specify mean of latent true scores
|
7 |
+
lx1 ~ gamma_lx1 * 1
|
8 |
+
lx2 ~ 0 * 1
|
9 |
+
lx3 ~ 0 * 1
|
10 |
+
# Specify variance of latent true scores
|
11 |
+
lx1 ~~ sigma2_lx1 * lx1
|
12 |
+
lx2 ~~ 0 * lx2
|
13 |
+
lx3 ~~ 0 * lx3
|
14 |
+
# Specify intercept of observed scores
|
15 |
+
x1 ~ 0 * 1
|
16 |
+
x2 ~ 0 * 1
|
17 |
+
x3 ~ 0 * 1
|
18 |
+
# Specify variance of observed scores
|
19 |
+
x1 ~~ sigma2_ux * x1
|
20 |
+
x2 ~~ sigma2_ux * x2
|
21 |
+
x3 ~~ sigma2_ux * x3
|
22 |
+
# Specify autoregressions of latent variables
|
23 |
+
lx2 ~ 1 * lx1
|
24 |
+
lx3 ~ 1 * lx2
|
25 |
+
# Specify latent change scores
|
26 |
+
dx2 =~ 1 * lx2
|
27 |
+
dx3 =~ 1 * lx3
|
28 |
+
# Specify latent change scores means
|
29 |
+
dx2 ~ 0 * 1
|
30 |
+
dx3 ~ 0 * 1
|
31 |
+
# Specify latent change scores variances
|
32 |
+
dx2 ~~ 0 * dx2
|
33 |
+
dx3 ~~ 0 * dx3
|
34 |
+
# Specify constant change factor
|
35 |
+
g2 =~ 1 * dx2 + 1 * dx3
|
36 |
+
# Specify constant change factor mean
|
37 |
+
g2 ~ alpha_g2 * 1
|
38 |
+
# Specify constant change factor variance
|
39 |
+
g2 ~~ sigma2_g2 * g2
|
40 |
+
# Specify constant change factor covariance with the initial true score
|
41 |
+
g2 ~~ sigma_g2lx1 * lx1
|
42 |
+
# Specify proportional change component
|
43 |
+
dx2 ~ beta_x * lx1
|
44 |
+
dx3 ~ beta_x * lx2
|
45 |
+
# Specify autoregression of change score
|
46 |
+
dx3 ~ phi_x * dx2
|
.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ECCE_analysis
|
2 |
+
ECCE_texts
|
3 |
+
ICNALE_analysis
|
4 |
+
ICNALE_texts
|
5 |
+
results
|
6 |
+
inputtexts
|
7 |
+
.DS_Store
|
8 |
+
|
analyzer.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
import spacy_streamlit
|
4 |
+
from collections import Counter
|
5 |
+
import glob
|
6 |
+
|
7 |
+
import spacy
|
8 |
+
from spacy.tokens import Doc
|
9 |
+
from spacy.cli._util import import_code
|
10 |
+
|
11 |
+
from utils.visualize import visualize_spans
|
12 |
+
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
|
13 |
+
|
14 |
+
from resources.text_list import TEXT_LIST
|
15 |
+
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
|
16 |
+
from resources.colors import COLORS_1
|
17 |
+
|
18 |
+
|
19 |
+
from skbio import diversity as dv
|
20 |
+
|
21 |
+
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
|
22 |
+
import pandas as pd
|
23 |
+
|
24 |
+
# from pipeline.custom_functions import custom_functions
|
25 |
+
SPAN_ATTRS = ["text", "label_", "start", "end"]
|
26 |
+
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
|
27 |
+
|
28 |
+
|
29 |
+
# spacy.prefer_gpu()
|
30 |
+
|
31 |
+
def load_model(spacy_model):
    """Load a spaCy pipeline and make sure it contains a sentencizer.

    Args:
        spacy_model: Name or path of the pipeline; passed unchanged to
            ``spacy.load``.

    Returns:
        The loaded pipeline. A ``sentencizer`` component is appended when the
        pipeline does not already have a component with that name.
    """
    nlp = spacy.load(spacy_model)
    # add_pipe raises ValueError when the component name is already in use,
    # so only add the sentencizer to pipelines that lack one.
    if "sentencizer" not in nlp.component_names:
        nlp.add_pipe("sentencizer")
    return nlp
|
36 |
+
|
37 |
+
# source = spacy.blank("en")
|
38 |
+
|
39 |
+
modelname = "en_engagement_LSTM_f3"
|
40 |
+
# modelname = "en_engagement_LSTM_f5"
|
41 |
+
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"
|
42 |
+
|
43 |
+
# exist_ok=True keeps a second run from crashing once the folder exists.
# NOTE(review): machine-specific absolute path - confirm it is still wanted.
os.makedirs(os.path.join("/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results", modelname), exist_ok=True)
|
44 |
+
|
45 |
+
import_code("pipeline/custom_functions.py")
|
46 |
+
|
47 |
+
# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
|
48 |
+
nlp = spacy.load(modelname)
|
49 |
+
# doc = nlp(preprocess(TEXT_LIST[0]))
|
50 |
+
|
51 |
+
# cleanup_justify(doc, doc.spans["sc"])
|
52 |
+
# delete_overlapping_span(doc.spans['sc'])
|
53 |
+
|
54 |
+
# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
|
55 |
+
# seq = [s for s in doc.spans["sc"]]
|
56 |
+
# span_ngrams = ngrammar(seq=seq, n=3)
|
57 |
+
|
58 |
+
# df = pd.DataFrame(data, columns=cols)
|
59 |
+
|
60 |
+
# constant_value = 42
|
61 |
+
# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')
|
62 |
+
|
63 |
+
# doclen = len(doc)
|
64 |
+
# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
|
65 |
+
|
66 |
+
# df.insert(0, "new", new_col, True)
|
67 |
+
# df.insert(1, "nwords", doc_len, True)
|
68 |
+
|
69 |
+
# df.to_csv("results/test.csv")
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
+
# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
|
74 |
+
inputfiles = glob.glob("ICNALE_texts/*/*.txt")
|
75 |
+
savedir = "ICNALE_analysis"
|
76 |
+
storeall = True
|
77 |
+
storage = []
|
78 |
+
# Create the per-model output folder under savedir; exist_ok=True makes the
# script safe to re-run after the folder has been created.
os.makedirs(os.path.join(savedir, modelname), exist_ok=True)
|
79 |
+
|
80 |
+
|
81 |
+
doc_level_storage = []
|
82 |
+
|
83 |
+
for file in inputfiles:
|
84 |
+
|
85 |
+
filename = os.path.split(file)[-1]
|
86 |
+
|
87 |
+
with open(file, "r") as f:
|
88 |
+
text = f.read()
|
89 |
+
|
90 |
+
text = preprocess(text)
|
91 |
+
doc = nlp(text)
|
92 |
+
cleanup_justify(doc, doc.spans["sc"])
|
93 |
+
delete_overlapping_span(doc.spans['sc'])
|
94 |
+
|
95 |
+
data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
|
96 |
+
seq = [s for s in doc.spans["sc"]]
|
97 |
+
span_ngrams = ngrammar(seq=seq, n=3)
|
98 |
+
|
99 |
+
|
100 |
+
### Make it a dataset
|
101 |
+
df = pd.DataFrame(data, columns=cols)
|
102 |
+
df = df.astype({"start": int, "end": int}) #convert col type
|
103 |
+
df = df.sort_values(by= ['start']) #and sort by start
|
104 |
+
# constant_value = 42
|
105 |
+
new_col = pd.Series([filename] * df.shape[0], name='filename')
|
106 |
+
|
107 |
+
doclen = len(doc)
|
108 |
+
doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
|
109 |
+
|
110 |
+
df.insert(0, "filename", new_col, True)
|
111 |
+
df.insert(1, "nwords", doc_len, True)
|
112 |
+
df.to_csv(f"{savedir}/{modelname}/{filename}.csv")
|
113 |
+
|
114 |
+
sequences = list(df['label_'])
|
115 |
+
# Engagement ngrams
|
116 |
+
span_bigrams = ngrammar(seq=seq, n=2)
|
117 |
+
bidf = pd.DataFrame(span_bigrams)
|
118 |
+
|
119 |
+
# constant_value = 42
|
120 |
+
new_col = pd.Series([filename] * bidf.shape[0], name='filename')
|
121 |
+
# DataFrame.insert works in place and returns None, so its result must not
# be assigned back to bidf.
bidf.insert(0, "filename", new_col, True)
|
122 |
+
|
123 |
+
|
124 |
+
## Document level
|
125 |
+
doc_level = {}
|
126 |
+
counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
|
127 |
+
div = diversity_values(list(counts))
|
128 |
+
|
129 |
+
div_data = pd.DataFrame.from_dict(div, orient='index')
|
130 |
+
|
131 |
+
doc_data = pd.concat([counts, div_data], axis = 0).T
|
132 |
+
doc_data.insert(0, "filename", filename, True)
|
133 |
+
# Use the scalar token count (doclen) so it is broadcast to every row, the
# same way the scalar filename is on the previous insert; the per-span Series
# doc_len would be index-aligned against doc_data instead.
doc_data.insert(1, "nwords", doclen, True)
|
134 |
+
doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")
|
135 |
+
|
136 |
+
if storeall:
|
137 |
+
storage.append(df)
|
138 |
+
doc_level_storage.append(doc_data)
|
139 |
+
|
140 |
+
|
141 |
+
# pd.concat raises ValueError on an empty list (no input file matched, or
# storeall is False), so only write the combined table when there is data.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")
|
144 |
+
|
145 |
+
|
146 |
+
# alldoc = pd.concat(doc_level_storage)
|
147 |
+
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")
|
demo.py
ADDED
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
# import spacy_streamlit
|
3 |
+
# from collections import Counter
|
4 |
+
|
5 |
+
import spacy
|
6 |
+
# from spacy.tokens import Doc
|
7 |
+
|
8 |
+
# from spacy_streamlit import visualize_spans
|
9 |
+
|
10 |
+
import streamlit as st
|
11 |
+
|
12 |
+
from utils.utility import delete_overlapping_span, cleanup_justify
|
13 |
+
from utils.visualize import visualize_spans
|
14 |
+
|
15 |
+
# nlp = spacy.load(
|
16 |
+
# "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
|
17 |
+
# )
|
18 |
+
|
19 |
+
#Load from local storage
|
20 |
+
#MODEL_LIST = ['en_engagement_RoBERTa-ME-AtoE.tar.gz']
|
21 |
+
|
22 |
+
#model = st.selectbox('Select model', MODEL_LIST, index=0)
|
23 |
+
#nlp = spacy.load("packages/" + model)
|
24 |
+
|
25 |
+
# Load from huggingface
|
26 |
+
# sm = spacy.load('en_core_web_sm', disable=['ner'])
|
27 |
+
|
28 |
+
st.set_page_config(page_title="ENGAGEMENT analyzer (beta ver 0.3)",
|
29 |
+
layout="wide",
|
30 |
+
initial_sidebar_state="expanded")
|
31 |
+
|
32 |
+
|
33 |
+
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the ``en_engagement_LSTM`` spaCy pipeline and return it.

    Wrapped in ``st.cache`` with ``allow_output_mutation=True``.
    """
    # Alternative pipelines that were tried here:
    #   "en_engagement_RoBERTa_context_flz"
    #   "en_engagement_spl_RoBERTa_base_attention"
    pipeline = spacy.load("en_engagement_LSTM")
    return pipeline
|
39 |
+
|
40 |
+
|
41 |
+
nlp = load_model()
|
42 |
+
|
43 |
+
doc = nlp(
|
44 |
+
'Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here.'
|
45 |
+
)
|
46 |
+
|
47 |
+
# TPL_ENT = """
|
48 |
+
# <mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
49 |
+
# {text}
|
50 |
+
# <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}</span>
|
51 |
+
# </mark>
|
52 |
+
# """
|
53 |
+
|
54 |
+
TPL_SPANS = """
|
55 |
+
<div class="spans" style="line-height: 4.5;">
|
56 |
+
{text}
|
57 |
+
{span_slices}
|
58 |
+
{span_starts}
|
59 |
+
</div>
|
60 |
+
"""
|
61 |
+
|
62 |
+
TPL_SPAN = """
|
63 |
+
<span style="font-weight: bold; display: inline-block; line-height: 3; padding-bottom: 12px;position: relative;">
|
64 |
+
{text}
|
65 |
+
{span_slices}
|
66 |
+
{span_starts}
|
67 |
+
</span>
|
68 |
+
"""
|
69 |
+
|
70 |
+
TPL_SPAN_SLICE = """
|
71 |
+
<span style="background: {bg}; top: {top_offset}px; display: inline-block; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
72 |
+
</span>
|
73 |
+
"""
|
74 |
+
|
75 |
+
TPL_SPAN_START = """
|
76 |
+
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
77 |
+
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
78 |
+
|
79 |
+
{label}{kb_link}
|
80 |
+
</span>
|
81 |
+
</span>
|
82 |
+
|
83 |
+
"""
|
84 |
+
|
85 |
+
# TPL_SPAN_START_RTL = """
|
86 |
+
# <span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
87 |
+
# <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
88 |
+
# {label}{kb_link}
|
89 |
+
# </span>
|
90 |
+
# </span>
|
91 |
+
# """
|
92 |
+
|
93 |
+
DEFAULT_TEXT = """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia."""
|
94 |
+
|
95 |
+
TEXT_LIST = [
|
96 |
+
'''To a significant extent, individuals can be considered responsible for the rise of Hitler to power on the 31st of January, 1933. Hitler himself, the charismatic leader of the Nazi Party, as well as creator of Nazi policy, played a key role in his own rise to power. However, other individuals in government, such as Hindenburg and von Papen were influential in Hitler’s rise. To a small extent, other factors also enabled Hitler to rise to power such as the Depression and the weakness of the political system. Nevertheless to a significant extent, individuals can be held responsible for the rise of Adolf Hitler to power.''',
|
97 |
+
'''Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia.''',
|
98 |
+
'''Certainly, the argumentation is not without some faults. For example, the statement that “linking homosexuality to witches fulfills the same purpose” is not supported by references to the readings. It is not clear who was linking homosexuality to witches and in what context. Nevertheless, overall and in line with the general tendencies reported in the previous section, the author employs various contracting and expanding engagement resources successfully. However, a large part of the successful use of engagement resources seems to be related to how the author structures these strategies throughout the text, namely in a wave-like fashion: from acknowledging the opinions of others, to countering them by offering one’s own interpretation, to supporting it by acknowledging other sources.''',
|
99 |
+
'''As the centuries passed, accounts of witchcraft became more and more specific; details of witches’ ceremonies and oaths became more concrete and whatever the condemned humans confessed to was treated as fact. As discussants correctly pointed out, Bernardino of Siena, Martin Le Franc, and the anonymous author of the Errores Gazariorum all have an even more aggressive campaign against witches than did the authors of our previous readings. By depicting their rituals and customs, they look to paint the most grotesque picture of witches possible. Their frenzied accusations, were some of the main catalysts of the subsequent witch hunts.''',
|
100 |
+
'''The post labeled “Witchcraft as a Problem in Society” clearly explains the contribution that each text makes to the witch hunts. While two of the authors focused on describing, in full detail, the shocking and disturbing practices that witches partook of, the others tried to prove that the witch threat was real. These last texts sought to explain witchcraft so as to convince readers that witches actually existed. As all posts reiterate, the devil is definitely at the source of witchcraft.''',
|
101 |
+
'''The third part temporarily puts aside mediation analysis and shifts the discussion to moderation analysis. In Chapter 7, I show how a multiple regression model can be made more flexible by allowing one variable’s effect to depend linearly on another variable in the model. The resulting moderated multiple regression model allows an investigator to ascertain the extent to which X’s influence on outcome variable Y is contingent on or interacts with a moderator variable W.''',
|
102 |
+
'''For instance, research has shown that people have a tendency to justify close others’ unethical actions to protect them (Gino and Galinsky 2012). Research has also shown that parents who feel close to their children often adopt strict curfew practices (Elder et al., 1995). (EC-33)''',
|
103 |
+
'''Fitzpatrick and Pagani (2013) found that engagement skills in classroom behaviour at kindergarten were related with better math scores and academic success. (LC-0525-EN)''',
|
104 |
+
'''The COAG Reform Council (2013) indicated that when compared to other students, Australian Year 4 students who attended one year of ECEC services or programs gained 11 points higher in reading (LC-0471-MA). Preliminary evidence suggests that teaching children from low-income families using humanoid robots increases motivation, sense of community, and self-expression... (EC-64). These findings suggest that visual perception takes up only a small fraction of fixation durations. Specifically, Verdelhan (2010) proposes a two-country, one-good model in which each country has an exogenously specified i.i.d. consumption growth process. Waters & Baur (2003) suggest that children or adolescents who are overweight or obese suffer from social and psychological issues. (LC-0460-EN)''',
|
105 |
+
'''According to the Australian Bureau of Statistics (2008), the percentage of obese or overweight adults is a staggering 60%.
|
106 |
+
According to George et al. (2011), in the UK immigration has improved the academic performance of the native children.
|
107 |
+
According to UNICEF (2011) a child that is breastfed within the first hour of life is fourteen times less likely to die from diarrhoea or pneumonia.''',
|
108 |
+
'''As far as I am concerned, I do think globalization is good chance for China’s developing. From my point of view, I prefer to think that advantages of globalization outweighs disadvantages. ''',
|
109 |
+
'''As we know, China has made great progress for these years. I think it is the result of globalization. We all know China is a fast-developing country. We can seethe great progress that China has made. ''',
|
110 |
+
'''His idea was that an important ninth century bishop called John Anglicus may indeed have given birth to a child in full view of everyone on the streets of Rome, but that this bishop was not and never had been the pope. Of course, there is no evidence whatever for this, as Leibnitz himself well knew.''',
|
111 |
+
'''On the whole, however, when evaluating meanings metaphorically, the Chinese EFL learners hedge and qualify their statements subjectively, tempering the certainty and authority of their assertions rather than using the resources of interpersonal metaphor to reinforce and substantiate their arguments. These tendencies reveal a key area for pedagogical intervention. Namely, instruction could focus on the value of construing metaphors objectively to obscure the author as the source of the evaluation. Similarly, raising students’ awareness of the space of negotiation and the value of offering assertions on a cline of certainty (e.g., IT IS EVIDENT) rather than through exclusive declarations of shared knowledge (e.g., AS WE ALL KNOW) is critical for academic writing refinement. Instructional interventions such as these are key areas for further investigation.''',
|
112 |
+
'''Of the defendants involved in Utah Pie Company’s case only one seems to have emerged as exceptionally successful. However this success was not a factor of overwhelming market power, as can be seen by the dominant position of Mrs. Smith’s during this time, which had maintained a 39-45 percent market share over the corresponding period.''',
|
113 |
+
'''Because of the evidence presented by Tremblay and Tremblay, it would appear that mergers in the brewing industry would have been procompetitive because of economies of scale. However, allowing a firm to acquire more than 20% of the market in Wisconsin would give it too much power to charge higher prices, even if the merger would help lower total average costs.''',
|
114 |
+
'''Taken in whole, the economic evidence for grocery retailers in the decades after the Von’s decision suggests that increased concentration is pro-competitive and good for consumers, running contrary to the fears proposed by the Court.''',
|
115 |
+
'''The remedies that Justice Lewis Powell prescribed did not gain the desired effect, and I feel that they were not very effective in promoting competition. (Elan, S86)''',
|
116 |
+
'''There is the possibility for abuse if the producer sets different maximum prices for different retailers, allowing some to reap higher profits.''',
|
117 |
+
'''Such a program, with appropriate limits, would provide a balanced structure that would ensure quality patient care.''',
|
118 |
+
'''A recent survey of physician satisfaction by Harvard Medical School found that physician autonomy and the ability to provide high-quality care, not income, are most strongly associated with changes in job satisfaction . Thus, it seems reasonable to assume that health care providers would take advantage of the greater bargaining power to improve the quality of care. (Ken, S78-79)''',
|
119 |
+
'''It appears, then, that maximum price fixing does the greatest harm when set below a competitive level [evidentialize]. In Case 4 it could potentially do harm to small retailers trying to enter the market [suggest], but does so for the benefit of consumers and the producer. Based purely on the models, it appears that, at the very least, maximum prices deserve a Rule of Reason approach to evaluate their cost and benefits.''',
|
120 |
+
'''It could be seen that for this 68% of the respondents, Tampines was characteristically a location that provided for them all their basic needs. It can be seen from chart [11] that many people quoted accessibility and proximity to home, and even shopping as one of the ideal factors that drew them there. Accessibility is quite a key factor because it is evident that the regional centre was built on the basis of good infrastructure. In comparison, 32% of the respondents felt that the conventional downtown was still a major attraction, even though the regional centre had gained quite a vast amount of popularity and did to large extent have an air of modernity.'''
|
121 |
+
]
|
122 |
+
|
123 |
+
|
124 |
+
@st.cache(suppress_st_warning=True)
def preprocess(text):
    """Collapse whitespace in *text* while keeping paragraph breaks.

    Double newlines are swapped for a placeholder, every remaining newline
    and whitespace run is reduced to a single space, and the placeholder is
    then turned back into a double newline.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    # Placeholder that shields paragraph breaks from the whitespace collapse.
    marker = '&&&&&&&&#&#&#&#&'
    text = text.replace("\n\n", ' ' + marker)
    text = text.replace('\n', ' ')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', " ", text)
    return text.replace(marker, '\n\n')
|
131 |
+
|
132 |
+
|
133 |
+
def delete_span(span_sc: dict):
    """Remove, in place, every span that covers more than one sentence.

    The function is deliberately not wrapped in ``st.cache``: it works only
    by side effect and returns ``None``, so a cache hit would skip the
    deletion altogether.

    Args:
        span_sc: Mutable, index-addressable collection of spans (it is
            iterated and items are removed with ``del span_sc[i]``). Each
            item must expose a ``sents`` iterable. The ``dict`` annotation is
            kept unchanged from the original signature.
    """
    # Zero-based positions of the spans that cross a sentence boundary.
    id_del = [n for n, spn in enumerate(span_sc) if len(list(spn.sents)) > 1]

    # Delete from the end so the remaining positions stay valid.
    for idx in reversed(id_del):
        del span_sc[idx]
|
146 |
+
|
147 |
+
|
148 |
+
# st.markdown('''
|
149 |
+
# <style>
|
150 |
+
# .sidebar .sidebar-content {{
|
151 |
+
# width: 300px;
|
152 |
+
# }}
|
153 |
+
# </style>
|
154 |
+
# ''',
|
155 |
+
# unsafe_allow_html=True)
|
156 |
+
|
157 |
+
with st.sidebar:
|
158 |
+
st.markdown("""
|
159 |
+
|
160 |
+
## Engagement moves analyzed in this tool (adapted from Martin & White, 2005).
|
161 |
+
|
162 |
+
| Engagement moves | Description |
|
163 |
+
| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
164 |
+
| `Deny` (Contract -> disclaim) | An utterance which invokes a contrary position but which at the same time rejects it directly. The contrary position is hence given very little dialogic space. |
|
165 |
+
| `Counter` (Contract -> disclaim) | An utterance which expresses the present proposition as replacing and thus 'countering' another proposition which would have been expected. |
|
166 |
+
| `Concur` (Contract -> proclaim) | An utterance which shows the writer's expectation/assumption that the putative readers will agree with the proposition and/or share the same knowledge. |
|
167 |
+
| `Pronounce` (Contract -> proclaim) | An utterance which expresses a strong level of writer commitment through the author's explicit emphasis and interpolation, thereby closing down the dialogic space. |
|
168 |
+
| `Endorse` (Contract -> proclaim) | An utterance which refers to external sources as warrantable, undeniable, and/or reliable. It expresses the writer’s alignment with and endorsement of an attributed proposition. As such, the dialogic space is somewhat narrowed. |
|
169 |
+
| `Entertain` (Expand) | An utterance which indicates author's position but as only one possibility amongst others, thereby opening up dialogic space. |
|
170 |
+
| `Attribute` (Expand) | An utterance which signifies dialogic space as the writer attributes the proposition to an external source. |
|
171 |
+
| `Monogloss` | An utterance which does not employ any value of engagement. Such an utterance ignores the dialogic potential in an utterance. |
|
172 |
+
|
173 |
+
""")
|
174 |
+
#For a more complete description of the category, visit [the annotation guideline](https://egumasa.github.io/engagement-annotation-project/3_Categories/)!!
|
175 |
+
|
176 |
+
st.sidebar.markdown("""
|
177 |
+
Engagement Analyzer is developed by [Masaki Eguchi](https://masakieguchi.weebly.com).
|
178 |
+
|
179 |
+
### Acknowledgements:
|
180 |
+
|
181 |
+
The development of this tool has been supported by the following grants:
|
182 |
+
|
183 |
+
- The TIRF Doctoral Dissertation Grant 2022 sponsored by the International Research Foundation for English Language Education (TIRF)
|
184 |
+
- The NFMLTA-MLJ Doctoral Dissertation Writing Support Grant 2022 sponsored by the National Federation of Modern Language Teachers Associations (NFMLTA)
|
185 |
+
- Duolingo English Test Doctoral Dissertation Award, 2022
|
186 |
+
- The Graduate Student Research Award sponsored by the Department of Linguistics, University of Oregon
|
187 |
+
|
188 |
+
I would also like to thank:
|
189 |
+
- Aaron Miller (Linguistics, University of Oregon) for corpus annotation
|
190 |
+
- Ryan Walker (Linguistics/Anthropology, University of Oregon) for corpus annotation
|
191 |
+
- Dr. Kristopher Kyle (Associate Professor in Linguistics, University of Oregon)
|
192 |
+
""")
|
193 |
+
|
194 |
+
|
195 |
+
cc = '<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.'
|
196 |
+
|
197 |
+
st.sidebar.markdown(cc, unsafe_allow_html=True)
|
198 |
+
|
199 |
+
st.header("Engagement Analyzer (beta ver 0.2)")
|
200 |
+
st.write(
|
201 |
+
"Engagement Analyzer is a free tool that analyzes English texts for rhetorical strategies under the Engagement system framework (Martin & White, 2005). Martin and White (2005) propose two basic stance-taking strategies: expansion and contraction, which are in turn divided into finer-grained rhetorical strategies. The current tool allows you to analyze texts for a total of nine rhetorical strategies. The definitions of each category label can be found from the side bar"
|
202 |
+
)
|
203 |
+
|
204 |
+
with st.expander("See more explanation"):
|
205 |
+
st.markdown("""
|
206 |
+
According to Martin & White (2005), Engagement is about how the writer of a text takes stances on a topic of discussion by `Expanding (= open)` or `Contracting (= close)` the discourse for alternative viewpoints.
|
207 |
+
|
208 |
+
**Expansion strategy** = Discourse moves which open-up the dialogic space; the speaker/writer actively makes allowances for dialogically alternative positions and voices. (e.g., `ENTERTAIN`, `ATTRIBUTE`)
|
209 |
+
|
210 |
+
**Contraction strategy** = Discourse moves which close down dialogic space; the speaker/writer acts to challenge, fend off or restrict other alternative positions and voices (e.g., `DENY`, `COUNTER`, `PRONOUNCE`, `ENDORSE`).
|
211 |
+
|
212 |
+
""")
|
213 |
+
|
214 |
+
st.info('''Updated on Jan.11th, 2023\n
|
215 |
+
The current version was trained on 2,519 sentences and tested on 443 sentences. It achieved the following benchmark:
|
216 |
+
- Macro F1 = .75
|
217 |
+
- Macro Precision = .78
|
218 |
+
- Macro Recall = .74
|
219 |
+
I expect that the model's performance improves as the annotated dataset gets larger.
|
220 |
+
''')
|
221 |
+
|
222 |
+
with st.form("my_form"):
|
223 |
+
|
224 |
+
st.subheader("Option 1: selecting example text from list")
|
225 |
+
text = st.selectbox('', TEXT_LIST)
|
226 |
+
|
227 |
+
st.subheader("Option 2: analyze your own text")
|
228 |
+
input_text = st.text_area(
|
229 |
+
label="",
|
230 |
+
value=
|
231 |
+
"I would strongly encourage you to put your texts here to analyze it for stance-taking expressions.",
|
232 |
+
height=120)
|
233 |
+
st.text(
|
234 |
+
'The text from the pull-down list and in the textbox cannot be analyzed at the same time. Please select the mode.'
|
235 |
+
)
|
236 |
+
|
237 |
+
textmode = st.radio(
|
238 |
+
label='Choose the mode.',
|
239 |
+
options=['Option 1: Pull-down choice', 'Option 2: My own text'],
|
240 |
+
index=1)
|
241 |
+
|
242 |
+
submitted = st.form_submit_button("Submit")
|
243 |
+
# On submit, analyse either the user's own text or the example picked from
# the pull-down list; the analysis itself is the same for both modes.
if submitted:
    if textmode == 'Option 2: My own text':
        text = input_text
    with st.spinner('Analysis in progress...'):
        doc = nlp(preprocess(text))
|
253 |
+
|
254 |
+
## Dependency parsing
|
255 |
+
|
256 |
+
# if textmode == 'My own text':
|
257 |
+
# text = input_text
|
258 |
+
# doc = nlp(preprocess(text))
|
259 |
+
# #st.markdown("> " + input_text)
|
260 |
+
# else:
|
261 |
+
# doc = nlp(preprocess(text))
|
262 |
+
# #st.markdown("> " + text)
|
263 |
+
|
264 |
+
# st.header("Text", "text")
|
265 |
+
# st.write(text)
|
266 |
+
# delete_span(doc.spans['sc'])
|
267 |
+
|
268 |
+
cleanup_justify(doc, doc.spans['sc'])
|
269 |
+
delete_overlapping_span(doc.spans['sc'])
|
270 |
+
|
271 |
+
# Render the predicted spans stored under doc.spans["sc"] with the custom
# displaCy templates and a fixed label -> colour mapping.
visualize_spans(
    doc,
    spans_key="sc",
    displacy_options={
        'template': {
            "span": TPL_SPAN,
            'slice': TPL_SPAN_SLICE,
            'start': TPL_SPAN_START,
        },
        "colors": {
            "ENTERTAIN": "#82b74b",
            "DENY": '#c94c4c',
            "COUNTER": "#eea29a",
            "PRONOUNCE": "#92a8d1",
            "ENDORSE": "#034f84",
            # The literal used to list "CITATION" twice ("#b2b2b2" here and
            # "#F8C471" further down). In a dict literal the last value wins
            # while the key keeps its first position, so only the effective
            # entry is kept, at that position.
            "CITATION": "#F8C471",
            # "MONOGLOSS": "#3e4444",
            "ATTRIBUTE": "#f7786b",
            "ATTRIBUTION": "#f7786b",
            "PROCLAIM": "#92a8d1",
            "SOURCES": "#F7DC6F",
            "JUSTIFYING": "#2ECC71",
            "ENDOPHORIC": "#FAD7A0"
        },
    },
    simple = True
)
|
299 |
+
|
300 |
+
st.subheader("Bibliography")
|
301 |
+
st.markdown("""
|
302 |
+
* Chang, P., & Schleppegrell, M. (2011). Taking an effective authorial stance in academic writing: Making the linguistic resources explicit for L2 writers in the social sciences. _Journal of English for Academic Purposes, 10_ (3), 140–151. https://doi.org/10.1016/j.jeap.2011.05.005
|
303 |
+
* Martin, J. R., & White, P. R. R. (2005). _The language of evaluation: Appraisal in English._ Palgrave Macmillan.
|
304 |
+
* Ryshina-Pankova, M. (2014). Exploring academic argumentation in course-related blogs through ENGAGEMENT. In G. Thompson & L. Alba-Juez (Eds.), _Pragmatics & Beyond New Series (Vol. 242, pp. 281–302)_. John Benjamins Publishing Company. https://doi.org/10.1075/pbns.242.14rys
|
305 |
+
* Wu, S. M. (2007). The use of engagement resources in high- and low-rated undergraduate geography essays. _Journal of English for Academic Purposes, 6_ (3), 254–271. https://doi.org/10.1016/j.jeap.2007.09.006
|
306 |
+
|
307 |
+
""")
|
main.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import spacy_streamlit
|
3 |
+
from spacy_streamlit import visualize_parser
|
4 |
+
from collections import Counter
|
5 |
+
|
6 |
+
import spacy
|
7 |
+
import streamlit as st
|
8 |
+
|
9 |
+
# try:
|
10 |
+
# from .scripts.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
|
11 |
+
# except ImportError:
|
12 |
+
# from pipeline.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
|
13 |
+
from spacy.tokens import Doc
|
14 |
+
from spacy.cli._util import import_code
|
15 |
+
|
16 |
+
from utils.visualize import visualize_spans
|
17 |
+
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
|
18 |
+
from resources.text_list import TEXT_LIST
|
19 |
+
from resources.text_list_BAWE import TEXT_LIST_BAWE
|
20 |
+
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
|
21 |
+
from resources.colors import COLORS_1
|
22 |
+
|
23 |
+
import_code("pipeline/custom_functions.py")
|
24 |
+
st.set_page_config(page_title='Engagement model comparaer', layout="wide")
|
25 |
+
|
26 |
+
# spacy.prefer_gpu()
|
27 |
+
|
28 |
+
MODEL_LIST =['en_engagement_LSTM', 'en_engagement_LSTM']
|
29 |
+
|
30 |
+
# MODEL_LIST = [
|
31 |
+
# 'en_engagement_three_RoBERTa_base_LSTM384-0.9.2/en_engagement_three_RoBERTa_base_LSTM384/en_engagement_three_RoBERTa_base_LSTM384-0.9.2',
|
32 |
+
# 'en_engagement_three_RoBERTa_acad3_db-0.9.2/en_engagement_three_RoBERTa_acad3_db/en_engagement_three_RoBERTa_acad3_db-0.9.2',
|
33 |
+
# 'silver-sweep-34/model-best',
|
34 |
+
# 'expert-sweep-4/model-best',
|
35 |
+
# 'confused-sweep-6/model-best',
|
36 |
+
# 'warm-sweep-20/model-best',
|
37 |
+
# "en_engagement_three_RoBERTa_base-1.10.0/en_engagement_three_RoBERTa_base/en_engagement_three_RoBERTa_base-1.10.0",
|
38 |
+
# "en_engagement_three_RoBERTa_acad_db-1.10.0/en_engagement_three_RoBERTa_acad_db/en_engagement_three_RoBERTa_acad_db-1.10.0",
|
39 |
+
# "en_engagement_para_RoBERTa_acad_db3-0.9.0/en_engagement_para_RoBERTa_acad_db3/en_engagement_para_RoBERTa_acad_db3-0.9.0",
|
40 |
+
# "en_engagement_para_RoBERTa_acad_LSTM2-0.9.0/en_engagement_para_RoBERTa_acad_LSTM2/en_engagement_para_RoBERTa_acad_LSTM2-0.9.0",
|
41 |
+
# "en_engagement_three_RoBERTa_acad_db3-0.9.1/en_engagement_three_RoBERTa_acad_db3/en_engagement_three_RoBERTa_acad_db3-0.9.1",
|
42 |
+
# "en_engagement_three_RoBERTa_acad_LSTM2-0.9.1/en_engagement_three_RoBERTa_acad_LSTM2/en_engagement_three_RoBERTa_acad_LSTM2-0.9.1",
|
43 |
+
# "en_engagement_three_RoBERTa_acad_db3-0.9.2/en_engagement_three_RoBERTa_acad_db3/en_engagement_three_RoBERTa_acad_db3-0.9.2",
|
44 |
+
# 'en_engagement_spl_RoBERTa_acad_db-0.7.4/en_engagement_spl_RoBERTa_acad_db/en_engagement_spl_RoBERTa_acad_db-0.7.4',
|
45 |
+
# 'en_engagement_spl_RoBERTa_acad_db3-0.9.0/en_engagement_spl_RoBERTa_acad_db3/en_engagement_spl_RoBERTa_acad_db3-0.9.0',
|
46 |
+
# 'en_engagement_spl_RoBERTa_acad_LSTM-0.7.2/en_engagement_spl_RoBERTa_acad_LSTM/en_engagement_spl_RoBERTa_acad_LSTM-0.7.2',
|
47 |
+
# 'en_engagement_spl_RoBERTa_acad_512',
|
48 |
+
# 'en_engagement_spl_RoBERTa_acad',
|
49 |
+
# 'en_engagement_spl_RoBERTa_exp-0.6.5/en_engagement_spl_RoBERTa_exp/en_engagement_spl_RoBERTa_exp-0.6.5',
|
50 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.3.4.1221/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.3.4.1221',
|
51 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.2.1228/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.2.1228',
|
52 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.1.1228/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.1.1228',
|
53 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.2.1220/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.2.1220',
|
54 |
+
# # 'en_engagement_spl_RoBERTa2-0.2.2.1210/en_engagement_spl_RoBERTa2/en_engagement_spl_RoBERTa2-0.2.2.1210',
|
55 |
+
# # 'en_engagement_spl_RoBERTa-0.2.2.1210/en_engagement_spl_RoBERTa/en_engagement_spl_RoBERTa-0.2.2.1210',
|
56 |
+
# # 'en_engagement_spl_RoBERTa_acad_max1_do02',
|
57 |
+
# # 'en_engagement_spl_RoBERTa2-0.2.2.1210/en_engagement_spl_RoBERTa2/en_engagement_spl_RoBERTa2-0.2.2.1210',
|
58 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.3.1210/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.3.1210',
|
59 |
+
# # 'en_engagement_spl_RoBERTa_acad_max1_do02',
|
60 |
+
# # 'en_engagement_spl_RoBERTa_sqbatch_RAdam-20221202_0.1.5/en_engagement_spl_RoBERTa_sqbatch_RAdam/en_engagement_spl_RoBERTa_sqbatch_RAdam-20221202_0.1.5',
|
61 |
+
# # 'en_engagement_spl_RoBERTa_context_flz-20221130_0.1.4/en_engagement_spl_RoBERTa_context_flz/en_engagement_spl_RoBERTa_context_flz-20221130_0.1.4',
|
62 |
+
# # 'en_engagement_spl_RoBERTa_cx_max1_do2-20221202_0.1.5/en_engagement_spl_RoBERTa_cx_max1_do2/en_engagement_spl_RoBERTa_cx_max1_do2-20221202_0.1.5',
|
63 |
+
# # 'en_engagement_spl_RoBERTa_context_flz-20221125_0.1.4/en_engagement_spl_RoBERTa_context_flz/en_engagement_spl_RoBERTa_context_flz-20221125_0.1.4',
|
64 |
+
# # 'en_engagement_RoBERTa_context_flz-20221125_0.1.4/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221125_0.1.4',
|
65 |
+
# # 'en_engagement_RoBERTa_context_flz-20221117_0.1.3/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221117_0.1.3',
|
66 |
+
# # 'en_engagement_spl_RoBERTa_acad_context_flz-20221117_0.1.3/en_engagement_spl_RoBERTa_acad_context_flz/en_engagement_spl_RoBERTa_acad_context_flz-20221117_0.1.3',
|
67 |
+
# # 'en_engagement_RoBERTa_context_flz-Batch2_0.1.1/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-Batch2_0.1.1',
|
68 |
+
# # 'en_engagement_RoBERTa_context_flz-20221113_0.1.3/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221113_0.1.3',
|
69 |
+
# # 'en_engagement_RoBERTa_context_flz-20221113_0.1.1/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221113_0.1.1',
|
70 |
+
# # 'en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2',
|
71 |
+
# # 'en_engagement_RoBERTa_combined-Batch2Eng_0.2/en_engagement_RoBERTa_combined/en_engagement_RoBERTa_combined-Batch2Eng_0.2',
|
72 |
+
# # 'en_engagement_RoBERTa_acad-0.2.1/en_engagement_RoBERTa_acad/en_engagement_RoBERTa_acad-0.2.1',
|
73 |
+
# # # 'en_engagement_BERT-0.0.2/en_engagement_BERT/en_engagement_BERT-0.0.2',
|
74 |
+
# # # 'en_engagement_BERT_acad-0.0.2/en_engagement_BERT_acad/en_engagement_BERT_acad-0.0.2',
|
75 |
+
# # # 'en_engagement_RoBERTa_acad-0.0.2/en_engagement_RoBERTa_acad/en_engagement_RoBERTa_acad-0.0.2',
|
76 |
+
# # 'en_engagement_RoBERTa-0.0.1/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.1',
|
77 |
+
# # # ' en_engagement_RoBERTa_sent-0.0.1_null/en_engagement_RoBERTa_sent/en_engagement_RoBERTa_sent-0.0.1_null',
|
78 |
+
# # # 'en_engagement_RoBERTa_combined-0.0.1/en_engagement_RoBERTa_combined/en_engagement_RoBERTa_combined-0.0.1',
|
79 |
+
# # 'en_engagement_RoBERTa-ME_AtoE/en_engagement_RoBERTa/en_engagement_RoBERTa-ME_AtoE',
|
80 |
+
# # 'en_engagement_RoBERTa-AtoI_0.0.3/en_engagement_RoBERTa/en_engagement_RoBERTa-AtoI_0.0.3',
|
81 |
+
# # 'en_engagement_RoBERTa-AtoI_0.0.3/en_engagement_RoBERTa/en_engagement_RoBERTa-AtoI_0.0.2'
|
82 |
+
# ]
|
83 |
+
|
84 |
+
# Comparison toggle and one model selector per slot.
multicol = st.checkbox("Compare two models", value=False, key=None, help=None)

model1 = st.selectbox('Select model option 1', MODEL_LIST, index=0)
model2 = st.selectbox('Select model option 2', MODEL_LIST, index=1)

# Entries that contain a "/" are prefixed with "packages/"; entries without
# one are passed on unchanged.
model1 = f"packages/{model1}" if '/' in model1 else model1
model2 = f"packages/{model2}" if '/' in model2 else model2
|
94 |
+
|
95 |
+
|
96 |
+
@st.cache(allow_output_mutation=True)
def load_model(spacy_model):
    """Load a spaCy pipeline and make sure it has a ``sentencizer`` component.

    The result is cached by Streamlit (``allow_output_mutation=True``), so
    reruns with the same ``spacy_model`` argument reuse the loaded pipeline.

    Args:
        spacy_model: Value handed straight to ``spacy.load`` (a package name
            or a path).

    Returns:
        The loaded pipeline object.
    """
    nlp = spacy.load(spacy_model)
    # add_pipe raises ValueError when a component with the same name is
    # already in the pipeline, so only append the sentencizer if it is missing.
    if 'sentencizer' not in nlp.pipe_names:
        nlp.add_pipe('sentencizer')
    return nlp
|
102 |
+
|
103 |
+
# source = spacy.blank("en")
|
104 |
+
# Load the first model always, the second one only in comparison mode.
nlp = load_model(model1)
if multicol:
    nlp2 = load_model(model2)

text = st.selectbox('select sent to debug', TEXT_LIST_BAWE)
input_text = st.text_area("", height=200)

# Dependency parsing
st.header("Text", "text")

# The text area is used only when its content splits into more than one
# space-separated piece; otherwise the selectbox entry is analysed.
has_typed_text = len(input_text.split(" ")) > 1
source_text = input_text if has_typed_text else text

doc = nlp(preprocess(source_text))
if multicol:
    doc2 = nlp2(preprocess(source_text))

# Post-processing switches, both enabled by default.
clearjustify = st.checkbox(
    "Clear problematic JUSTIFYING spans", value=True, key=None, help=None)
delete_overlaps = st.checkbox(
    "Delete overlaps", value=True, key=None, help=None)
|
132 |
+
|
133 |
+
# combine = st.checkbox(
|
134 |
+
# "Combine", value=False, key=None, help=None)
|
135 |
+
|
136 |
+
# import copy
|
137 |
+
# def combine_spangroups(doc1, doc2):
|
138 |
+
# # new_doc = Doc.from_docs([doc1, doc2], ensure_whitespace=True)
|
139 |
+
# new_doc = copy.deepcopy(doc1)
|
140 |
+
# # type()
|
141 |
+
# new_doc.spans['sc'].extend(doc2.spans['sc'])
|
142 |
+
|
143 |
+
# return new_doc
|
144 |
+
|
145 |
+
|
146 |
+
# if combine:
|
147 |
+
# new_doc = combine_spangroups(doc, doc2)
|
148 |
+
# visualize_spans(new_doc,
|
149 |
+
# spans_key="sc",
|
150 |
+
# title='Combined spans:',
|
151 |
+
# displacy_options={
|
152 |
+
# 'template': {
|
153 |
+
# "span": TPL_SPAN,
|
154 |
+
# 'slice': TPL_SPAN_SLICE,
|
155 |
+
# 'start': TPL_SPAN_START,
|
156 |
+
# },
|
157 |
+
# "colors": COLORS_1,
|
158 |
+
# },
|
159 |
+
# simple=False)
|
160 |
+
|
161 |
+
def _render_spans(target_doc, title):
    """Show the "sc" span group of ``target_doc`` under the heading ``title``.

    A fresh options dict is built on every call so the two columns in
    comparison mode never share a mutable object.
    """
    visualize_spans(
        target_doc,
        spans_key="sc",
        title=title,
        displacy_options={
            'template': {
                "span": TPL_SPAN,
                'slice': TPL_SPAN_SLICE,
                'start': TPL_SPAN_START,
            },
            "colors": COLORS_1,
        },
        simple=False)


# Apply the optional post-processing steps. In comparison mode both documents
# get the same treatment; previously cleanup_justify was applied to the first
# document only, while overlap deletion already covered both.
if clearjustify:
    cleanup_justify(doc, doc.spans['sc'])
    if multicol:
        cleanup_justify(doc2, doc2.spans['sc'])

if delete_overlaps:
    delete_overlapping_span(doc.spans['sc'])
    if multicol:
        delete_overlapping_span(doc2.spans['sc'])

if not multicol:
    _render_spans(doc, 'Engagement Span Anotations 1')
else:
    # Side-by-side view: model 1 on the left, model 2 on the right.
    col1, col2 = st.columns(2)

    with col1:
        _render_spans(doc, 'Engagement Span Anotations 1')

    with col2:
        _render_spans(doc2, 'Engagement Span Anotations 2')

# The dependency parse is drawn for the first model's document only.
dep_options = {"fine_grained": True, "distance": 120}
visualize_parser(doc, displacy_options=dep_options)
|
pipeline/__pycache__/custom_functions.cpython-39.pyc
ADDED
Binary file (3.61 kB). View file
|
|
pipeline/__pycache__/post_processors.cpython-310.pyc
ADDED
Binary file (20.1 kB). View file
|
|
pipeline/__pycache__/post_processors.cpython-39.pyc
ADDED
Binary file (21.4 kB). View file
|
|
pipeline/custom_functions.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from functools import partial
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Iterable, Callable
|
4 |
+
import spacy
|
5 |
+
from spacy.training import Example
|
6 |
+
from spacy.tokens import DocBin, Doc
|
7 |
+
|
8 |
+
# make the factory work
|
9 |
+
# from scripts.rel_pipe import make_relation_extractor
|
10 |
+
|
11 |
+
# make the config work
|
12 |
+
# from scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors
|
13 |
+
# from scripts.custom_comps.SpanCat_extention import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3, build_mean_max_reducer4
|
14 |
+
|
15 |
+
from typing import List, Tuple, cast
|
16 |
+
from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
|
17 |
+
from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init, PyTorchLSTM
|
18 |
+
from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
|
19 |
+
from thinc.types import Ragged, Floats2d
|
20 |
+
|
21 |
+
from spacy.util import registry
|
22 |
+
from spacy.tokens import Doc
|
23 |
+
from spacy.ml.extract_spans import extract_spans
|
24 |
+
|
25 |
+
# @registry.layers("spacy.LinearLogistic.v1")
|
26 |
+
# def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
|
27 |
+
# """An output layer for multi-label classification. It uses a linear layer
|
28 |
+
# followed by a logistic activation.
|
29 |
+
# """
|
30 |
+
# return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
|
31 |
+
|
32 |
+
|
33 |
+
@registry.layers("mean_max_reducer.v1.5")
def build_mean_max_reducer1(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Build a span reducer with one hidden layer.

    Each sequence is pooled four ways (``reduce_last``, ``reduce_first``,
    ``reduce_mean``, ``reduce_max``), the results are concatenated, and the
    concatenation is fed through a single ``Maxout`` layer.

    hidden_size (int): Output width of the Maxout layer.
    dropout (float): Dropout passed to the Maxout layer.
    """
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    hidden = Maxout(nO=hidden_size, normalize=True, dropout=dropout)
    return chain(pooled, hidden)
|
48 |
+
|
49 |
+
|
50 |
+
@registry.layers("mean_max_reducer.v2")
def build_mean_max_reducer2(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Build a span reducer with two hidden layers of equal width.

    Each sequence is pooled four ways (``reduce_last``, ``reduce_first``,
    ``reduce_mean``, ``reduce_max``), the results are concatenated, and the
    concatenation is fed through two stacked ``Maxout`` layers.

    hidden_size (int): Output width of both Maxout layers.
    dropout (float): Dropout passed to both Maxout layers.
    """
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    first_hidden = Maxout(nO=hidden_size, normalize=True, dropout=dropout)
    second_hidden = Maxout(nO=hidden_size, normalize=True, dropout=dropout)
    return chain(pooled, first_hidden, second_hidden)
|
64 |
+
|
65 |
+
|
66 |
+
# @registry.layers("mean_max_reducer.v2")
|
67 |
+
# def build_mean_max_reducer2(hidden_size: int,
|
68 |
+
# depth: int) -> Model[Ragged, Floats2d]:
|
69 |
+
# """Reduce sequences by concatenating their mean and max pooled vectors,
|
70 |
+
# and then combine the concatenated vectors with a hidden layer.
|
71 |
+
# """
|
72 |
+
# return chain(
|
73 |
+
# concatenate(
|
74 |
+
# cast(Model[Ragged, Floats2d], reduce_last()),
|
75 |
+
# cast(Model[Ragged, Floats2d], reduce_first()),
|
76 |
+
# reduce_mean(),
|
77 |
+
# reduce_max(),
|
78 |
+
# ), Maxout(nO=hidden_size, normalize=True, dropout=0.0),
|
79 |
+
# PyTorchLSTM(nO=64, nI=hidden_size, bi=True, depth=depth, dropout=0.2))
|
80 |
+
|
81 |
+
|
82 |
+
@registry.layers("mean_max_reducer.v3")
def build_mean_max_reducer3(hidden_size: int,
                            maxout_pieces: int = 3,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Build a span reducer with three hidden layers.

    Each sequence is pooled four ways (``reduce_last``, ``reduce_first``,
    ``reduce_mean``, ``reduce_max``) and the results are concatenated. The
    concatenation then passes through three ``Maxout`` layers whose output
    widths are ``hidden_size``, ``int(hidden_size / 2)`` and
    ``int(hidden_size / 2)``.

    hidden_size (int): Output width of the first Maxout layer.
    maxout_pieces (int): ``nP`` passed to every Maxout layer.
    dropout (float): Dropout passed to every Maxout layer.
    """
    half_size = int(hidden_size / 2)
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    # Widths of the stacked hidden layers, in application order.
    hidden_layers = [
        Maxout(nO=width, nP=maxout_pieces, normalize=True, dropout=dropout)
        for width in (hidden_size, half_size, half_size)
    ]
    return chain(pooled, *hidden_layers)
|
110 |
+
|
111 |
+
|
112 |
+
@registry.layers("mean_max_reducer.v3.3")
def build_mean_max_reducer4(hidden_size: int,
                            depth: int) -> Model[Ragged, Floats2d]:
    """Build a span reducer with three hidden layers and fixed settings.

    Each sequence is pooled four ways (``reduce_last``, ``reduce_first``,
    ``reduce_mean``, ``reduce_max``) and the results are concatenated. The
    concatenation then passes through three ``Maxout`` layers (``nP=3``,
    ``dropout=0.0``) whose output widths are ``hidden_size``,
    ``int(hidden_size / 2)`` and ``int(hidden_size / 2)``.

    hidden_size (int): Output width of the first Maxout layer.
    depth (int): Accepted as part of the signature but not used by the body.
    """
    half_size = int(hidden_size / 2)
    pooled = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    # Widths of the stacked hidden layers, in application order.
    hidden_layers = [
        Maxout(nO=width, nP=3, normalize=True, dropout=0.0)
        for width in (hidden_size, half_size, half_size)
    ]
    return chain(pooled, *hidden_layers)
|
129 |
+
|
130 |
+
|
131 |
+
@registry.architectures("CustomSpanCategorizer.v2")
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Assemble a span categorizer from three sub-models.

    The input is a ``(docs, spans)`` pair. ``tok2vec`` followed by
    ``list2ragged`` is applied to item 0 of that pair, ``extract_spans`` is
    applied to the result, and the output goes through ``reducer`` and then
    ``scorer``. The three sub-models are also registered on the returned
    model as the refs "tok2vec", "reducer" and "scorer".

    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    """
    to_ragged = cast(Model[List[Floats2d], Ragged], list2ragged())
    embed_docs = cast(
        Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
        with_getitem(0, chain(tok2vec, to_ragged)),
    )
    model = chain(embed_docs, extract_spans(), reducer, scorer)

    # Expose the sub-models under stable reference names.
    for ref_name, layer in (("tok2vec", tok2vec),
                            ("reducer", reducer),
                            ("scorer", scorer)):
        model.set_ref(ref_name, layer)
    return model
|
160 |
+
|
161 |
+
|
162 |
+
# @registry.architectures("spacy.SpanCategorizer.v1")
|
163 |
+
# def build_spancat_model(
|
164 |
+
# tok2vec: Model[List[Doc], List[Floats2d]],
|
165 |
+
# reducer: Model[Ragged, Floats2d],
|
166 |
+
# scorer: Model[Floats2d, Floats2d],
|
167 |
+
# ) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
|
168 |
+
# """Build a span categorizer model, given a token-to-vector model, a
|
169 |
+
# reducer model to map the sequence of vectors for each span down to a single
|
170 |
+
# vector, and a scorer model to map the vectors to probabilities.
|
171 |
+
# tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
|
172 |
+
# reducer (Model[Ragged, Floats2d]): The reducer model.
|
173 |
+
# scorer (Model[Floats2d, Floats2d]): The scorer model.
|
174 |
+
# """
|
175 |
+
# model = chain(
|
176 |
+
# cast(
|
177 |
+
# Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
|
178 |
+
# with_getitem(
|
179 |
+
# 0,
|
180 |
+
# chain(tok2vec,
|
181 |
+
# cast(Model[List[Floats2d], Ragged], list2ragged()))),
|
182 |
+
# ),
|
183 |
+
# extract_spans(),
|
184 |
+
# reducer,
|
185 |
+
# scorer,
|
186 |
+
# )
|
187 |
+
# model.set_ref("tok2vec", tok2vec)
|
188 |
+
# model.set_ref("reducer", reducer)
|
189 |
+
# model.set_ref("scorer", scorer)
|
190 |
+
# return model
|
pipeline/post_processors.py
ADDED
@@ -0,0 +1,889 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
|
3 |
+
import pandas as pd
|
4 |
+
import spacy
|
5 |
+
from spacy.language import Language
|
6 |
+
from skbio import diversity as dv
|
7 |
+
|
8 |
+
SPAN_ATTRS = ["text", "label_", "start", "end"]
|
9 |
+
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
|
10 |
+
|
11 |
+
|
12 |
+
def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                 spans_key: str = "sc",
                 attrs: List[str] = SPAN_ATTRS):
    """Turn one span group of ``doc`` into table rows plus column names.

    Args:
        doc: Object whose ``spans[spans_key]`` group is read, together with
            that group's ``attrs['scores']``.
        spans_key: Key of the span group to tabulate.
        attrs: Span attribute names to include, one column each.

    Returns:
        ``(data, columns)``. ``columns`` is ``attrs`` followed by
        "Conf. score". ``data`` has one row per span: the stringified
        attribute values followed by the score paired with that span.
    """
    columns = attrs + ["Conf. score"]

    span_group = doc.spans[spans_key]
    data = []
    # zip stops at the shorter of the spans and the scores.
    for span, score in zip(span_group, span_group.attrs['scores']):
        row = [str(getattr(span, attr)) for attr in attrs]
        row.append(score)  # [f'{score:.5f}']
        data.append(row)
    return data, columns
|
22 |
+
|
23 |
+
|
24 |
+
# def span_info_aggregator()
|
25 |
+
|
26 |
+
def construction_classifier(doc, span):
|
27 |
+
category = None
|
28 |
+
spanroot = span.root
|
29 |
+
|
30 |
+
## Grabbing lexico-grammatical information
|
31 |
+
span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
|
32 |
+
span_dep = [t.dep_ for t in span]
|
33 |
+
span_token = [t.norm_ for t in span]
|
34 |
+
span_tag = [t.tag_ for t in span]
|
35 |
+
|
36 |
+
|
37 |
+
c = [c for c in spanroot.children]
|
38 |
+
c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
|
39 |
+
|
40 |
+
c_norm = [c.norm_ for c in spanroot.children]
|
41 |
+
c_dep = [c.dep_ for c in spanroot.children]
|
42 |
+
c_pos = [c.pos_ for c in spanroot.children]
|
43 |
+
c_tag = [c.tag_ for c in spanroot.children]
|
44 |
+
|
45 |
+
right_dep = [c.dep_ for c in spanroot.rights]
|
46 |
+
|
47 |
+
#conditionals
|
48 |
+
subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
|
49 |
+
argmentless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in spanroot.children)
|
50 |
+
argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
|
51 |
+
|
52 |
+
## nesting classifiers
|
53 |
+
if spanroot.dep_ == "conj":
|
54 |
+
while spanroot.dep_ == 'conj':
|
55 |
+
spanroot = spanroot.head
|
56 |
+
# if spanroot.dep_ == "poss":
|
57 |
+
# while spanroot.dep_ == 'poss':
|
58 |
+
# spanroot = spanroot.head
|
59 |
+
|
60 |
+
## Conjunctions
|
61 |
+
# Preconjunctions
|
62 |
+
if spanroot.dep_ in ['preconj', 'cc']:
|
63 |
+
category = "Conjunction"
|
64 |
+
|
65 |
+
## NOUN PHRASES
|
66 |
+
# adjectival modifiers
|
67 |
+
if spanroot.dep_ in ['amod']:
|
68 |
+
category = "Adjectival modifier"
|
69 |
+
# compound nouns
|
70 |
+
if spanroot.dep_ in ['compound']:
|
71 |
+
category = "Compound noun"
|
72 |
+
|
73 |
+
## Nominal category
|
74 |
+
if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
|
75 |
+
if "acl" in c_dep:
|
76 |
+
category = "Noun + Complement (Object)"
|
77 |
+
else:
|
78 |
+
category = "Object"
|
79 |
+
|
80 |
+
if spanroot.dep_ in ["nsubj", "nsubjpass"]:
|
81 |
+
if "acl" in c_dep:
|
82 |
+
category = "Noun + Complement (Subject)"
|
83 |
+
else:
|
84 |
+
category = "Subject"
|
85 |
+
|
86 |
+
## ADJUNCTS
|
87 |
+
# prep phrases
|
88 |
+
if spanroot.dep_ in ['prep', 'agent']:
|
89 |
+
category = 'Prepositional phrase'
|
90 |
+
# adverbial phrases
|
91 |
+
if spanroot.dep_ in ['advmod', "npadvmod", "nmod", "npmod", 'quantmod']:
|
92 |
+
category = "Adverbial phrase"
|
93 |
+
|
94 |
+
## Predication patterns
|
95 |
+
if spanroot.dep_ in ['acomp', 'oprd']:
|
96 |
+
if "xcomp" in c_dep:
|
97 |
+
category = "Subject predicate to-cl"
|
98 |
+
else:
|
99 |
+
category = "Adjectival complement"
|
100 |
+
|
101 |
+
if spanroot.dep_ in ['attr']:
|
102 |
+
subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
|
103 |
+
|
104 |
+
c_head = [c.dep_ for c in spanroot.head.children]
|
105 |
+
if "expl" in c_head and "no_det" in span_t_dep_:
|
106 |
+
category = "There is/are no NOUN"
|
107 |
+
elif "expl" in c_head and spanroot.pos_ in ["NOUN"]:
|
108 |
+
category = "There is/are + Noun complement"
|
109 |
+
elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
|
110 |
+
category = "There is/are + Noun complement"
|
111 |
+
|
112 |
+
elif spanroot.pos_ in ["NOUN", "PRON"]:
|
113 |
+
if "acl" in c_dep:
|
114 |
+
category = "Noun + Complement (attr)"
|
115 |
+
else:
|
116 |
+
category = "Nominal complement"
|
117 |
+
|
118 |
+
elif not subjless and spanroot.pos_ in ['VERB', "AUX"]:
|
119 |
+
category = "Main verb 4"
|
120 |
+
|
121 |
+
elif spanroot.tag_ in ['NNP']:
|
122 |
+
category = "Nominal complement"
|
123 |
+
|
124 |
+
|
125 |
+
####################################
|
126 |
+
### clausal ####
|
127 |
+
####################################
|
128 |
+
if spanroot.dep_ in ["ROOT", "advcl", "ccomp", 'acl', 'pcomp', 'relcl']:
|
129 |
+
|
130 |
+
_check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
131 |
+
_check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
132 |
+
root_before_ccomp = [c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"]
|
133 |
+
|
134 |
+
_check_for_to = ["_".join([c.norm_, c.dep_]) for c in spanroot.subtree if c.head.dep_ == "advcl" and (c.dep_=="mark" or c.dep_ == "aux")]
|
135 |
+
entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
|
136 |
+
|
137 |
+
## Start with broad category, which is then re-evaluated for specific constructions.
|
138 |
+
if spanroot.dep_ in ['advcl', 'mark', 'acl', 'pcomp']:
|
139 |
+
## Adverbial clauses
|
140 |
+
### Finite-adverbial clauses
|
141 |
+
### Non-finite adverbial clauses
|
142 |
+
subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
|
143 |
+
entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
|
144 |
+
|
145 |
+
if "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
|
146 |
+
category = "Finite adverbial clause"
|
147 |
+
elif "mark" in span_dep and "aux" in span_dep :
|
148 |
+
category = "Finite adverbial clause"
|
149 |
+
|
150 |
+
elif "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"] and "expl" in c_dep:
|
151 |
+
category = "Finite adverbial clause"
|
152 |
+
|
153 |
+
elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
|
154 |
+
if spanroot.pos_ in ['VERB', "AUX"]:
|
155 |
+
category = "Finite adverbial clause"
|
156 |
+
|
157 |
+
elif spanroot.pos_ not in ['VERB', "AUX"] and subjless:
|
158 |
+
category = "Non-finite adv clause 1"
|
159 |
+
|
160 |
+
elif entire_cl:
|
161 |
+
category = "Finite adverbial clause"
|
162 |
+
|
163 |
+
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
|
164 |
+
# he doing his job
|
165 |
+
if argmentless:
|
166 |
+
#e.g., frankly speaking, strictly speaking
|
167 |
+
category = "Adverbial Phrase"
|
168 |
+
else:
|
169 |
+
category = "Non-finite adv clause 2"
|
170 |
+
|
171 |
+
elif spanroot.pos_ not in ['VERB', "AUX"] and "mark" in span_dep and subjless:
|
172 |
+
|
173 |
+
category = "Non-finite adv clause 3"
|
174 |
+
|
175 |
+
elif "aux" in c_dep and "TO" in c_tag:
|
176 |
+
category = "Adverbial Phrase"
|
177 |
+
|
178 |
+
|
179 |
+
elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
|
180 |
+
category = "Dependent Verb phrase"
|
181 |
+
|
182 |
+
elif not argmentless:
|
183 |
+
category = "Adverbial clause"
|
184 |
+
|
185 |
+
elif spanroot.dep_ == "advcl":
|
186 |
+
category = "Adverbial phrase"
|
187 |
+
|
188 |
+
|
189 |
+
if spanroot.dep_ in ['relcl', 'ccomp', 'acl']:
|
190 |
+
|
191 |
+
head = spanroot.head
|
192 |
+
if ";" in [t.norm_ for t in head.children]:
|
193 |
+
category = "Main verb 3"
|
194 |
+
elif "nsubj" not in span_dep:
|
195 |
+
category = "Dependent verb 1"
|
196 |
+
elif "mark" in span_dep:
|
197 |
+
category = "Complement clause"
|
198 |
+
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
|
199 |
+
category = "Non-finite complement clause"
|
200 |
+
elif spanroot.dep_ in ['relcl']:
|
201 |
+
category = "Relative clause"
|
202 |
+
elif spanroot.dep_ in ['ccomp']:
|
203 |
+
category = "Complement clause"
|
204 |
+
elif spanroot.dep_ in ['acl']:
|
205 |
+
category = "Noun Complement clause"
|
206 |
+
else:
|
207 |
+
# print(_check_for_to)
|
208 |
+
category = "this one"
|
209 |
+
|
210 |
+
## Specific constructions
|
211 |
+
# Extraposed that-clause or to-infinitives
|
212 |
+
if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in ["VERB", "AUX"]:
|
213 |
+
print(c_dep)
|
214 |
+
if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
|
215 |
+
#eg it seems odd (oprd) that X.
|
216 |
+
#eg it is certain (acomp) that X.
|
217 |
+
category = "Extraposed that-cl (adj-complement)" #e.g., it is certain that X.
|
218 |
+
|
219 |
+
elif "xcomp" in c_dep or ("advcl" in c_dep):
|
220 |
+
if "for_mark" in _check_for_to:
|
221 |
+
category = "Extraposed to-cl (explicit subj)" #eg It is possible to .
|
222 |
+
elif _check_to:
|
223 |
+
category = "Extraposed to-cl 1" #eg It is possible to .
|
224 |
+
elif _check_ing:
|
225 |
+
category = "Extraposed -ing 1" #eg It is possible to .
|
226 |
+
elif ("prep" in right_dep or "npadvmod" in right_dep) and "ccomp" in right_dep and spanroot.lemma_ == "be":
|
227 |
+
category = "Cleft construction"
|
228 |
+
|
229 |
+
elif "attr" in c_dep:
|
230 |
+
category = "Extraposed that-cl (copula)" #eg It is a wonder that X.
|
231 |
+
|
232 |
+
else:
|
233 |
+
category = "Extraposed that-cl (VERB)"
|
234 |
+
|
235 |
+
# if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
|
236 |
+
# category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
|
237 |
+
elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "acomp" in c_dep:
|
238 |
+
if "xcomp" in c_dep:
|
239 |
+
if _check_to:
|
240 |
+
category = "Extraposed to-cl 2" #eg it is difficult to decide.
|
241 |
+
elif _check_ing:
|
242 |
+
category = "Extraposed -ing 2" #eg it is difficult to decide.
|
243 |
+
|
244 |
+
else:
|
245 |
+
category = "Extraposed that-cl (adj-complement) 2"
|
246 |
+
|
247 |
+
elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
|
248 |
+
|
249 |
+
category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
|
250 |
+
|
251 |
+
|
252 |
+
# something without dummy subject "it"
|
253 |
+
elif (("nsubj" in c_dep and spanroot.lemma_ in ['be']) or "nsubjpass" in c_dep) and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm:
|
254 |
+
|
255 |
+
# store xcomp, if the head of the xcomp is acomp
|
256 |
+
_check_xcomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"]
|
257 |
+
_check_ccomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"]
|
258 |
+
# _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
259 |
+
# _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
260 |
+
|
261 |
+
|
262 |
+
if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
|
263 |
+
if any(root_before_ccomp):
|
264 |
+
category = "Post-predicate that-cl"
|
265 |
+
else:
|
266 |
+
category = "Comment clause"
|
267 |
+
|
268 |
+
elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp:
|
269 |
+
category = "Post-predicate that-cl 2"
|
270 |
+
|
271 |
+
elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
|
272 |
+
category = "Post-predicate to-cl"
|
273 |
+
|
274 |
+
elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_to:
|
275 |
+
category = "Subject predicate to-cl"
|
276 |
+
|
277 |
+
elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
|
278 |
+
category = "Subject predicate to-cl (passive)"
|
279 |
+
|
280 |
+
elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_ing:
|
281 |
+
category = "Subject predicate -ing"
|
282 |
+
elif "ccomp" in c_dep:
|
283 |
+
category = "Subject predicate that-cl"
|
284 |
+
elif "acomp" in c_dep:
|
285 |
+
category = "Adjectival predicate"
|
286 |
+
|
287 |
+
elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
|
288 |
+
category = "Finite-adverbial clause"
|
289 |
+
else:
|
290 |
+
category = "Main verb 1"
|
291 |
+
|
292 |
+
## without dummy subject it, and lexical verbs
|
293 |
+
elif ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm and spanroot.lemma_ not in ['be']:
|
294 |
+
_check_wh = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["attr", "advmod", 'dobj', 'nsubj'] and c.tag_ in ["WP", "WRB", "WDT", "WP$"]) and c.head.dep_ == "ccomp"]
|
295 |
+
_check_if = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) and c.head.dep_ == "ccomp"]
|
296 |
+
|
297 |
+
# _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
|
298 |
+
# _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
|
299 |
+
|
300 |
+
if "ccomp" in c_dep and (_check_wh or _check_if):
|
301 |
+
category = "Post-predicate wh-cl"
|
302 |
+
|
303 |
+
elif "ccomp" in c_dep:
|
304 |
+
if any(root_before_ccomp):
|
305 |
+
category = "Post-predicate that-cl"
|
306 |
+
else:
|
307 |
+
category = "Comment clause"
|
308 |
+
|
309 |
+
elif "xcomp" in c_dep:
|
310 |
+
if _check_to:
|
311 |
+
category = "Post-predicate to-cl"
|
312 |
+
elif _check_ing:
|
313 |
+
category = "Post-predicate -ing"
|
314 |
+
|
315 |
+
# Existential
|
316 |
+
elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
|
317 |
+
category = "There is/are NOUN"
|
318 |
+
|
319 |
+
elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
|
320 |
+
category = "Cleft construction"
|
321 |
+
|
322 |
+
|
323 |
+
if spanroot.dep_ in ['parataxis']:
|
324 |
+
if "_".join(span_dep) in ["nsubj_parataxis", "aux_parataxis", "nsubj_aux_parataxis"]:
|
325 |
+
category = "Comment clause"
|
326 |
+
else:
|
327 |
+
category = "parataxis (for now)"
|
328 |
+
|
329 |
+
|
330 |
+
## External comp
|
331 |
+
if spanroot.dep_ in ['xcomp']:
|
332 |
+
if spanroot.head.pos_ == 'ADJ' and "to_aux" in c_t_dep_:
|
333 |
+
category = "Adjective complement to-cl"
|
334 |
+
if spanroot.head.pos_ == 'VERB' and "to_aux" in c_t_dep_:
|
335 |
+
category = "Verb complement to-cl"
|
336 |
+
|
337 |
+
if spanroot.dep_ in ['pcomp']:
|
338 |
+
if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] and 'ccomp' in c_dep:
|
339 |
+
category = "Participle + that-cl"
|
340 |
+
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
|
341 |
+
category = "Participle"
|
342 |
+
|
343 |
+
## Simple classifier
|
344 |
+
# if spanroot.dep_ in ['pcomp']:
|
345 |
+
# if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
|
346 |
+
# category = "Gerund"
|
347 |
+
|
348 |
+
if spanroot.dep_ in ['neg']:
|
349 |
+
category = "Negative particle"
|
350 |
+
if spanroot.dep_ in ['aux', 'auxpass']:
|
351 |
+
category = "Auxiliary"
|
352 |
+
|
353 |
+
# Modal verbs
|
354 |
+
if spanroot.tag_ == "MD":
|
355 |
+
category = "Modal auxiliary"
|
356 |
+
|
357 |
+
|
358 |
+
if spanroot.dep_ in ['dep', "csubj", 'csubjpass']:
|
359 |
+
if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in ['AUX', 'VERB'] and spanroot.pos_ in ['AUX', 'VERB']:
|
360 |
+
if spanroot.morph == spanroot.head.morph:
|
361 |
+
category = "Main verb 4"
|
362 |
+
else:
|
363 |
+
category = "Dependent verb 2"
|
364 |
+
elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
|
365 |
+
category = "Gerund"
|
366 |
+
elif spanroot.head.dep_ in ['conj', 'acl','relcl']:
|
367 |
+
if spanroot.morph == spanroot.head.morph:
|
368 |
+
category = "Main verb 4"
|
369 |
+
else:
|
370 |
+
category = "Dependent verb 2"
|
371 |
+
elif "VerbForm=Fin" in str(spanroot.morph):
|
372 |
+
category = "Dependent verb 2"
|
373 |
+
|
374 |
+
# Appositive phrases
|
375 |
+
if spanroot.dep_ in ['appos']:
|
376 |
+
if "nummod" in c_dep:
|
377 |
+
category = "Apposition"
|
378 |
+
elif spanroot.pos_ in ["PROPN"]:
|
379 |
+
category = "Appositive Proper Nouns"
|
380 |
+
elif spanroot.pos_ in ["NOUN"]:
|
381 |
+
category = "Appositive Noun Phrase"
|
382 |
+
elif spanroot.pos_ in ["VERB", "AUX"]:
|
383 |
+
_check = any(c.dep_ in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
|
384 |
+
if _check:
|
385 |
+
category = "Appositive Finite-clause"
|
386 |
+
|
387 |
+
if spanroot.dep_ in ['appos', "dep", "attr"]:
|
388 |
+
if not subjless and spanroot.pos_ in ['VERB', "AUX"]:
|
389 |
+
category = "Main verb 5"
|
390 |
+
|
391 |
+
if spanroot.dep_ in ["dep", "mark"]:
|
392 |
+
if spanroot.tag_ in ["RB", "IN", "CC"]:
|
393 |
+
category = "Conjunction"
|
394 |
+
|
395 |
+
|
396 |
+
#sometimes the extra-clausal links are not accurate
|
397 |
+
if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp"]:
|
398 |
+
if spanroot.head.dep_ == "ROOT":
|
399 |
+
category = "Main verb"
|
400 |
+
else:
|
401 |
+
category = "dependent verb 5"
|
402 |
+
|
403 |
+
if span.label_ == "CITATION":
|
404 |
+
if "NNP" in span_tag or "NNPS" in span_tag:
|
405 |
+
if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
|
406 |
+
category = "Parenthetical Citation"
|
407 |
+
elif span_tag[0] in ["NNP", "NNPS"]:
|
408 |
+
category = "Narrative Citation"
|
409 |
+
else:
|
410 |
+
category = "Other Citation"
|
411 |
+
|
412 |
+
if category == None:
|
413 |
+
category = spanroot.dep_
|
414 |
+
|
415 |
+
return category
|
416 |
+
|
417 |
+
|
418 |
+
def construction_classifier2(doc, span):
    """Assign a grammatical-construction label to *span*.

    The label is derived from the span root's dependency relation, POS, tag,
    lemma and morphology, together with the dependency labels/tags of the
    span's tokens and of the root's children. Rules are evaluated top to
    bottom and a later rule overwrites the label set by an earlier one. When
    no rule fires, the root's dependency label is returned.

    Args:
        doc: Not read by this function; kept for the existing call signature.
        span: Span-like object; iterated for its tokens and read for
            ``root`` and ``label_``.

    Returns:
        The category label (a string).

    NOTE(review): the indentation of the text this block was recovered from
    was lost, so nesting was inferred from the logic. Please confirm:
      * "Non-finite adv clause 1" is read as the ``elif`` of the inner
        VERB/AUX test under the WRB/WDT branch;
      * the cleft / copula / "(VERB)" fallbacks are read as siblings of the
        ``xcomp``/``advcl`` branch, not nested inside it;
      * "Extraposed that-cl (adj-complement) 2" is read as the ``else`` of
        ``if "xcomp" in c_dep``;
      * the appositive PROPN/NOUN/VERB chain is read as a sibling of the
        ``nummod`` test (so it can overwrite "Apposition");
      * "Other Citation" is read as the ``else`` of the inner citation chain.
    """
    category = None
    spanroot = span.root

    subject_deps = ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']
    argument_deps = subject_deps + ["dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"]
    verbal_pos = ['VERB', "AUX"]
    pres_participle = "Aspect=Prog|Tense=Pres|VerbForm=Part"
    participle_morphs = [pres_participle, "Aspect=Perf|Tense=Past|VerbForm=Part"]

    ## Grabbing lexico-grammatical information
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    # Everything below describes the ORIGINAL span root: these lists are not
    # recomputed after the root is moved up a coordination chain further down.
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
    c_norm = [c.norm_ for c in spanroot.children]
    c_dep = [c.dep_ for c in spanroot.children]
    c_pos = [c.pos_ for c in spanroot.children]
    c_tag = [c.tag_ for c in spanroot.children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
    argmentless = all(c.dep_ not in argument_deps for c in spanroot.children)
    has_dummy_it = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_

    ## nesting classifiers
    # A conjunct is classified by the relation of the head of its coordination.
    while spanroot.dep_ == 'conj':
        spanroot = spanroot.head

    if spanroot.dep_ == "poss":
        head = spanroot.head
        if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
            category = "Posessive Noun (Object)"
        elif head.dep_ in ["nsubj", "nsubjpass"]:
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    ## Conjunctions
    # Preconjunctions
    if spanroot.dep_ in ['preconj', 'cc']:
        category = "Conjunction"

    ## NOUN PHRASES
    # adjectival modifiers
    if spanroot.dep_ in ['amod']:
        category = "Adjectival modifier"
    # compound nouns
    if spanroot.dep_ in ['compound']:
        category = "Compound noun"

    ## Nominal category
    if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"

    if spanroot.dep_ in ["nsubj", "nsubjpass"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    # prep phrases
    if spanroot.dep_ in ['prep', 'agent']:
        category = 'Prepositional phrase'

    # adverbial phrases
    if spanroot.dep_ in ['advmod', "npadvmod", "nmod", "npmod", 'quantmod', 'nummod']:
        category = "Adverbial phrase"

    ## Predication patterns
    if spanroot.dep_ in ['acomp', 'oprd']:
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if spanroot.dep_ in ['attr']:
        # Recomputed on the (possibly conj-resolved) root.
        subjless = all(c.dep_ not in subject_deps for c in spanroot.children)

        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif "expl" in c_head and spanroot.pos_ in ["NOUN"]:
            category = "There is/are + Noun complement"
        elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
            category = "There is/are + Noun complement"

        elif spanroot.pos_ in ["NOUN", "PRON"]:
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"

        elif not subjless and spanroot.pos_ in verbal_pos:
            category = "Main verb 4"

        elif spanroot.tag_ in ['NNP']:
            category = "Nominal complement"

    ## External comp
    if spanroot.dep_ in ['xcomp']:
        if spanroot.head.pos_ == 'ADJ' and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == 'VERB' and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if spanroot.dep_ in ['pcomp']:
        if str(spanroot.morph) == pres_participle and 'ccomp' in c_dep:
            category = "Participle + that-cl"
        elif str(spanroot.morph) == pres_participle:
            category = "Participle"

    ## Simple classifier
    if spanroot.dep_ in ['neg']:
        category = "Negative particle"
    if spanroot.dep_ in ['aux', 'auxpass']:
        category = "Auxiliary"

    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    ####################################
    ###          clausal            ####
    ####################################
    if spanroot.dep_ in ["ROOT", "advcl", "ccomp", 'acl', 'pcomp', 'relcl', 'punct']:

        # "to"-marked / progressive xcomp anywhere below the root.
        _check_to = [c.dep_ for c in spanroot.subtree
                     if (c.dep_ == "aux" and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
        _check_ing = [c.dep_ for c in spanroot.subtree
                      if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
        # One flag per ccomp child: True when that child follows the root.
        root_before_ccomp = [c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"]

        _check_for_to = ["_".join([c.norm_, c.dep_]) for c in spanroot.subtree
                         if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")]

        ## Start with broad category, which is then re-evaluated for specific constructions.
        if spanroot.dep_ in ['advcl', 'acl', 'punct', 'pcomp']:  # 'mark',
            ## Adverbial clauses
            subjless = all(c.dep_ not in subject_deps for c in spanroot.children)

            ### Finite-adverbial clauses
            # This test also covers "mark + aux" and "mark + verbal root + expl",
            # which therefore need no branches of their own.
            if "mark" in span_dep and (spanroot.pos_ in verbal_pos or "aux" in span_dep):
                category = "Finite adverbial clause"

            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if spanroot.pos_ in verbal_pos:
                    category = "Finite adverbial clause"

                elif spanroot.pos_ not in verbal_pos and subjless:
                    category = "Non-finite adv clause 1"

            elif not argmentless:
                category = "Finite adverbial clause"

            ## non-finite
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                # he doing his job
                if argmentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"

            elif spanroot.pos_ not in verbal_pos and "mark" in span_dep and subjless:
                category = "Non-finite adv clause 3"

            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"

            elif "mark" not in span_dep and spanroot.pos_ in verbal_pos:
                category = "Dependent Verb phrase"

            elif not argmentless:
                category = "Adverbial clause"

            elif spanroot.dep_ == "advcl":
                category = "Adverbial phrase"

            else:
                category = "Finite adverbial clause "

        if spanroot.dep_ in ['relcl', 'ccomp', 'acl', 'punct', "pcomp"]:

            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"

            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"

            elif "mark" in span_dep:
                category = "Complement clause"
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif spanroot.dep_ in ['relcl']:
                category = "Relative clause"
            elif spanroot.dep_ in ['ccomp']:
                category = "Complement clause"
            elif spanroot.dep_ in ['acl']:
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if has_dummy_it and spanroot.pos_ in verbal_pos:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = "Extraposed that-cl (adj-complement)"

            elif "xcomp" in c_dep or ("advcl" in c_dep):
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif ("prep" in right_dep or "npadvmod" in right_dep) and "ccomp" in right_dep and spanroot.lemma_ == "be":
                category = "Cleft construction"

            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.

            else:
                category = "Extraposed that-cl (VERB)"

        elif has_dummy_it and "acomp" in c_dep:
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"

            else:
                category = "Extraposed that-cl (adj-complement) 2"

        elif has_dummy_it and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.

        # something without dummy subject "it"
        elif (("nsubj" in c_dep and spanroot.lemma_ == 'be') or "nsubjpass" in c_dep) and spanroot.pos_ in verbal_pos and "it" not in c_norm:

            # store xcomp/ccomp whose head is an acomp
            _check_xcomp = [c.dep_ for c in spanroot.subtree if c.dep_ == "xcomp" and c.head.dep_ == "acomp"]
            _check_ccomp = [c.dep_ for c in spanroot.subtree if c.dep_ == "ccomp" and c.head.dep_ == "acomp"]

            if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"

            elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"

            elif "xcomp" in c_dep and spanroot.lemma_ == 'be' and _check_to:
                category = "Subject predicate to-cl"

            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"

            elif "xcomp" in c_dep and spanroot.lemma_ == 'be' and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"

            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            elif not argmentless and "SCONJ" in c_pos:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"

        ## without dummy subject it, and lexical verbs
        # The subject test is a plain boolean. It must not itself be tested
        # for membership in c_dep: a bool is never an element of a list of
        # dependency labels, which would make this branch unreachable.
        elif ("nsubj" in c_dep or "nsubjpass" in c_dep) and spanroot.pos_ in verbal_pos and "it" not in c_norm and spanroot.lemma_ != 'be':
            _check_wh = [c.dep_ for c in spanroot.subtree
                         if (c.dep_ in ["attr", "advmod", 'dobj', 'nsubj'] and c.tag_ in ["WP", "WRB", "WDT", "WP$"])
                         and c.head.dep_ == "ccomp"]
            _check_if = [c.dep_ for c in spanroot.subtree
                         if (c.dep_ == "mark" and c.norm_ in ["whether", "if"]) and c.head.dep_ == "ccomp"]

            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"

            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"

        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"

        elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
            category = "Cleft construction"

    ### The end of clausal analysis

    if spanroot.dep_ in ['parataxis']:
        if "_".join(span_dep) in ["nsubj_parataxis", "aux_parataxis", "nsubj_aux_parataxis"]:
            category = "Comment clause"
        else:
            category = "Parataxis"

    if spanroot.dep_ in ['dep', "csubj", 'csubjpass']:
        if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in verbal_pos and spanroot.pos_ in verbal_pos:
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == pres_participle:
            category = "Gerund"
        elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(spanroot.morph):
            category = "Dependent verb 2"
        elif spanroot.dep_ in ["csubj", 'csubjpass']:
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if spanroot.dep_ in ['appos']:
        if "nummod" in c_dep:
            category = "Apposition"
        if spanroot.pos_ in ["PROPN"]:
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ in ["NOUN"]:
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in verbal_pos:
            _check = any(c.dep_ in subject_deps for c in spanroot.children)
            if _check:
                category = "Appositive Finite-clause"

    if spanroot.dep_ in ['appos', "dep", "attr"]:
        if not subjless and spanroot.pos_ in verbal_pos:
            category = "Main verb (likely parsing error)"

    # sometimes the dep are on the conjunctions
    if spanroot.dep_ in ["dep", "mark"]:
        if spanroot.tag_ in ["RB", "IN", "CC"]:
            category = "Conjunction"

    if spanroot.dep_ in ["intj"]:
        category = "Introjection"

    # sometimes the extra-clausal links are not accurate
    if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp", "attr", 'dep', "meta", 'prt'] and category is None:
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
                category = "Parenthetical Citation"
            elif span_tag[0] in ["NNP", "NNPS"]:
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    # Fall back to the bare dependency label when no rule matched.
    if category is None:
        category = spanroot.dep_

    return category
|
828 |
+
|
829 |
+
|
830 |
+
|
831 |
+
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                spans_key: str = "sc",
                attrs: List[str] = SPAN_ATTRS):
    """Build one table row per span in ``doc.spans[spans_key]``.

    Args:
        doc: Object read for ``sents`` and ``spans``.
        spans_key: Key of the span group to tabulate.
        attrs: Span attribute names; each is read with ``getattr`` and
            stringified to form the leading columns.

    Returns:
        ``(data, columns)`` where ``columns`` is ``attrs`` followed by the
        fixed column names below and ``data`` is a list of rows in the same
        order. Note that the "POS" column is filled with the root's ``tag_``
        and "morphology" holds ``span.root.morph`` as-is (not stringified).
    """
    fixed_columns = ["Conf. score", "sent no.", "grammatical realization", 'span dep', "ner",
                     "POS", 'span dep seq', "TAG sequence", "POS sequence", "head", "head dep",
                     "children", "morphology", "sent"]
    columns = attrs + fixed_columns

    # Sentence -> running index, used for the "sent no." column.
    sent_number = {sent: idx for idx, sent in enumerate(doc.sents)}
    span_group = doc.spans[spans_key]

    def _row(span, score):
        root = span.root
        return [
            *[str(getattr(span, attr)) for attr in attrs],
            score,
            int(sent_number[span.sent]),
            construction_classifier2(doc, span),
            root.dep_,
            root.ent_type_,
            root.tag_,
            "_".join([t.dep_ for t in span]),
            "_".join([t.tag_ for t in span]),
            "_".join([t.pos_ for t in span]),
            root.head.norm_,
            root.head.dep_,
            "_".join([c.dep_ for c in root.children]),
            root.morph,
            span.sent.text.strip(),
        ]

    data = [_row(span, score)
            for span, score in zip(span_group, span_group.attrs['scores'])]

    return data, columns
|
863 |
+
|
864 |
+
|
865 |
+
def ngrammar(seq: list, n=2, concat = False, sep = "-"):
    """Return every contiguous n-gram of *seq*, in order of appearance.

    Args:
        seq: Sliceable sequence of items (strings when ``concat`` is true).
        n: Window length. Values below 1 yield an empty result.
        concat: When true, each window is joined into one string with ``sep``;
            otherwise each window is returned as a slice of ``seq``.
        sep: Separator used when ``concat`` is true.

    Returns:
        A list with ``len(seq) - n + 1`` windows, or an empty list when
        ``seq`` is shorter than ``n`` or ``n`` is not positive.
    """
    # A non-positive window length has no meaningful n-grams; without this
    # guard the slices below would produce empty or arbitrary windows.
    if n < 1:
        return []

    windows = (seq[start:start + n] for start in range(len(seq) - n + 1))
    if concat:
        return [sep.join(window) for window in windows]
    return list(windows)
|
875 |
+
|
876 |
+
|
877 |
+
def diversity_values(count_vec: list):
    """Compute a fixed set of alpha-diversity indices for a count vector.

    ``dv`` is a module-level name defined outside this block (presumably
    scikit-bio's diversity package -- confirm against the file's imports).

    Args:
        count_vec: Sequence of counts. An empty sequence is replaced by ten
            zero counts before the indices are computed.

    Returns:
        Dict with keys ``shannon`` (base 2), ``brillouin_d``, ``simpson_d``
        (one minus ``dv.alpha.simpson``) and ``simpson_e``.
    """
    counts = [0] * 10 if len(count_vec) == 0 else list(count_vec)

    # Not computed at present: gini_index, faith_pd.
    return {
        'shannon': dv.alpha.shannon(counts, base=2),
        'brillouin_d': dv.alpha.brillouin_d(counts),
        "simpson_d": 1 - dv.alpha.simpson(counts),
        'simpson_e': dv.alpha.simpson_e(counts),
    }
|
requirements.txt
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pip
|
2 |
+
spacy-streamlit #==1.0.4
|
3 |
+
spacy>3.4.4
|
4 |
+
# spacy-experimental==0.6.1
|
5 |
+
# spacy-huggingface-hub==0.0.8
|
6 |
+
# spacy-transformers==1.1.8
|
7 |
+
# srsly==2.4.5
|
8 |
+
scikit-bio==0.5.8
|
9 |
+
# pip==23.3.1
|
10 |
+
# setuptools
|
11 |
+
# pydantic==1.* #necessary for spacy 3.4.4?
|
12 |
+
# altair<5
|
13 |
+
# streamlit
|
14 |
+
typing_extensions<4.6.0
|
15 |
+
|
16 |
+
|
17 |
+
# https://huggingface.co/egumasa/en_engagement_RoBERTa_combined/resolve/main/en_engagement_RoBERTa_combined-any-py3-none-any.whl
|
18 |
+
# https://huggingface.co/egumasa/en_engagement_RoBERTa_context_flz/resolve/main/en_engagement_RoBERTa_context_flz-any-py3-none-any.whl
|
19 |
+
# https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad_max1_do02/resolve/main/en_engagement_spl_RoBERTa_acad_max1_do02-any-py3-none-any.whl
|
20 |
+
# https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad/resolve/main/en_engagement_spl_RoBERTa_acad-any-py3-none-any.whl
|
21 |
+
# https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad2/resolve/main/en_engagement_spl_RoBERTa_acad2-any-py3-none-any.whl
|
22 |
+
# https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad2/resolve/main/en_engagement_spl_RoBERTa_acad2-any-py3-none-any.whl
|
23 |
+
# https://huggingface.co/egumasa/en_engagement_LSTM/resolve/main/en_engagement_LSTM-any-py3-none-any.whl
|
24 |
+
https://huggingface.co/egumasa/en_engagement_LSTM/resolve/main/en_engagement_LSTM-any-py3-none-any.whl #This is the best in 2023
|
25 |
+
# https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_base_attention/resolve/main/en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl
|
resources/__pycache__/colors.cpython-39.pyc
ADDED
Binary file (447 Bytes). View file
|
|
resources/__pycache__/template_list.cpython-39.pyc
ADDED
Binary file (2.35 kB). View file
|
|
resources/__pycache__/text_list.cpython-39.pyc
ADDED
Binary file (122 kB). View file
|
|
resources/__pycache__/text_list_BAWE.cpython-39.pyc
ADDED
Binary file (111 kB). View file
|
|
resources/colors.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
COLORS_1 = {
|
2 |
+
"ENTERTAIN": "#82b74b",
|
3 |
+
"DENY": '#c94c4c',
|
4 |
+
"COUNTER": "#eea29a",
|
5 |
+
"PRONOUNCE": "#92a8d1",
|
6 |
+
"ENDORSE": "#034f84",
|
7 |
+
"CITATION": "#b2b2b2",
|
8 |
+
# "MONOGLOSS": "#3e4444",
|
9 |
+
"ATTRIBUTE": "#f7786b",
|
10 |
+
"ATTRIBUTION": "#f7786b",
|
11 |
+
"PROCLAIM": "#92a8d1",
|
12 |
+
"ENDOPHORIC": "#FAD7A0",
|
13 |
+
"SOURCES": "#F9E79F"
|
14 |
+
|
15 |
+
}
|
16 |
+
|
resources/template_list.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TPL_ENT = """
|
2 |
+
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
|
3 |
+
{text}
|
4 |
+
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}</span>
|
5 |
+
</mark>
|
6 |
+
"""
|
7 |
+
|
8 |
+
TPL_SPANS = """
|
9 |
+
<div class="spans" style="line-height: 4.5;">
|
10 |
+
{text}
|
11 |
+
{span_slices}
|
12 |
+
{span_starts}
|
13 |
+
</div>
|
14 |
+
"""
|
15 |
+
|
16 |
+
TPL_SPAN = """
|
17 |
+
<span style="font-weight: bold; display: inline-block; line-height: 3; padding-bottom: 12px;position: relative;">
|
18 |
+
{text}
|
19 |
+
{span_slices}
|
20 |
+
{span_starts}
|
21 |
+
</span>
|
22 |
+
"""
|
23 |
+
|
24 |
+
TPL_SPAN_SLICE = """
|
25 |
+
<span style="background: {bg}; top: {top_offset}px; display: inline-block; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
26 |
+
</span>
|
27 |
+
"""
|
28 |
+
|
29 |
+
TPL_SPAN_START = """
|
30 |
+
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
31 |
+
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
32 |
+
|
33 |
+
{label}{kb_link}
|
34 |
+
</span>
|
35 |
+
</span>
|
36 |
+
|
37 |
+
"""
|
38 |
+
|
39 |
+
TPL_SPAN_START_RTL = """
|
40 |
+
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
|
41 |
+
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
|
42 |
+
{label}{kb_link}
|
43 |
+
</span>
|
44 |
+
</span>
|
45 |
+
"""
|
46 |
+
|
47 |
+
DEFAULT_TEXT = """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia."""
|
48 |
+
|
resources/text_list.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
resources/text_list_BAWE.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
utils/__pycache__/util.cpython-39.pyc
ADDED
Binary file (2.93 kB). View file
|
|
utils/__pycache__/utility.cpython-310.pyc
ADDED
Binary file (2.91 kB). View file
|
|
utils/__pycache__/utility.cpython-39.pyc
ADDED
Binary file (2.93 kB). View file
|
|
utils/__pycache__/visualize.cpython-310.pyc
ADDED
Binary file (4.22 kB). View file
|
|
utils/__pycache__/visualize.cpython-39.pyc
ADDED
Binary file (4.18 kB). View file
|
|
utils/utility.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from collections import Counter
|
3 |
+
from spacy.tokens import SpanGroup
|
4 |
+
|
5 |
+
|
6 |
+
def preprocess(text):
    """Collapse whitespace in ``text`` while keeping paragraph breaks.

    Steps, in order:

    1. Every ``--- Para SEP ---`` marker is replaced by a single newline.
    2. The text is split on blank lines (``"\\n\\n"``); these are the
       paragraph breaks that survive.
    3. Inside each paragraph every run of whitespace (including single
       newlines) is collapsed to one space.
    4. Paragraphs are re-joined with ``"\\n\\n"``.

    Every paragraph except the last keeps one trailing space in front of the
    break (``"a\\n\\nb"`` becomes ``"a \\n\\nb"``).  This mirrors the output of
    the earlier placeholder-based implementation, so character offsets that
    callers may have computed remain the same.

    Unlike the placeholder approach, input that happens to contain the old
    placeholder string is passed through untouched.

    Args:
        text: Raw input string.

    Returns:
        The whitespace-normalised string.
    """
    text = text.replace("--- Para SEP ---", "\n")

    paragraphs = text.split("\n\n")
    final = len(paragraphs) - 1

    normalised = []
    for pos, paragraph in enumerate(paragraphs):
        if pos != final:
            # Keep the single space that historically preceded each break.
            paragraph += " "
        normalised.append(re.sub(r"\s+", " ", paragraph))

    return "\n\n".join(normalised)
|
13 |
+
|
14 |
+
|
15 |
+
def del_spans(span_sc, indexes: list):
    """Delete entries from ``span_sc`` in place for the given ``indexes``.

    Args:
        span_sc: A mutable sequence supporting ``len()`` and ``del seq[i]``.
        indexes: Positions requested for deletion.  The list is not modified;
            repeated values are honoured only once.

    Returns:
        None. ``span_sc`` is mutated.
    """
    # Deleting from the highest position downwards keeps the lower positions
    # valid.  De-duplicating first prevents a repeated index from removing a
    # second, unrelated entry.
    for idx in sorted(set(indexes), reverse=True):
        # NOTE(review): the element removed is the one at ``idx + 1``, not
        # ``idx``.  This presumably compensates for an off-by-one in the
        # container's ``__delitem__`` (some spaCy SpanGroup releases) --
        # confirm against the pinned spaCy version before changing it.
        # Because of the bounds check, a request for the last position is
        # silently ignored.
        if idx + 1 < len(span_sc):
            del span_sc[idx + 1]
|
24 |
+
|
25 |
+
|
26 |
+
def delete_overlapping_span(span_sc: dict):
    """Prune spans that share a start token or cross a sentence boundary.

    For every group of spans beginning at the same token, the first span
    encountered is kept provisionally.  A later span in the group replaces it
    only when it carries the same label *and* a strictly higher score; in
    every other case the later span is the one queued for deletion.
    Independently, any span covering more than one sentence is queued for
    deletion.  The queued positions are finally handed to ``del_spans``.

    Args:
        span_sc: Iterable container of spans exposing ``.attrs['scores']``
            (one score per span, in the same order).  The annotation says
            ``dict`` but the object is used like a spaCy span group.

    Returns:
        None. ``span_sc`` is mutated in place.

    NOTE(review): ``span_sc.attrs['scores']`` is not trimmed here, so after
    deletions it may no longer line up with the remaining spans -- confirm
    whether callers rely on that alignment.
    """
    start_counts = Counter(spn.start for spn in span_sc)
    shared_starts = {start for start, count in start_counts.items() if count > 1}

    doomed = set()   # positions queued for deletion (a set avoids repeats)
    keeper_at = {}   # start token -> position of the span currently kept
    seen = {}        # position -> (label, score) of spans already visited

    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        seen[n] = (spn.label_, score)

        if spn.start in shared_starts:
            if spn.start not in keeper_at:
                keeper_at[spn.start] = n
            else:
                rival = keeper_at[spn.start]
                rival_label, rival_score = seen[rival]
                if spn.label_ == rival_label and score > rival_score:
                    # Same label, better score: the newcomer wins.
                    doomed.add(rival)
                    keeper_at[spn.start] = n
                else:
                    doomed.add(n)

        # Drop spans that stretch across a sentence boundary.
        if len(list(spn.sents)) > 1:
            doomed.add(n)

    del_spans(span_sc, sorted(doomed))
|
78 |
+
|
79 |
+
|
80 |
+
def cleanup_justify(doc, span_sc: dict):
    """Re-draw JUSTIFYING spans so each covers its root token's full subtree.

    1. Collect the spans labelled ``JUSTIFYING``, keyed by their root token.
    2. A span whose root's head is itself the root of a collected span is
       treated as nested and is queued for deletion.
    3. For each remaining span, build a candidate from the root's left edge
       to its right edge (inclusive) and label it ``JUSTIFYING``.  If that
       candidate is not already in ``span_sc`` it is added and the original
       span is queued for deletion; otherwise the original is kept.
    4. Queued positions are removed via ``del_spans`` and the new spans are
       appended.

    Args:
        doc: The document the spans belong to; sliced to build new spans.
        span_sc: Container of spans for ``doc``.  The annotation says
            ``dict`` but the object is used like a spaCy span group
            (iterated, extended).

    Returns:
        None. ``span_sc`` is mutated in place.

    NOTE(review): if two JUSTIFYING spans share a root token, only the later
    one is tracked.  Also, if a token can be its own head (e.g. a sentence
    root), a span rooted there is treated as nested and removed -- confirm
    both behaviours are intended.
    """
    # Root token -> position of the JUSTIFYING span rooted there.
    justifying = {}
    for idx, span in enumerate(span_sc):
        if span.label_ == 'JUSTIFYING':
            justifying[span.root] = idx

    stale = []     # positions of spans to remove
    widened = []   # replacement spans to append afterwards

    for root, idx in justifying.items():
        if root.head in justifying:
            # Nested under another JUSTIFYING span: drop it.
            stale.append(idx)
            continue

        candidate = doc[root.left_edge.i:root.right_edge.i + 1]
        candidate.label_ = "JUSTIFYING"

        if candidate not in span_sc:
            widened.append(candidate)
            stale.append(idx)

    del_spans(span_sc, stale)

    span_sc.extend(SpanGroup(doc, spans=widened))
|
utils/visualize.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
#
|
5 |
+
# This code is adapted from spacy-streamlit package by explosion
|
6 |
+
# https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
|
7 |
+
#
|
8 |
+
|
9 |
+
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
|
10 |
+
import streamlit as st
|
11 |
+
import spacy
|
12 |
+
from spacy.language import Language
|
13 |
+
from spacy import displacy
|
14 |
+
import pandas as pd
|
15 |
+
|
16 |
+
import streamlit as st
|
17 |
+
from spacy_streamlit import visualize_spans
|
18 |
+
from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
|
19 |
+
|
20 |
+
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
|
21 |
+
from skbio import diversity as dv
|
22 |
+
|
23 |
+
SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
|
24 |
+
|
25 |
+
# fmt: off
|
26 |
+
# SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
|
27 |
+
SPAN_ATTRS = [
|
28 |
+
"text",
|
29 |
+
"label_",
|
30 |
+
"start",
|
31 |
+
"end",
|
32 |
+
]
|
33 |
+
|
34 |
+
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
|
35 |
+
|
36 |
+
def visualize_spans(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    *,
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
    show_table: bool = True,
    title: Optional[str] = "Spans",
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
    simple: bool = True,
):
    """
    Visualizer for spans.
    doc (Doc, Dict): The document to visualize.
    spans_key (str): Which spans key to render spans from. Default is "sc".
        The same key is used for the displaCy rendering and for the table.
    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
        argument is True.
    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
    title (str): The title displayed at the top of the Spans visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
        The dictionary passed in is copied, not modified.
        See https://spacy.io/api/top-level#displacy_options-span
    simple (bool): When True the table is built with simple_table and only the span table is shown.
        When False the table is built with const_table and the label summary, n-gram,
        grammatical-function and diversity sections are rendered as well.
    """
    if SPACY_VERSION < (3, 3, 0):
        raise ValueError(
            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
        )

    # Work on a copy so the caller's options dict is left untouched.
    displacy_options = dict(displacy_options) if displacy_options else {}
    displacy_options["spans_key"] = spans_key

    if title:
        st.header(title)

    if manual:
        if show_table:
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
            )
        if not isinstance(doc, dict):
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
            )

    html = displacy.render(
        doc,
        style="span",
        options=displacy_options,
        manual=manual,
    )
    st.write(f"{get_html(html)}", unsafe_allow_html=True)

    if not show_table:
        return

    # Build the table from the same spans key that was rendered above.
    build_table = simple_table if simple else const_table
    data, cols = build_table(doc, spans_key=spans_key, attrs=attrs)
    if not data:
        return

    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})
    df = df.sort_values(by=['start'])
    st.subheader("Span information")
    st.dataframe(
        df.style.highlight_between(subset='Conf. score', right=.7))

    if simple:
        return

    st.subheader("Label counts & Diagnostic confidence score summary")
    # Per-label frequencies, with every known category present (0 if absent).
    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)

    label_counts = df.groupby('label_').agg({
        "label_": 'count',
        "Conf. score": ['median', 'min', 'max']
    }).round(4).reindex(CATEGORIES, fill_value=0)
    st.dataframe(label_counts)

    # Engagement n-grams over the label sequence in document order.
    sequences = list(df['label_'])
    span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
    span_trigrams = ngrammar(seq=sequences, n=3, concat=True)

    st.dataframe(pd.DataFrame(span_bigrams))
    st.code(span_trigrams)

    st.subheader("Engagement label by grammatical function")
    label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
    st.dataframe(label_dep)

    st.subheader('Quantitative results')
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient='index')
    st.dataframe(div_data)

    # One-row summary: label counts and diversity values side by side.
    doc_data = pd.concat([counts, div_data], axis=0).T
    filename = "NA"
    doc_data.insert(0, "filename", filename, True)
    doc_data.insert(1, "nwords", len(doc), True)
    st.dataframe(doc_data)
|