git renew
Browse files- .DS_Store +0 -0
- .gitignore +4 -6
- analyzer.py +146 -0
- main.py +217 -0
- utils/.DS_Store +0 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.gitignore
CHANGED
@@ -1,6 +1,4 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
results/*
|
6 |
-
inputtexts/*
|
|
|
1 |
+
ECCE_analysis
|
2 |
+
ECCE_texts
|
3 |
+
results
|
4 |
+
inputtexts
|
|
|
|
analyzer.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import os
|
3 |
+
import spacy_streamlit
|
4 |
+
from collections import Counter
|
5 |
+
import glob
|
6 |
+
|
7 |
+
import spacy
|
8 |
+
from spacy.tokens import Doc
|
9 |
+
from spacy.cli._util import import_code
|
10 |
+
|
11 |
+
from utils.visualize import visualize_spans
|
12 |
+
from utils.util import preprocess, delete_overlapping_span, cleanup_justify
|
13 |
+
|
14 |
+
from resources.text_list import TEXT_LIST
|
15 |
+
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
|
16 |
+
from resources.colors import COLORS_1
|
17 |
+
|
18 |
+
|
19 |
+
from skbio import diversity as dv
|
20 |
+
|
21 |
+
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
|
22 |
+
import pandas as pd
|
23 |
+
|
24 |
+
# from pipeline.custom_functions import custom_functions
|
25 |
+
# Span attributes passed as ``attrs`` to const_table when building the
# span-level table for each document.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Label inventory used to reindex the per-document label counts, so every
# category is present (with 0) even when it never occurs in a text.
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
|
27 |
+
|
28 |
+
|
29 |
+
# spacy.prefer_gpu()
|
30 |
+
|
31 |
+
def load_model(spacy_model):
    """Load a spaCy pipeline and append a sentencizer component to it.

    Args:
        spacy_model: Name or path handed straight to ``spacy.load``.

    Returns:
        The loaded pipeline with a ``sentencizer`` pipe added at the end.
    """
    pipeline = spacy.load(spacy_model)
    pipeline.add_pipe("sentencizer")
    return pipeline
|
36 |
+
|
37 |
+
# source = spacy.blank("en")
|
38 |
+
|
39 |
+
# modelname = "en_engagement_LSTM_f5"
|
40 |
+
# modelname = "en_engagement_LSTM_f5"
|
41 |
+
# Name of the spaCy pipeline package analysed in this run; it is also used as
# the name of the per-model output directories.
modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# exist_ok=True so that re-running the script for the same model does not
# abort with FileExistsError.
# NOTE(review): this is an absolute, machine-specific path; consider replacing
# it with a path relative to the project root.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Import the project's custom pipeline code before the model is loaded.
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)
|
49 |
+
# doc = nlp(preprocess(TEXT_LIST[0]))
|
50 |
+
|
51 |
+
# cleanup_justify(doc, doc.spans["sc"])
|
52 |
+
# delete_overlapping_span(doc.spans['sc'])
|
53 |
+
|
54 |
+
# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
|
55 |
+
# seq = [s for s in doc.spans["sc"]]
|
56 |
+
# span_ngrams = ngrammar(seq=seq, n=3)
|
57 |
+
|
58 |
+
# df = pd.DataFrame(data, columns=cols)
|
59 |
+
|
60 |
+
# constant_value = 42
|
61 |
+
# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')
|
62 |
+
|
63 |
+
# doclen = len(doc)
|
64 |
+
# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
|
65 |
+
|
66 |
+
# df.insert(0, "new", new_col, True)
|
67 |
+
# df.insert(1, "nwords", doc_len, True)
|
68 |
+
|
69 |
+
# df.to_csv("results/test.csv")
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
+
# Batch analysis: run the engagement pipeline over every preprocessed text,
# write one span-level CSV and one document-level CSV per text, and finally
# export the concatenation of all per-text tables.
inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
savedir = "ECCE_analysis"
storeall = True
storage = []
doc_level_storage = []

# exist_ok=True: a second run for the same model must not fail just because
# the output directory was created by the previous run.
os.makedirs(os.path.join(savedir, modelname), exist_ok=True)

for file in inputfiles:
    filename = os.path.split(file)[-1]

    # Explicit encoding so the result does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans["sc"])

    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])
    # NOTE(review): the trigram result is computed but not used or saved yet.
    span_ngrams = ngrammar(seq=seq, n=3)

    # --- Span-level table ---------------------------------------------------
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert column types
    df = df.sort_values(by=["start"])  # order spans by start offset

    doclen = len(doc)

    # Scalars are broadcast to every row, so no index alignment is involved.
    df.insert(0, "filename", filename, True)
    df.insert(1, "nwords", doclen, True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    # --- Engagement bigrams -------------------------------------------------
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert modifies the frame in place and returns None, so its
    # result must not be assigned back (that would replace bidf with None).
    # NOTE(review): bidf is not written to disk anywhere yet.
    bidf.insert(0, "filename", filename, True)

    # --- Document-level table -----------------------------------------------
    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient="index")

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    # Use the scalar token count: inserting a span-length Series here would be
    # index-aligned and yield NaN for texts without any spans.
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)

# pd.concat raises ValueError on an empty list (no input files matched, or
# storeall disabled), so only export the combined tables when there is data.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230426.csv")

if doc_level_storage:
    alldoc = pd.concat(doc_level_storage)
    alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")
|
main.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import spacy_streamlit
|
3 |
+
from spacy_streamlit import visualize_parser
|
4 |
+
from collections import Counter
|
5 |
+
|
6 |
+
import spacy
|
7 |
+
import streamlit as st
|
8 |
+
|
9 |
+
# try:
|
10 |
+
# from .scripts.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
|
11 |
+
# except ImportError:
|
12 |
+
# from pipeline.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
|
13 |
+
from spacy.tokens import Doc
|
14 |
+
from spacy.cli._util import import_code
|
15 |
+
|
16 |
+
from utils.visualize import visualize_spans
|
17 |
+
from utils.util import preprocess, delete_overlapping_span, cleanup_justify
|
18 |
+
from resources.text_list import TEXT_LIST
|
19 |
+
from resources.text_list_BAWE import TEXT_LIST_BAWE
|
20 |
+
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
|
21 |
+
from resources.colors import COLORS_1
|
22 |
+
|
23 |
+
# Import the project's custom pipeline code before any model is loaded.
import_code("pipeline/custom_functions.py")
# Page title spelling fixed ("comparaer" -> "comparer").
st.set_page_config(page_title='Engagement model comparer', layout="wide")

# spacy.prefer_gpu()

# Models offered in both select boxes.
# NOTE(review): both entries are identical; the second select box defaults to
# index 1, so the list needs at least two items.
MODEL_LIST = ['en_engagement_LSTM', 'en_engagement_LSTM']
|
29 |
+
|
30 |
+
# MODEL_LIST = [
|
31 |
+
# 'en_engagement_three_RoBERTa_base_LSTM384-0.9.2/en_engagement_three_RoBERTa_base_LSTM384/en_engagement_three_RoBERTa_base_LSTM384-0.9.2',
|
32 |
+
# 'en_engagement_three_RoBERTa_acad3_db-0.9.2/en_engagement_three_RoBERTa_acad3_db/en_engagement_three_RoBERTa_acad3_db-0.9.2',
|
33 |
+
# 'silver-sweep-34/model-best',
|
34 |
+
# 'expert-sweep-4/model-best',
|
35 |
+
# 'confused-sweep-6/model-best',
|
36 |
+
# 'warm-sweep-20/model-best',
|
37 |
+
# "en_engagement_three_RoBERTa_base-1.10.0/en_engagement_three_RoBERTa_base/en_engagement_three_RoBERTa_base-1.10.0",
|
38 |
+
# "en_engagement_three_RoBERTa_acad_db-1.10.0/en_engagement_three_RoBERTa_acad_db/en_engagement_three_RoBERTa_acad_db-1.10.0",
|
39 |
+
# "en_engagement_para_RoBERTa_acad_db3-0.9.0/en_engagement_para_RoBERTa_acad_db3/en_engagement_para_RoBERTa_acad_db3-0.9.0",
|
40 |
+
# "en_engagement_para_RoBERTa_acad_LSTM2-0.9.0/en_engagement_para_RoBERTa_acad_LSTM2/en_engagement_para_RoBERTa_acad_LSTM2-0.9.0",
|
41 |
+
# "en_engagement_three_RoBERTa_acad_db3-0.9.1/en_engagement_three_RoBERTa_acad_db3/en_engagement_three_RoBERTa_acad_db3-0.9.1",
|
42 |
+
# "en_engagement_three_RoBERTa_acad_LSTM2-0.9.1/en_engagement_three_RoBERTa_acad_LSTM2/en_engagement_three_RoBERTa_acad_LSTM2-0.9.1",
|
43 |
+
# "en_engagement_three_RoBERTa_acad_db3-0.9.2/en_engagement_three_RoBERTa_acad_db3/en_engagement_three_RoBERTa_acad_db3-0.9.2",
|
44 |
+
# 'en_engagement_spl_RoBERTa_acad_db-0.7.4/en_engagement_spl_RoBERTa_acad_db/en_engagement_spl_RoBERTa_acad_db-0.7.4',
|
45 |
+
# 'en_engagement_spl_RoBERTa_acad_db3-0.9.0/en_engagement_spl_RoBERTa_acad_db3/en_engagement_spl_RoBERTa_acad_db3-0.9.0',
|
46 |
+
# 'en_engagement_spl_RoBERTa_acad_LSTM-0.7.2/en_engagement_spl_RoBERTa_acad_LSTM/en_engagement_spl_RoBERTa_acad_LSTM-0.7.2',
|
47 |
+
# 'en_engagement_spl_RoBERTa_acad_512',
|
48 |
+
# 'en_engagement_spl_RoBERTa_acad',
|
49 |
+
# 'en_engagement_spl_RoBERTa_exp-0.6.5/en_engagement_spl_RoBERTa_exp/en_engagement_spl_RoBERTa_exp-0.6.5',
|
50 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.3.4.1221/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.3.4.1221',
|
51 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.2.1228/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.2.1228',
|
52 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.1.1228/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.1.1228',
|
53 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.2.1220/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.2.1220',
|
54 |
+
# # 'en_engagement_spl_RoBERTa2-0.2.2.1210/en_engagement_spl_RoBERTa2/en_engagement_spl_RoBERTa2-0.2.2.1210',
|
55 |
+
# # 'en_engagement_spl_RoBERTa-0.2.2.1210/en_engagement_spl_RoBERTa/en_engagement_spl_RoBERTa-0.2.2.1210',
|
56 |
+
# # 'en_engagement_spl_RoBERTa_acad_max1_do02',
|
57 |
+
# # 'en_engagement_spl_RoBERTa2-0.2.2.1210/en_engagement_spl_RoBERTa2/en_engagement_spl_RoBERTa2-0.2.2.1210',
|
58 |
+
# # 'en_engagement_spl_RoBERTa_acad-0.2.3.1210/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.3.1210',
|
59 |
+
# # 'en_engagement_spl_RoBERTa_acad_max1_do02',
|
60 |
+
# # 'en_engagement_spl_RoBERTa_sqbatch_RAdam-20221202_0.1.5/en_engagement_spl_RoBERTa_sqbatch_RAdam/en_engagement_spl_RoBERTa_sqbatch_RAdam-20221202_0.1.5',
|
61 |
+
# # 'en_engagement_spl_RoBERTa_context_flz-20221130_0.1.4/en_engagement_spl_RoBERTa_context_flz/en_engagement_spl_RoBERTa_context_flz-20221130_0.1.4',
|
62 |
+
# # 'en_engagement_spl_RoBERTa_cx_max1_do2-20221202_0.1.5/en_engagement_spl_RoBERTa_cx_max1_do2/en_engagement_spl_RoBERTa_cx_max1_do2-20221202_0.1.5',
|
63 |
+
# # 'en_engagement_spl_RoBERTa_context_flz-20221125_0.1.4/en_engagement_spl_RoBERTa_context_flz/en_engagement_spl_RoBERTa_context_flz-20221125_0.1.4',
|
64 |
+
# # 'en_engagement_RoBERTa_context_flz-20221125_0.1.4/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221125_0.1.4',
|
65 |
+
# # 'en_engagement_RoBERTa_context_flz-20221117_0.1.3/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221117_0.1.3',
|
66 |
+
# # 'en_engagement_spl_RoBERTa_acad_context_flz-20221117_0.1.3/en_engagement_spl_RoBERTa_acad_context_flz/en_engagement_spl_RoBERTa_acad_context_flz-20221117_0.1.3',
|
67 |
+
# # 'en_engagement_RoBERTa_context_flz-Batch2_0.1.1/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-Batch2_0.1.1',
|
68 |
+
# # 'en_engagement_RoBERTa_context_flz-20221113_0.1.3/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221113_0.1.3',
|
69 |
+
# # 'en_engagement_RoBERTa_context_flz-20221113_0.1.1/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221113_0.1.1',
|
70 |
+
# # 'en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2',
|
71 |
+
# # 'en_engagement_RoBERTa_combined-Batch2Eng_0.2/en_engagement_RoBERTa_combined/en_engagement_RoBERTa_combined-Batch2Eng_0.2',
|
72 |
+
# # 'en_engagement_RoBERTa_acad-0.2.1/en_engagement_RoBERTa_acad/en_engagement_RoBERTa_acad-0.2.1',
|
73 |
+
# # # 'en_engagement_BERT-0.0.2/en_engagement_BERT/en_engagement_BERT-0.0.2',
|
74 |
+
# # # 'en_engagement_BERT_acad-0.0.2/en_engagement_BERT_acad/en_engagement_BERT_acad-0.0.2',
|
75 |
+
# # # 'en_engagement_RoBERTa_acad-0.0.2/en_engagement_RoBERTa_acad/en_engagement_RoBERTa_acad-0.0.2',
|
76 |
+
# # 'en_engagement_RoBERTa-0.0.1/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.1',
|
77 |
+
# # # ' en_engagement_RoBERTa_sent-0.0.1_null/en_engagement_RoBERTa_sent/en_engagement_RoBERTa_sent-0.0.1_null',
|
78 |
+
# # # 'en_engagement_RoBERTa_combined-0.0.1/en_engagement_RoBERTa_combined/en_engagement_RoBERTa_combined-0.0.1',
|
79 |
+
# # 'en_engagement_RoBERTa-ME_AtoE/en_engagement_RoBERTa/en_engagement_RoBERTa-ME_AtoE',
|
80 |
+
# # 'en_engagement_RoBERTa-AtoI_0.0.3/en_engagement_RoBERTa/en_engagement_RoBERTa-AtoI_0.0.3',
|
81 |
+
# # 'en_engagement_RoBERTa-AtoI_0.0.3/en_engagement_RoBERTa/en_engagement_RoBERTa-AtoI_0.0.2'
|
82 |
+
# ]
|
83 |
+
|
84 |
+
# When ticked, two models are loaded and their outputs are shown in two columns.
multicol = st.checkbox("Compare two models", value=False, key=None, help=None)

model1 = st.selectbox('Select model option 1', MODEL_LIST, index=0)
model2 = st.selectbox('Select model option 2', MODEL_LIST, index=1)

# Entries containing a slash are prefixed with the "packages/" directory;
# plain names are passed on unchanged.
model1 = "packages/" + model1 if '/' in model1 else model1
model2 = "packages/" + model2 if '/' in model2 else model2
|
94 |
+
|
95 |
+
|
96 |
+
@st.cache(allow_output_mutation=True)
def load_model(spacy_model):
    """Load a spaCy pipeline and append a sentencizer component to it.

    The result is cached by Streamlit via the ``st.cache`` decorator.

    Args:
        spacy_model: Name or path handed straight to ``spacy.load``.

    Returns:
        The loaded pipeline with a ``sentencizer`` pipe added at the end.
    """
    pipeline = spacy.load(spacy_model)
    pipeline.add_pipe("sentencizer")
    return pipeline
|
102 |
+
|
103 |
+
# source = spacy.blank("en")
|
104 |
+
# Load the first pipeline always, the second one only in comparison mode.
nlp = load_model(model1)
if multicol:
    nlp2 = load_model(model2)

# Two input sources: a preset sentence from the list and a free-text box.
text = st.selectbox('select sent to debug', TEXT_LIST_BAWE)
input_text = st.text_area("", height=200)

st.header("Text", "text")

# The free text is used only when it splits into more than one piece on a
# single space; otherwise the preset selection is analysed.
use_typed_text = len(input_text.split(" ")) > 1
source_text = input_text if use_typed_text else text

doc = nlp(preprocess(source_text))
if multicol:
    doc2 = nlp2(preprocess(source_text))
|
126 |
+
|
127 |
+
# Post-processing switches; both are enabled by default.
clearjustify = st.checkbox("Clear problematic JUSTIFYING spans",
                           value=True,
                           key=None,
                           help=None)
delete_overlaps = st.checkbox("Delete overlaps",
                              value=True,
                              key=None,
                              help=None)
|
132 |
+
|
133 |
+
# combine = st.checkbox(
|
134 |
+
# "Combine", value=False, key=None, help=None)
|
135 |
+
|
136 |
+
# import copy
|
137 |
+
# def combine_spangroups(doc1, doc2):
|
138 |
+
# # new_doc = Doc.from_docs([doc1, doc2], ensure_whitespace=True)
|
139 |
+
# new_doc = copy.deepcopy(doc1)
|
140 |
+
# # type()
|
141 |
+
# new_doc.spans['sc'].extend(doc2.spans['sc'])
|
142 |
+
|
143 |
+
# return new_doc
|
144 |
+
|
145 |
+
|
146 |
+
# if combine:
|
147 |
+
# new_doc = combine_spangroups(doc, doc2)
|
148 |
+
# visualize_spans(new_doc,
|
149 |
+
# spans_key="sc",
|
150 |
+
# title='Combined spans:',
|
151 |
+
# displacy_options={
|
152 |
+
# 'template': {
|
153 |
+
# "span": TPL_SPAN,
|
154 |
+
# 'slice': TPL_SPAN_SLICE,
|
155 |
+
# 'start': TPL_SPAN_START,
|
156 |
+
# },
|
157 |
+
# "colors": COLORS_1,
|
158 |
+
# },
|
159 |
+
# simple=False)
|
160 |
+
|
161 |
+
# Apply the selected post-processing steps to the "sc" span group. In
# comparison mode the second document gets the same treatment as the first,
# so both columns show spans that were cleaned in the same way.
if clearjustify:
    cleanup_justify(doc, doc.spans['sc'])
    if multicol:
        cleanup_justify(doc2, doc2.spans['sc'])

if delete_overlaps:
    delete_overlapping_span(doc.spans['sc'])
    if multicol:
        delete_overlapping_span(doc2.spans['sc'])
|
168 |
+
|
169 |
+
def _span_display_options():
    """Build a fresh displaCy options dict for one span visualisation."""
    return {
        'template': {
            "span": TPL_SPAN,
            'slice': TPL_SPAN_SLICE,
            'start': TPL_SPAN_START,
        },
        "colors": COLORS_1,
    }


def _show_spans(document, title):
    """Render the "sc" span group of ``document`` under the given title."""
    visualize_spans(document,
                    spans_key="sc",
                    title=title,
                    displacy_options=_span_display_options(),
                    simple=False)


# Single-model view uses the full width; comparison mode puts the two
# documents next to each other. Title spelling fixed ("Anotations").
if not multicol:
    _show_spans(doc, 'Engagement Span Annotations 1')
else:
    col1, col2 = st.columns(2)

    with col1:
        _show_spans(doc, 'Engagement Span Annotations 1')

    with col2:
        _show_spans(doc2, 'Engagement Span Annotations 2')


# The dependency parse is shown for the first document only.
dep_options = {"fine_grained": True, "distance": 120}
visualize_parser(doc, displacy_options=dep_options)
|
utils/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|