egumasa committed on
Commit
7c257bf
·
1 Parent(s): 9e3e64a
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. .gitignore +4 -6
  3. analyzer.py +146 -0
  4. main.py +217 -0
  5. utils/.DS_Store +0 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitignore CHANGED
@@ -1,6 +1,4 @@
1
- test_run.py
2
- .DS_Store
3
- analyzer.py
4
- main.py
5
- results/*
6
- inputtexts/*
 
1
+ ECCE_analysis
2
+ ECCE_texts
3
+ results
4
+ inputtexts
 
 
analyzer.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import spacy_streamlit
4
+ from collections import Counter
5
+ import glob
6
+
7
+ import spacy
8
+ from spacy.tokens import Doc
9
+ from spacy.cli._util import import_code
10
+
11
+ from utils.visualize import visualize_spans
12
+ from utils.util import preprocess, delete_overlapping_span, cleanup_justify
13
+
14
+ from resources.text_list import TEXT_LIST
15
+ from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
16
+ from resources.colors import COLORS_1
17
+
18
+
19
+ from skbio import diversity as dv
20
+
21
+ from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
22
+ import pandas as pd
23
+
24
+ # from pipeline.custom_functions import custom_functions
25
# Span attributes requested from const_table() when building the span table.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Label inventory used to reindex the per-document label counts, so every
# document reports the same categories in the same order (missing ones as 0).
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
27
+
28
+
29
+ # spacy.prefer_gpu()
30
+
31
def load_model(spacy_model):
    """Load a spaCy pipeline and make sure it contains a sentencizer.

    Args:
        spacy_model: Package name or path accepted by ``spacy.load``.

    Returns:
        The loaded ``Language`` object with a ``sentencizer`` component.
    """
    nlp = spacy.load(spacy_model)
    # add_pipe() raises if a component with the same name is already in the
    # pipeline, so only add the sentencizer when the model lacks one.
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    return nlp
36
+
37
+ # source = spacy.blank("en")
38
+
39
+ # modelname = "en_engagement_LSTM_f5"
40
+ # modelname = "en_engagement_LSTM_f5"
41
# Name of the installed spaCy pipeline package used for this analysis run.
modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# exist_ok=True: without it the script raises FileExistsError on every run
# after the first, because the directory is already there.
# NOTE(review): this absolute path is specific to one machine; consider
# deriving it from the script location instead.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Import the project's custom pipeline code before loading the model
# (presumably so its registered functions can be resolved by spacy.load).
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)
49
+ # doc = nlp(preprocess(TEXT_LIST[0]))
50
+
51
+ # cleanup_justify(doc, doc.spans["sc"])
52
+ # delete_overlapping_span(doc.spans['sc'])
53
+
54
+ # data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
55
+ # seq = [s for s in doc.spans["sc"]]
56
+ # span_ngrams = ngrammar(seq=seq, n=3)
57
+
58
+ # df = pd.DataFrame(data, columns=cols)
59
+
60
+ # constant_value = 42
61
+ # new_col = pd.Series([constant_value] * df.shape[0], name='new_col')
62
+
63
+ # doclen = len(doc)
64
+ # doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
65
+
66
+ # df.insert(0, "new", new_col, True)
67
+ # df.insert(1, "nwords", doc_len, True)
68
+
69
+ # df.to_csv("results/test.csv")
70
+
71
+
72
+
73
# Batch analysis: run the model over every preprocessed text and write
#   * one span-level CSV per file,
#   * one document-level CSV per file (label counts + diversity values),
#   * two corpus-level CSVs that concatenate the per-file tables.
inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
savedir = "ECCE_analysis"
storeall = True
storage = []            # span-level tables, one per input file
doc_level_storage = []  # document-level tables, one per input file

# exist_ok=True so that re-running the script does not crash on the
# directory created by a previous run.
os.makedirs(os.path.join(savedir, modelname), exist_ok=True)

for file in inputfiles:
    filename = os.path.split(file)[-1]

    # Explicit encoding so the result does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)
    # Post-process the predicted spans stored under the "sc" key.
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans["sc"])

    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])

    # --- Span-level table -------------------------------------------------
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert column types
    df = df.sort_values(by=["start"])           # order spans by position

    doclen = len(doc)  # number of tokens in the document

    # Scalars are broadcast to every row, so no helper Series is needed.
    df.insert(0, "filename", filename, True)
    df.insert(1, "nwords", doclen, True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    # --- Engagement bigrams -----------------------------------------------
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert works in place and returns None; assigning its result
    # back (as the original did) replaced the table with None.
    bidf.insert(0, "filename", filename, True)
    # NOTE(review): bidf is built but never written or stored anywhere.

    # --- Document-level table ---------------------------------------------
    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient="index")

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    # Insert the scalar token count. The original inserted a Series indexed
    # for the span table; insert() aligns a Series on the index, which does
    # not match doc_data's index and can leave NaN in this column.
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)

# pd.concat raises ValueError on an empty list (no input files matched, or
# storeall disabled), so only write the corpus-level files when there is data.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230426.csv")

if doc_level_storage:
    alldoc = pd.concat(doc_level_storage)
    alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")
main.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import spacy_streamlit
3
+ from spacy_streamlit import visualize_parser
4
+ from collections import Counter
5
+
6
+ import spacy
7
+ import streamlit as st
8
+
9
+ # try:
10
+ # from .scripts.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
11
+ # except ImportError:
12
+ # from pipeline.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
13
+ from spacy.tokens import Doc
14
+ from spacy.cli._util import import_code
15
+
16
+ from utils.visualize import visualize_spans
17
+ from utils.util import preprocess, delete_overlapping_span, cleanup_justify
18
+ from resources.text_list import TEXT_LIST
19
+ from resources.text_list_BAWE import TEXT_LIST_BAWE
20
+ from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
21
+ from resources.colors import COLORS_1
22
+
23
# Import the project's custom pipeline code before any model is loaded
# (presumably so its registered functions can be resolved by spacy.load).
import_code("pipeline/custom_functions.py")

# Page title typo fixed ("comparaer" -> "comparer").
st.set_page_config(page_title='Engagement model comparer', layout="wide")

# spacy.prefer_gpu()

# Models offered in the two select boxes. The second select box defaults to
# index 1, so the list needs at least two entries even when they are equal.
MODEL_LIST = ['en_engagement_LSTM', 'en_engagement_LSTM']
29
+
30
+ # MODEL_LIST = [
31
+ # 'en_engagement_three_RoBERTa_base_LSTM384-0.9.2/en_engagement_three_RoBERTa_base_LSTM384/en_engagement_three_RoBERTa_base_LSTM384-0.9.2',
32
+ # 'en_engagement_three_RoBERTa_acad3_db-0.9.2/en_engagement_three_RoBERTa_acad3_db/en_engagement_three_RoBERTa_acad3_db-0.9.2',
33
+ # 'silver-sweep-34/model-best',
34
+ # 'expert-sweep-4/model-best',
35
+ # 'confused-sweep-6/model-best',
36
+ # 'warm-sweep-20/model-best',
37
+ # "en_engagement_three_RoBERTa_base-1.10.0/en_engagement_three_RoBERTa_base/en_engagement_three_RoBERTa_base-1.10.0",
38
+ # "en_engagement_three_RoBERTa_acad_db-1.10.0/en_engagement_three_RoBERTa_acad_db/en_engagement_three_RoBERTa_acad_db-1.10.0",
39
+ # "en_engagement_para_RoBERTa_acad_db3-0.9.0/en_engagement_para_RoBERTa_acad_db3/en_engagement_para_RoBERTa_acad_db3-0.9.0",
40
+ # "en_engagement_para_RoBERTa_acad_LSTM2-0.9.0/en_engagement_para_RoBERTa_acad_LSTM2/en_engagement_para_RoBERTa_acad_LSTM2-0.9.0",
41
+ # "en_engagement_three_RoBERTa_acad_db3-0.9.1/en_engagement_three_RoBERTa_acad_db3/en_engagement_three_RoBERTa_acad_db3-0.9.1",
42
+ # "en_engagement_three_RoBERTa_acad_LSTM2-0.9.1/en_engagement_three_RoBERTa_acad_LSTM2/en_engagement_three_RoBERTa_acad_LSTM2-0.9.1",
43
+ # "en_engagement_three_RoBERTa_acad_db3-0.9.2/en_engagement_three_RoBERTa_acad_db3/en_engagement_three_RoBERTa_acad_db3-0.9.2",
44
+ # 'en_engagement_spl_RoBERTa_acad_db-0.7.4/en_engagement_spl_RoBERTa_acad_db/en_engagement_spl_RoBERTa_acad_db-0.7.4',
45
+ # 'en_engagement_spl_RoBERTa_acad_db3-0.9.0/en_engagement_spl_RoBERTa_acad_db3/en_engagement_spl_RoBERTa_acad_db3-0.9.0',
46
+ # 'en_engagement_spl_RoBERTa_acad_LSTM-0.7.2/en_engagement_spl_RoBERTa_acad_LSTM/en_engagement_spl_RoBERTa_acad_LSTM-0.7.2',
47
+ # 'en_engagement_spl_RoBERTa_acad_512',
48
+ # 'en_engagement_spl_RoBERTa_acad',
49
+ # 'en_engagement_spl_RoBERTa_exp-0.6.5/en_engagement_spl_RoBERTa_exp/en_engagement_spl_RoBERTa_exp-0.6.5',
50
+ # # 'en_engagement_spl_RoBERTa_acad-0.3.4.1221/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.3.4.1221',
51
+ # # 'en_engagement_spl_RoBERTa_acad-0.2.2.1228/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.2.1228',
52
+ # # 'en_engagement_spl_RoBERTa_acad-0.2.1.1228/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.1.1228',
53
+ # # 'en_engagement_spl_RoBERTa_acad-0.2.2.1220/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.2.1220',
54
+ # # 'en_engagement_spl_RoBERTa2-0.2.2.1210/en_engagement_spl_RoBERTa2/en_engagement_spl_RoBERTa2-0.2.2.1210',
55
+ # # 'en_engagement_spl_RoBERTa-0.2.2.1210/en_engagement_spl_RoBERTa/en_engagement_spl_RoBERTa-0.2.2.1210',
56
+ # # 'en_engagement_spl_RoBERTa_acad_max1_do02',
57
+ # # 'en_engagement_spl_RoBERTa2-0.2.2.1210/en_engagement_spl_RoBERTa2/en_engagement_spl_RoBERTa2-0.2.2.1210',
58
+ # # 'en_engagement_spl_RoBERTa_acad-0.2.3.1210/en_engagement_spl_RoBERTa_acad/en_engagement_spl_RoBERTa_acad-0.2.3.1210',
59
+ # # 'en_engagement_spl_RoBERTa_acad_max1_do02',
60
+ # # 'en_engagement_spl_RoBERTa_sqbatch_RAdam-20221202_0.1.5/en_engagement_spl_RoBERTa_sqbatch_RAdam/en_engagement_spl_RoBERTa_sqbatch_RAdam-20221202_0.1.5',
61
+ # # 'en_engagement_spl_RoBERTa_context_flz-20221130_0.1.4/en_engagement_spl_RoBERTa_context_flz/en_engagement_spl_RoBERTa_context_flz-20221130_0.1.4',
62
+ # # 'en_engagement_spl_RoBERTa_cx_max1_do2-20221202_0.1.5/en_engagement_spl_RoBERTa_cx_max1_do2/en_engagement_spl_RoBERTa_cx_max1_do2-20221202_0.1.5',
63
+ # # 'en_engagement_spl_RoBERTa_context_flz-20221125_0.1.4/en_engagement_spl_RoBERTa_context_flz/en_engagement_spl_RoBERTa_context_flz-20221125_0.1.4',
64
+ # # 'en_engagement_RoBERTa_context_flz-20221125_0.1.4/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221125_0.1.4',
65
+ # # 'en_engagement_RoBERTa_context_flz-20221117_0.1.3/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221117_0.1.3',
66
+ # # 'en_engagement_spl_RoBERTa_acad_context_flz-20221117_0.1.3/en_engagement_spl_RoBERTa_acad_context_flz/en_engagement_spl_RoBERTa_acad_context_flz-20221117_0.1.3',
67
+ # # 'en_engagement_RoBERTa_context_flz-Batch2_0.1.1/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-Batch2_0.1.1',
68
+ # # 'en_engagement_RoBERTa_context_flz-20221113_0.1.3/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221113_0.1.3',
69
+ # # 'en_engagement_RoBERTa_context_flz-20221113_0.1.1/en_engagement_RoBERTa_context_flz/en_engagement_RoBERTa_context_flz-20221113_0.1.1',
70
+ # # 'en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2',
71
+ # # 'en_engagement_RoBERTa_combined-Batch2Eng_0.2/en_engagement_RoBERTa_combined/en_engagement_RoBERTa_combined-Batch2Eng_0.2',
72
+ # # 'en_engagement_RoBERTa_acad-0.2.1/en_engagement_RoBERTa_acad/en_engagement_RoBERTa_acad-0.2.1',
73
+ # # # 'en_engagement_BERT-0.0.2/en_engagement_BERT/en_engagement_BERT-0.0.2',
74
+ # # # 'en_engagement_BERT_acad-0.0.2/en_engagement_BERT_acad/en_engagement_BERT_acad-0.0.2',
75
+ # # # 'en_engagement_RoBERTa_acad-0.0.2/en_engagement_RoBERTa_acad/en_engagement_RoBERTa_acad-0.0.2',
76
+ # # 'en_engagement_RoBERTa-0.0.1/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.1',
77
+ # # # ' en_engagement_RoBERTa_sent-0.0.1_null/en_engagement_RoBERTa_sent/en_engagement_RoBERTa_sent-0.0.1_null',
78
+ # # # 'en_engagement_RoBERTa_combined-0.0.1/en_engagement_RoBERTa_combined/en_engagement_RoBERTa_combined-0.0.1',
79
+ # # 'en_engagement_RoBERTa-ME_AtoE/en_engagement_RoBERTa/en_engagement_RoBERTa-ME_AtoE',
80
+ # # 'en_engagement_RoBERTa-AtoI_0.0.3/en_engagement_RoBERTa/en_engagement_RoBERTa-AtoI_0.0.3',
81
+ # # 'en_engagement_RoBERTa-AtoI_0.0.3/en_engagement_RoBERTa/en_engagement_RoBERTa-AtoI_0.0.2'
82
+ # ]
83
+
84
# Sidebar-free controls: toggle for side-by-side comparison and the two
# model pickers (the second is only used when the toggle is on).
multicol = st.checkbox("Compare two models", value=False, key=None, help=None)

model1 = st.selectbox('Select model option 1', MODEL_LIST, index=0)
model2 = st.selectbox('Select model option 2', MODEL_LIST, index=1)

# Entries containing a slash are treated as paths below the local
# "packages/" directory rather than installed package names.
model1 = "packages/" + model1 if '/' in model1 else model1
model2 = "packages/" + model2 if '/' in model2 else model2
94
+
95
+
96
@st.cache(allow_output_mutation=True)
def load_model(spacy_model):
    """Load a spaCy pipeline (cached by Streamlit) with a sentencizer.

    Args:
        spacy_model: Package name or path accepted by ``spacy.load``.

    Returns:
        The loaded ``Language`` object with a ``sentencizer`` component.
    """
    nlp = spacy.load(spacy_model)
    # add_pipe() raises if a component with the same name is already in the
    # pipeline, so only add the sentencizer when the model lacks one.
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    return nlp
102
+
103
+ # source = spacy.blank("en")
104
# Load the first model always, the second only in comparison mode.
nlp = load_model(model1)
if multicol:
    nlp2 = load_model(model2)

# Text sources: a canned example from the list, or free text typed by the user.
text = st.selectbox('select sent to debug', TEXT_LIST_BAWE)
input_text = st.text_area("", height=200)

st.header("Text", "text")

# Typed text wins over the canned example as soon as it contains more than
# one space-separated chunk; otherwise the selected example is analysed.
has_typed_input = len(input_text.split(" ")) > 1
chosen_text = input_text if has_typed_input else text

doc = nlp(preprocess(chosen_text))
if multicol:
    doc2 = nlp2(preprocess(chosen_text))

# Post-processing switches, both enabled by default.
clearjustify = st.checkbox(
    "Clear problematic JUSTIFYING spans", value=True, key=None, help=None)

delete_overlaps = st.checkbox(
    "Delete overlaps", value=True, key=None, help=None)
132
+
133
+ # combine = st.checkbox(
134
+ # "Combine", value=False, key=None, help=None)
135
+
136
+ # import copy
137
+ # def combine_spangroups(doc1, doc2):
138
+ # # new_doc = Doc.from_docs([doc1, doc2], ensure_whitespace=True)
139
+ # new_doc = copy.deepcopy(doc1)
140
+ # # type()
141
+ # new_doc.spans['sc'].extend(doc2.spans['sc'])
142
+
143
+ # return new_doc
144
+
145
+
146
+ # if combine:
147
+ # new_doc = combine_spangroups(doc, doc2)
148
+ # visualize_spans(new_doc,
149
+ # spans_key="sc",
150
+ # title='Combined spans:',
151
+ # displacy_options={
152
+ # 'template': {
153
+ # "span": TPL_SPAN,
154
+ # 'slice': TPL_SPAN_SLICE,
155
+ # 'start': TPL_SPAN_START,
156
+ # },
157
+ # "colors": COLORS_1,
158
+ # },
159
+ # simple=False)
160
+
161
def _show_spans(target_doc, title):
    """Render the "sc" span group of *target_doc* with the shared templates."""
    visualize_spans(target_doc,
                    spans_key="sc",
                    title=title,
                    displacy_options={
                        'template': {
                            "span": TPL_SPAN,
                            'slice': TPL_SPAN_SLICE,
                            'start': TPL_SPAN_START,
                        },
                        "colors": COLORS_1,
                    },
                    simple=False)


# Optional post-processing. In comparison mode both documents get the same
# treatment; the original cleaned JUSTIFYING spans only for the first model,
# which made the side-by-side view inconsistent.
if clearjustify:
    cleanup_justify(doc, doc.spans['sc'])
    if multicol:
        cleanup_justify(doc2, doc2.spans['sc'])

if delete_overlaps:
    delete_overlapping_span(doc.spans['sc'])
    if multicol:
        delete_overlapping_span(doc2.spans['sc'])

# Span visualisation: single view, or two columns when comparing models.
# Title typo fixed ("Anotations" -> "Annotations").
if not multicol:
    _show_spans(doc, 'Engagement Span Annotations 1')
else:
    col1, col2 = st.columns(2)

    with col1:
        _show_spans(doc, 'Engagement Span Annotations 1')

    with col2:
        _show_spans(doc2, 'Engagement Span Annotations 2')

# Dependency parse of the first model's document.
dep_options = {"fine_grained": True, "distance": 120}
visualize_parser(doc, displacy_options=dep_options)
utils/.DS_Store ADDED
Binary file (6.15 kB). View file