egumasa committed on
Commit 866b9fc · 1 Parent(s): fe18b15

detailed summary

demo.py CHANGED
@@ -32,7 +32,7 @@ st.set_page_config(
  )


- @st.cache(allow_output_mutation=True)
+ @st.cache_resource()
  def load_model():
      # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
      nlp = spacy.load("en_engagement_LSTM")
@@ -123,7 +123,7 @@ TEXT_LIST = [
  ]


- @st.cache(suppress_st_warning=True)
+ @st.cache_resource()
  def preprocess(text):
      text = re.sub("\n\n", " &&&&&&&&#&#&#&#&", text)
      text = re.sub("\n", " ", text)
@@ -132,7 +132,7 @@ def preprocess(text):
      return text


- @st.cache(allow_output_mutation=True)
+ @st.cache_resource()
  def delete_span(span_sc: dict):
      id_del = []
      for n, spn in enumerate(span_sc, start=1):
@@ -297,6 +297,8 @@ visualize_spans(
      },
      },
      simple=False,
+     show_diversity=True,
+     show_confidence=False,
  )

  st.subheader("Bibliography")
@@ -307,3 +309,8 @@ st.markdown("""
  * Wu, S. M. (2007). The use of engagement resources in high- and low-rated undergraduate geography essays. _Journal of English for Academic Purposes, 6_ (3), 254–271. https://doi.org/10.1016/j.jeap.2007.09.006

  """)
+
+ st.subheader("Please cite the following papers:")
+ st.markdown("""* Eguchi, M., & Kyle, K. (2023). Span Identification of Epistemic Stance-Taking in Academic Written English. Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), 429–442. https://aclanthology.org/2023.bea-1.35
+ * Eguchi, M., & Kyle, K. (2024). Building custom NLP tools to annotate discourse-functional features for second language writing research: A tutorial. *Research Methods in Applied Linguistics, 3*(3), 100153. https://doi.org/10.1016/j.rmal.2024.100153
+ """)
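Note on the demo.py hunks above: they swap the deprecated @st.cache decorator for @st.cache_resource, the caching API that Streamlit 1.18+ recommends for unserializable objects such as spaCy pipelines. A minimal sketch of the pattern (not part of the commit; assumes a recent Streamlit and that the en_engagement_LSTM model package is installed):

import spacy
import streamlit as st

@st.cache_resource()  # previously: @st.cache(allow_output_mutation=True)
def load_model():
    # The loaded spaCy pipeline is created once and shared across reruns
    # and sessions, instead of being hashed and copied on every execution.
    return spacy.load("en_engagement_LSTM")

nlp = load_model()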
pipeline/__pycache__/post_processors.cpython-310.pyc CHANGED
Binary files a/pipeline/__pycache__/post_processors.cpython-310.pyc and b/pipeline/__pycache__/post_processors.cpython-310.pyc differ
 
pipeline/post_processors.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
3
  import pandas as pd
4
  import spacy
@@ -6,23 +5,38 @@ from spacy.language import Language
6
  from skbio import diversity as dv
7
 
8
  SPAN_ATTRS = ["text", "label_", "start", "end"]
9
- CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
10
-
11
-
12
- def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
13
- spans_key: str = "sc",
14
- attrs: List[str] = SPAN_ATTRS):
15
  columns = attrs + ["Conf. score"]
16
  data = [
17
- [str(getattr(span, attr))
18
- for attr in attrs] + [score] # [f'{score:.5f}']
19
- for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
 
20
  ]
21
  return data, columns
22
 
23
 
24
  # def span_info_aggregator()
25
 
 
26
  def construction_classifier(doc, span):
27
  category = None
28
  spanroot = span.root
@@ -33,7 +47,6 @@ def construction_classifier(doc, span):
33
  span_token = [t.norm_ for t in span]
34
  span_tag = [t.tag_ for t in span]
35
 
36
-
37
  c = [c for c in spanroot.children]
38
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
39
 
@@ -44,30 +57,65 @@ def construction_classifier(doc, span):
44
 
45
  right_dep = [c.dep_ for c in spanroot.rights]
46
 
47
- #conditionals
48
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
49
- argmentless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in spanroot.children)
50
- argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
51
 
52
  ## nesting classifiers
53
  if spanroot.dep_ == "conj":
54
- while spanroot.dep_ == 'conj':
55
  spanroot = spanroot.head
56
  # if spanroot.dep_ == "poss":
57
  # while spanroot.dep_ == 'poss':
58
  # spanroot = spanroot.head
59
 
60
- ## Conjunctions
61
  # Preconjunctions
62
- if spanroot.dep_ in ['preconj', 'cc']:
63
  category = "Conjunction"
64
 
65
  ## NOUN PHRASES
66
  # adverbial phrases
67
- if spanroot.dep_ in ['amod']:
68
  category = "Adjectival modifier"
69
  # adverbial phrases
70
- if spanroot.dep_ in ['compound']:
71
  category = "Compound noun"
72
 
73
  ## Nominal category
@@ -85,21 +133,24 @@ def construction_classifier(doc, span):
85
 
86
  ## ADJUNCTS
87
  # prep phrases
88
- if spanroot.dep_ in ['prep', 'agent']:
89
- category = 'Prepositional phrase'
90
  # adverbial phrases
91
- if spanroot.dep_ in ['advmod', "npadvmod", "nmod", "npmod", 'quantmod']:
92
  category = "Adverbial phrase"
93
 
94
  ## Predication patterns
95
- if spanroot.dep_ in ['acomp', 'oprd']:
96
  if "xcomp" in c_dep:
97
  category = "Subject predicate to-cl"
98
  else:
99
  category = "Adjectival complement"
100
 
101
- if spanroot.dep_ in ['attr']:
102
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
103
 
104
  c_head = [c.dep_ for c in spanroot.head.children]
105
  if "expl" in c_head and "no_det" in span_t_dep_:
@@ -108,86 +159,115 @@ def construction_classifier(doc, span):
108
  category = "There is/are + Noun complement"
109
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
110
  category = "There is/are + Noun complement"
111
-
112
  elif spanroot.pos_ in ["NOUN", "PRON"]:
113
  if "acl" in c_dep:
114
  category = "Noun + Complement (attr)"
115
  else:
116
  category = "Nominal complement"
117
 
118
- elif not subjless and spanroot.pos_ in ['VERB', "AUX"]:
119
  category = "Main verb 4"
120
 
121
- elif spanroot.tag_ in ['NNP']:
122
  category = "Nominal complement"
123
 
124
-
125
  ####################################
126
  ### clausal ####
127
  ####################################
128
- if spanroot.dep_ in ["ROOT", "advcl", "ccomp", 'acl', 'pcomp', 'relcl']:
129
-
130
- _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
131
- _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
132
- root_before_ccomp = [c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"]
133
-
134
- _check_for_to = ["_".join([c.norm_, c.dep_]) for c in spanroot.subtree if c.head.dep_ == "advcl" and (c.dep_=="mark" or c.dep_ == "aux")]
135
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
136
 
137
  ## Start with broad category, which is then re-evaluated for specific constructions.
138
- if spanroot.dep_ in ['advcl', 'mark', 'acl', 'pcomp']:
139
  ## Adverbial clauses
140
  ### Finite-adverbial clauses
141
  ### Non-finite adverbial clauses
142
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
143
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
144
-
145
- if "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
 
 
 
 
 
146
  category = "Finite adverbial clause"
147
- elif "mark" in span_dep and "aux" in span_dep :
148
  category = "Finite adverbial clause"
149
 
150
- elif "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"] and "expl" in c_dep:
 
 
 
 
151
  category = "Finite adverbial clause"
152
 
153
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
154
- if spanroot.pos_ in ['VERB', "AUX"]:
155
  category = "Finite adverbial clause"
156
 
157
- elif spanroot.pos_ not in ['VERB', "AUX"] and subjless:
158
  category = "Non-finite adv clause 1"
159
 
160
  elif entire_cl:
161
  category = "Finite adverbial clause"
162
 
163
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
164
  # he doing his job
165
  if argmentless:
166
- #e.g., frankly speaking, strictly speaking
167
  category = "Adverbial Phrase"
168
  else:
169
  category = "Non-finite adv clause 2"
170
 
171
- elif spanroot.pos_ not in ['VERB', "AUX"] and "mark" in span_dep and subjless:
172
-
 
173
  category = "Non-finite adv clause 3"
174
-
175
  elif "aux" in c_dep and "TO" in c_tag:
176
  category = "Adverbial Phrase"
177
 
178
-
179
- elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
180
  category = "Dependent Verb phrase"
181
-
182
- elif not argmentless:
183
- category = "Adverbial clause"
184
-
185
- elif spanroot.dep_ == "advcl":
186
- category = "Adverbial phrase"
187
 
 
 
188
 
189
- if spanroot.dep_ in ['relcl', 'ccomp', 'acl']:
 
190
 
 
191
  head = spanroot.head
192
  if ";" in [t.norm_ for t in head.children]:
193
  category = "Main verb 3"
@@ -195,13 +275,20 @@ def construction_classifier(doc, span):
195
  category = "Dependent verb 1"
196
  elif "mark" in span_dep:
197
  category = "Complement clause"
198
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
199
  category = "Non-finite complement clause"
200
- elif spanroot.dep_ in ['relcl']:
201
  category = "Relative clause"
202
- elif spanroot.dep_ in ['ccomp']:
203
  category = "Complement clause"
204
- elif spanroot.dep_ in ['acl']:
205
  category = "Noun Complement clause"
206
  else:
207
  # print(_check_for_to)
@@ -209,55 +296,78 @@ def construction_classifier(doc, span):
209
 
210
  ## Specific constructions
211
  # Extraposed that-clause or to-infinitives
212
- if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in ["VERB", "AUX"]:
 
 
 
213
  print(c_dep)
214
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
215
- #eg it seems odd (oprd) that X.
216
- #eg it is certain (acomp) that X.
217
- category = "Extraposed that-cl (adj-complement)" #e.g., it is certain that X.
 
 
218
 
219
  elif "xcomp" in c_dep or ("advcl" in c_dep):
220
  if "for_mark" in _check_for_to:
221
- category = "Extraposed to-cl (explicit subj)" #eg It is possible to .
 
 
222
  elif _check_to:
223
- category = "Extraposed to-cl 1" #eg It is possible to .
224
  elif _check_ing:
225
- category = "Extraposed -ing 1" #eg It is possible to .
226
- elif ("prep" in right_dep or "npadvmod" in right_dep) and "ccomp" in right_dep and spanroot.lemma_ == "be":
227
- category = "Cleft construction"
 
 
 
 
228
 
229
  elif "attr" in c_dep:
230
- category = "Extraposed that-cl (copula)" #eg It is a wonder that X.
231
 
232
  else:
233
- category = "Extraposed that-cl (VERB)"
234
 
235
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
236
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
237
- elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "acomp" in c_dep:
 
 
238
  if "xcomp" in c_dep:
239
  if _check_to:
240
- category = "Extraposed to-cl 2" #eg it is difficult to decide.
241
  elif _check_ing:
242
- category = "Extraposed -ing 2" #eg it is difficult to decide.
243
-
244
  else:
245
  category = "Extraposed that-cl (adj-complement) 2"
246
 
247
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
 
 
 
248
 
249
- category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
250
-
251
-
252
  # something without dummy subject "it"
253
- elif (("nsubj" in c_dep and spanroot.lemma_ in ['be']) or "nsubjpass" in c_dep) and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm:
254
-
 
 
 
255
  # store xcomp, if the head of the xcomp is acomp
256
- _check_xcomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"]
257
- _check_ccomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"]
 
 
 
 
 
 
 
 
258
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
259
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
260
-
261
 
262
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
263
  if any(root_before_ccomp):
@@ -271,13 +381,13 @@ def construction_classifier(doc, span):
271
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
272
  category = "Post-predicate to-cl"
273
 
274
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_to:
275
  category = "Subject predicate to-cl"
276
 
277
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
278
  category = "Subject predicate to-cl (passive)"
279
 
280
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_ing:
281
  category = "Subject predicate -ing"
282
  elif "ccomp" in c_dep:
283
  category = "Subject predicate that-cl"
@@ -290,9 +400,27 @@ def construction_classifier(doc, span):
290
  category = "Main verb 1"
291
 
292
  ## without dummy subject it, and lexical verbs
293
- elif ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm and spanroot.lemma_ not in ['be']:
294
- _check_wh = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["attr", "advmod", 'dobj', 'nsubj'] and c.tag_ in ["WP", "WRB", "WDT", "WP$"]) and c.head.dep_ == "ccomp"]
295
- _check_if = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) and c.head.dep_ == "ccomp"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
298
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
@@ -315,27 +443,34 @@ def construction_classifier(doc, span):
315
  # Existential
316
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
317
  category = "There is/are NOUN"
318
-
319
- elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
320
- category = "Cleft construction"
321
 
 
 
 
 
322
 
323
- if spanroot.dep_ in ['parataxis']:
324
- if "_".join(span_dep) in ["nsubj_parataxis", "aux_parataxis", "nsubj_aux_parataxis"]:
 
 
 
 
325
  category = "Comment clause"
326
  else:
327
  category = "parataxis (for now)"
328
-
329
 
330
  ## External comp
331
- if spanroot.dep_ in ['xcomp']:
332
- if spanroot.head.pos_ == 'ADJ' and "to_aux" in c_t_dep_:
333
  category = "Adjective complement to-cl"
334
- if spanroot.head.pos_ == 'VERB' and "to_aux" in c_t_dep_:
335
  category = "Verb complement to-cl"
336
-
337
- if spanroot.dep_ in ['pcomp']:
338
- if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] and 'ccomp' in c_dep:
 
 
 
339
  category = "Participle + that-cl"
340
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
341
  category = "Participle"
@@ -345,25 +480,28 @@ def construction_classifier(doc, span):
345
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
346
  # category = "Gerund"
347
 
348
- if spanroot.dep_ in ['neg']:
349
  category = "Negative particle"
350
- if spanroot.dep_ in ['aux', 'auxpass']:
351
  category = "Auxiliary"
352
 
353
  # Modal verbs
354
  if spanroot.tag_ == "MD":
355
  category = "Modal auxiliary"
356
 
357
-
358
- if spanroot.dep_ in ['dep', "csubj", 'csubjpass']:
359
- if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in ['AUX', 'VERB'] and spanroot.pos_ in ['AUX', 'VERB']:
 
 
 
360
  if spanroot.morph == spanroot.head.morph:
361
  category = "Main verb 4"
362
  else:
363
  category = "Dependent verb 2"
364
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
365
  category = "Gerund"
366
- elif spanroot.head.dep_ in ['conj', 'acl','relcl']:
367
  if spanroot.morph == spanroot.head.morph:
368
  category = "Main verb 4"
369
  else:
@@ -372,7 +510,7 @@ def construction_classifier(doc, span):
372
  category = "Dependent verb 2"
373
 
374
  # Appositive phrases
375
- if spanroot.dep_ in ['appos']:
376
  if "nummod" in c_dep:
377
  category = "Apposition"
378
  elif spanroot.pos_ in ["PROPN"]:
@@ -380,21 +518,23 @@ def construction_classifier(doc, span):
380
  elif spanroot.pos_ in ["NOUN"]:
381
  category = "Appositive Noun Phrase"
382
  elif spanroot.pos_ in ["VERB", "AUX"]:
383
- _check = any(c.dep_ in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
384
  if _check:
385
  category = "Appositive Finite-clause"
386
-
387
- if spanroot.dep_ in ['appos', "dep", "attr"]:
388
- if not subjless and spanroot.pos_ in ['VERB', "AUX"]:
389
  category = "Main verb 5"
390
 
391
  if spanroot.dep_ in ["dep", "mark"]:
392
  if spanroot.tag_ in ["RB", "IN", "CC"]:
393
  category = "Conjunction"
394
 
395
-
396
- #sometimes the extra-clausal links are not accurate
397
- if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp"]:
398
  if spanroot.head.dep_ == "ROOT":
399
  category = "Main verb"
400
  else:
@@ -402,7 +542,7 @@ def construction_classifier(doc, span):
402
 
403
  if span.label_ == "CITATION":
404
  if "NNP" in span_tag or "NNPS" in span_tag:
405
- if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
406
  category = "Parenthetical Citation"
407
  elif span_tag[0] in ["NNP", "NNPS"]:
408
  category = "Narrative Citation"
@@ -425,7 +565,6 @@ def construction_classifier2(doc, span):
425
  span_token = [t.norm_ for t in span]
426
  span_tag = [t.tag_ for t in span]
427
 
428
-
429
  c = [c for c in spanroot.children]
430
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
431
 
@@ -436,43 +575,92 @@ def construction_classifier2(doc, span):
436
 
437
  right_dep = [c.dep_ for c in spanroot.rights]
438
 
439
- #conditionals
440
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
441
- argmentless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in spanroot.children)
442
- argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
443
- argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
444
-
445
 
446
  ## nesting classifiers
447
  if spanroot.dep_ == "conj":
448
- while spanroot.dep_ == 'conj':
449
  spanroot = spanroot.head
450
 
451
  if spanroot.dep_ == "poss":
452
  head = spanroot.head
453
- if head.dep_ in ["pobj", "dobj", "obj", "iobj" , "dative"]:
454
  category = "Posessive Noun (Object)"
455
  elif head.dep_ in ["nsubj", "nsubjpass"]:
456
  category = "Posessive Noun (Subject)"
457
  else:
458
  category = "Posessive Noun (Other)"
459
 
460
-
461
- ## Conjunctions
462
  # Preconjunctions
463
- if spanroot.dep_ in ['preconj', 'cc']:
464
  category = "Conjunction"
465
 
466
  ## NOUN PHRASES
467
  # adverbial phrases
468
- if spanroot.dep_ in ['amod']:
469
  category = "Adjectival modifier"
470
  # adverbial phrases
471
- if spanroot.dep_ in ['compound']:
472
  category = "Compound noun"
473
 
474
  ## Nominal category
475
- if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj" , "dative"]:
476
  if "acl" in c_dep:
477
  category = "Noun + Complement (Object)"
478
  else:
@@ -486,22 +674,25 @@ def construction_classifier2(doc, span):
486
 
487
  ## ADJUNCTS
488
  # prep phrases
489
- if spanroot.dep_ in ['prep', 'agent']:
490
- category = 'Prepositional phrase'
491
 
492
  # adverbial phrases
493
- if spanroot.dep_ in ['advmod', "npadvmod", "nmod", "npmod", 'quantmod', 'nummod']:
494
  category = "Adverbial phrase"
495
 
496
  ## Predication patterns
497
- if spanroot.dep_ in ['acomp', 'oprd']:
498
  if "xcomp" in c_dep:
499
  category = "Subject predicate to-cl"
500
  else:
501
  category = "Adjectival complement"
502
 
503
- if spanroot.dep_ in ['attr']:
504
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
505
 
506
  c_head = [c.dep_ for c in spanroot.head.children]
507
  if "expl" in c_head and "no_det" in span_t_dep_:
@@ -510,28 +701,31 @@ def construction_classifier2(doc, span):
510
  category = "There is/are + Noun complement"
511
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
512
  category = "There is/are + Noun complement"
513
-
514
  elif spanroot.pos_ in ["NOUN", "PRON"]:
515
  if "acl" in c_dep:
516
  category = "Noun + Complement (attr)"
517
  else:
518
  category = "Nominal complement"
519
 
520
- elif not subjless and spanroot.pos_ in ['VERB', "AUX"]:
521
  category = "Main verb 4"
522
 
523
- elif spanroot.tag_ in ['NNP']:
524
  category = "Nominal complement"
525
 
526
  ## External comp
527
- if spanroot.dep_ in ['xcomp']:
528
- if spanroot.head.pos_ == 'ADJ' and "to_aux" in c_t_dep_:
529
  category = "Adjective complement to-cl"
530
- if spanroot.head.pos_ == 'VERB' and "to_aux" in c_t_dep_:
531
  category = "Verb complement to-cl"
532
-
533
- if spanroot.dep_ in ['pcomp']:
534
- if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] and 'ccomp' in c_dep:
 
 
 
535
  category = "Participle + that-cl"
536
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
537
  category = "Participle"
@@ -541,86 +735,117 @@ def construction_classifier2(doc, span):
541
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
542
  # category = "Gerund"
543
 
544
- if spanroot.dep_ in ['neg']:
545
  category = "Negative particle"
546
- if spanroot.dep_ in ['aux', 'auxpass']:
547
  category = "Auxiliary"
548
 
549
  # Modal verbs
550
  if spanroot.tag_ == "MD":
551
  category = "Modal auxiliary"
552
 
553
-
554
  ####################################
555
  ### clausal ####
556
  ####################################
557
- if spanroot.dep_ in ["ROOT", "advcl", "ccomp", 'acl', 'pcomp', 'relcl', 'punct']:
558
-
559
- _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
560
- _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
561
- root_before_ccomp = [c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"]
562
-
563
- _check_for_to = ["_".join([c.norm_, c.dep_]) for c in spanroot.subtree if c.head.dep_ == "advcl" and (c.dep_=="mark" or c.dep_ == "aux")]
564
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
565
-
566
 
567
  ## Start with broad category, which is then re-evaluated for specific constructions.
568
- if spanroot.dep_ in ['advcl', 'acl', 'punct', 'pcomp']: #'mark',
569
  ## Adverbial clauses
570
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
571
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
 
 
 
 
 
572
 
573
  ### Finite-adverbial clauses
574
- if "mark" in span_dep and (spanroot.pos_ in ['VERB', "AUX"] or "aux" in span_dep ):
 
 
575
  category = "Finite adverbial clause"
576
 
577
- elif "mark" in span_dep and "aux" in span_dep :
578
  category = "Finite adverbial clause"
579
 
580
- elif "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"] and "expl" in c_dep:
 
 
 
 
581
  category = "Finite adverbial clause"
582
 
583
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
584
- if spanroot.pos_ in ['VERB', "AUX"]:
585
  category = "Finite adverbial clause"
586
 
587
- elif spanroot.pos_ not in ['VERB', "AUX"] and subjless:
588
  category = "Non-finite adv clause 1"
589
 
590
  elif not argmentless:
591
- category = "Finite adverbial clause"
592
 
593
  ## non-finite
594
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
595
  # he doing his job
596
  if argmentless:
597
- #e.g., frankly speaking, strictly speaking
598
  category = "Adverbial Phrase"
599
  else:
600
  category = "Non-finite adv clause 2"
601
 
602
- elif spanroot.pos_ not in ['VERB', "AUX"] and "mark" in span_dep and subjless:
603
-
 
604
  category = "Non-finite adv clause 3"
605
-
606
  elif "aux" in c_dep and "TO" in c_tag:
607
  category = "Adverbial Phrase"
608
 
609
-
610
- elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
611
  category = "Dependent Verb phrase"
612
-
613
  elif not argmentless:
614
- category = "Adverbial clause"
615
-
616
  elif spanroot.dep_ == "advcl":
617
- category = "Adverbial phrase"
618
-
619
  else:
620
  category = "Finite adverbial clause "
621
 
622
- if spanroot.dep_ in ['relcl', 'ccomp', 'acl', 'punct', "pcomp"]:
623
-
624
  head = spanroot.head
625
  if ";" in [t.norm_ for t in head.children]:
626
  category = "Main verb 3"
@@ -630,66 +855,96 @@ def construction_classifier2(doc, span):
630
 
631
  elif "mark" in span_dep:
632
  category = "Complement clause"
633
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
634
  category = "Non-finite complement clause"
635
- elif spanroot.dep_ in ['relcl']:
636
  category = "Relative clause"
637
- elif spanroot.dep_ in ['ccomp']:
638
  category = "Complement clause"
639
- elif spanroot.dep_ in ['acl']:
640
  category = "Noun Complement clause"
641
 
642
  ## Specific constructions
643
  # Extraposed that-clause or to-infinitives
644
- if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in ["VERB", "AUX"]:
 
 
 
645
  # print(c_dep)
646
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
647
- #eg it seems odd (oprd) that X.
648
- #eg it is certain (acomp) that X.
649
- category = "Extraposed that-cl (adj-complement)" #e.g., it is certain that X.
 
 
650
 
651
  elif "xcomp" in c_dep or ("advcl" in c_dep):
652
  if "for_mark" in _check_for_to:
653
- category = "Extraposed to-cl (explicit subj)" #eg It is possible to .
 
 
654
  elif _check_to:
655
- category = "Extraposed to-cl 1" #eg It is possible to .
656
  elif _check_ing:
657
- category = "Extraposed -ing 1" #eg It is possible to .
658
- elif ("prep" in right_dep or "npadvmod" in right_dep) and "ccomp" in right_dep and spanroot.lemma_ == "be":
659
- category = "Cleft construction"
 
 
 
 
660
 
661
  elif "attr" in c_dep:
662
- category = "Extraposed that-cl (copula)" #eg It is a wonder that X.
663
 
664
  else:
665
- category = "Extraposed that-cl (VERB)"
666
 
667
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
668
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
669
- elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "acomp" in c_dep:
 
 
670
  if "xcomp" in c_dep:
671
  if _check_to:
672
- category = "Extraposed to-cl 2" #eg it is difficult to decide.
673
  elif _check_ing:
674
- category = "Extraposed -ing 2" #eg it is difficult to decide.
675
-
676
  else:
677
  category = "Extraposed that-cl (adj-complement) 2"
678
 
679
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
 
 
 
680
 
681
- category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
682
-
683
-
684
  # something without dummy subject "it"
685
- elif (("nsubj" in c_dep and spanroot.lemma_ in ['be']) or "nsubjpass" in c_dep) and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm:
686
-
 
 
 
687
  # store xcomp, if the head of the xcomp is acomp
688
- _check_xcomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"]
689
- _check_ccomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"]
 
 
 
 
 
 
 
 
690
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
691
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
692
-
693
 
694
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
695
  if any(root_before_ccomp):
@@ -703,13 +958,13 @@ def construction_classifier2(doc, span):
703
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
704
  category = "Post-predicate to-cl"
705
 
706
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_to:
707
  category = "Subject predicate to-cl"
708
 
709
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
710
  category = "Subject predicate to-cl (passive)"
711
 
712
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_ing:
713
  category = "Subject predicate -ing"
714
  elif "ccomp" in c_dep:
715
  category = "Subject predicate that-cl"
@@ -724,9 +979,27 @@ def construction_classifier2(doc, span):
724
  category = "Main verb 1"
725
 
726
  ## without dummy subject it, and lexical verbs
727
- elif ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm and spanroot.lemma_ not in ['be']:
728
- _check_wh = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["attr", "advmod", 'dobj', 'nsubj'] and c.tag_ in ["WP", "WRB", "WDT", "WP$"]) and c.head.dep_ == "ccomp"]
729
- _check_if = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) and c.head.dep_ == "ccomp"]
730
 
731
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
732
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
@@ -746,40 +1019,48 @@ def construction_classifier2(doc, span):
746
  elif _check_ing:
747
  category = "Post-predicate -ing"
748
 
749
-
750
-
751
  # Existential
752
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
753
  category = "There is/are NOUN"
754
-
755
- elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
 
 
756
  category = "Cleft construction"
757
 
758
  ### The end of clausal analysis
759
-
760
- if spanroot.dep_ in ['parataxis']:
761
- if "_".join(span_dep) in ["nsubj_parataxis", "aux_parataxis", "nsubj_aux_parataxis"]:
 
 
 
 
762
  category = "Comment clause"
763
  else:
764
  category = "Parataxis"
765
-
766
 
767
- if spanroot.dep_ in ['dep', "csubj", 'csubjpass']:
768
- if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in ['AUX', 'VERB'] and spanroot.pos_ in ['AUX', 'VERB']:
 
 
 
 
769
  if spanroot.morph == spanroot.head.morph:
770
  category = "Main verb 4"
771
  else:
772
  category = "Dependent verb 2"
773
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
774
  category = "Gerund"
775
- elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(spanroot.morph):
 
 
776
  category = "Dependent verb 2"
777
- elif spanroot.dep_ in ["csubj", 'csubjpass']:
778
  category = "Dependent verb (csubj)"
779
 
780
-
781
  # Appositive phrases
782
- if spanroot.dep_ in ['appos']:
783
  if "nummod" in c_dep:
784
  category = "Apposition"
785
  if spanroot.pos_ in ["PROPN"]:
@@ -787,16 +1068,18 @@ def construction_classifier2(doc, span):
787
  elif spanroot.pos_ in ["NOUN"]:
788
  category = "Appositive Noun Phrase"
789
  elif spanroot.pos_ in ["VERB", "AUX"]:
790
- _check = any(c.dep_ in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
791
  if _check:
792
  category = "Appositive Finite-clause"
793
-
794
 
795
- if spanroot.dep_ in ['appos', "dep", "attr"]:
796
- if not subjless and spanroot.pos_ in ['VERB', "AUX"]:
797
  category = "Main verb (likely parsing error)"
798
 
799
- #sometimes the dep are on the conjunctions
800
  if spanroot.dep_ in ["dep", "mark"]:
801
  if spanroot.tag_ in ["RB", "IN", "CC"]:
802
  category = "Conjunction"
@@ -804,9 +1087,12 @@ def construction_classifier2(doc, span):
804
  if spanroot.dep_ in ["intj"]:
805
  category = "Introjection"
806
 
807
-
808
- #sometimes the extra-clausal links are not accurate
809
- if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp", "attr", 'dep', "meta", 'prt'] and category == None:
 
 
 
810
  if spanroot.head.dep_ == "ROOT":
811
  category = "Main verb"
812
  else:
@@ -814,7 +1100,7 @@ def construction_classifier2(doc, span):
814
 
815
  if span.label_ == "CITATION":
816
  if "NNP" in span_tag or "NNPS" in span_tag:
817
- if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
818
  category = "Parenthetical Citation"
819
  elif span_tag[0] in ["NNP", "NNPS"]:
820
  category = "Narrative Citation"
@@ -827,18 +1113,32 @@ def construction_classifier2(doc, span):
827
  return category
828
 
829
 
830
-
831
- def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
832
- spans_key: str = "sc",
833
- attrs: List[str] = SPAN_ATTRS):
834
- columns = attrs + ["Conf. score", "sent no.", "grammatical realization", 'span dep', "ner",
835
- "POS", 'span dep seq', "TAG sequence", "POS sequence", "head", "head dep", "children", "morphology", "sent"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  data = []
837
  # data = span_info_aggregator(doc, columns)
838
  sentences = {s: i for i, s in enumerate(doc.sents)}
839
 
840
- for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):
841
-
842
  span_info = []
843
  span_info.extend([str(getattr(span, attr)) for attr in attrs])
844
 
@@ -854,7 +1154,7 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
854
  span_info.append(span.root.head.norm_)
855
  span_info.append(span.root.head.dep_)
856
  span_info.append("_".join([c.dep_ for c in span.root.children]))
857
- span_info.append(span.root.morph)
858
  span_info.append(span.sent.text.strip())
859
 
860
  data.append(span_info)
@@ -862,27 +1162,27 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
862
  return data, columns
863
 
864
 
865
- def ngrammar(seq: list, n=2, concat = False, sep = "-"):
866
  result = []
867
  n_item = len(seq)
868
  for idx, item in enumerate(seq):
869
  if idx + n <= n_item:
870
  if concat:
871
- result.append(sep.join(seq[idx: idx + n]))
872
  else:
873
- result.append(seq[idx: idx + n])
874
  return result
875
 
876
 
877
  def diversity_values(count_vec: list):
878
  result = {}
879
  if len(count_vec) == 0:
880
- count_vec = [0,0,0,0,0,0,0,0,0,0]
881
 
882
- result['shannon'] = dv.alpha.shannon(list(count_vec), base=2)
883
- result['brillouin_d'] = dv.alpha.brillouin_d(list(count_vec))
884
- result["simpson_d"] = 1- dv.alpha.simpson(list(count_vec))
885
- result['simpson_e'] = dv.alpha.simpson_e(list(count_vec))
886
  # result['gini_index'] = dv.alpha.gini_index(list(count_vec))
887
  # result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
888
 
 
 
1
  from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
2
  import pandas as pd
3
  import spacy
 
5
  from skbio import diversity as dv
6
 
7
  SPAN_ATTRS = ["text", "label_", "start", "end"]
8
+ CATEGORIES = [
9
+ "ATTRIBUTION",
10
+ "CITATION",
11
+ "COUNTER",
12
+ "DENY",
13
+ "ENDOPHORIC",
14
+ "ENTERTAIN",
15
+ "JUSTIFYING",
16
+ "MONOGLOSS",
17
+ "PROCLAIM",
18
+ "SOURCES",
19
+ ]
20
+
21
+
22
+ def simple_table(
23
+ doc: Union[spacy.tokens.Doc, Dict[str, str]],
24
+ spans_key: str = "sc",
25
+ attrs: List[str] = SPAN_ATTRS,
26
+ ):
27
  columns = attrs + ["Conf. score"]
28
  data = [
29
+ [str(getattr(span, attr)) for attr in attrs] + [score] # [f'{score:.5f}']
30
+ for span, score in zip(
31
+ doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]
32
+ )
33
  ]
34
  return data, columns
35
 
36
 
37
  # def span_info_aggregator()
38
 
39
+
40
  def construction_classifier(doc, span):
41
  category = None
42
  spanroot = span.root
 
47
  span_token = [t.norm_ for t in span]
48
  span_tag = [t.tag_ for t in span]
49
 
 
50
  c = [c for c in spanroot.children]
51
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
52
 
 
57
 
58
  right_dep = [c.dep_ for c in spanroot.rights]
59
 
60
+ # conditionals
61
+ subjless = all(
62
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
63
+ for c in spanroot.children
64
+ )
65
+ argmentless = all(
66
+ c.dep_
67
+ not in [
68
+ "nsubj",
69
+ "nsubjpass",
70
+ "csubj",
71
+ "csubjpass",
72
+ "dobj",
73
+ "ccomp",
74
+ "xcomp",
75
+ "dative",
76
+ "attr",
77
+ "oprd",
78
+ "acomp",
79
+ ]
80
+ for c in spanroot.children
81
+ )
82
+ argless_span = all(
83
+ c.dep_
84
+ not in [
85
+ "nsubj",
86
+ "nsubjpass",
87
+ "csubj",
88
+ "csubjpass",
89
+ "dobj",
90
+ "ccomp",
91
+ "xcomp",
92
+ "dative",
93
+ "attr",
94
+ "oprd",
95
+ "acomp",
96
+ ]
97
+ for c in span
98
+ )
99
 
100
  ## nesting classifiers
101
  if spanroot.dep_ == "conj":
102
+ while spanroot.dep_ == "conj":
103
  spanroot = spanroot.head
104
  # if spanroot.dep_ == "poss":
105
  # while spanroot.dep_ == 'poss':
106
  # spanroot = spanroot.head
107
 
108
+ ## Conjunctions
109
  # Preconjunctions
110
+ if spanroot.dep_ in ["preconj", "cc"]:
111
  category = "Conjunction"
112
 
113
  ## NOUN PHRASES
114
  # adverbial phrases
115
+ if spanroot.dep_ in ["amod"]:
116
  category = "Adjectival modifier"
117
  # adverbial phrases
118
+ if spanroot.dep_ in ["compound"]:
119
  category = "Compound noun"
120
 
121
  ## Nominal category
 
133
 
134
  ## ADJUNCTS
135
  # prep phrases
136
+ if spanroot.dep_ in ["prep", "agent"]:
137
+ category = "Prepositional phrase"
138
  # adverbial phrases
139
+ if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]:
140
  category = "Adverbial phrase"
141
 
142
  ## Predication patterns
143
+ if spanroot.dep_ in ["acomp", "oprd"]:
144
  if "xcomp" in c_dep:
145
  category = "Subject predicate to-cl"
146
  else:
147
  category = "Adjectival complement"
148
 
149
+ if spanroot.dep_ in ["attr"]:
150
+ subjless = all(
151
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
152
+ for c in spanroot.children
153
+ )
154
 
155
  c_head = [c.dep_ for c in spanroot.head.children]
156
  if "expl" in c_head and "no_det" in span_t_dep_:
 
159
  category = "There is/are + Noun complement"
160
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
161
  category = "There is/are + Noun complement"
162
+
163
  elif spanroot.pos_ in ["NOUN", "PRON"]:
164
  if "acl" in c_dep:
165
  category = "Noun + Complement (attr)"
166
  else:
167
  category = "Nominal complement"
168
 
169
+ elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
170
  category = "Main verb 4"
171
 
172
+ elif spanroot.tag_ in ["NNP"]:
173
  category = "Nominal complement"
174
 
 
175
  ####################################
176
  ### clausal ####
177
  ####################################
178
+ if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]:
179
+ _check_to = [
180
+ c.dep_
181
+ for c in spanroot.subtree
182
+ if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
183
+ and c.head.dep_ == "xcomp"
184
+ ]
185
+ _check_ing = [
186
+ c.dep_
187
+ for c in spanroot.subtree
188
+ if "Prog" in str(c.morph) and c.dep_ == "xcomp"
189
+ ]
190
+ root_before_ccomp = [
191
+ c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
192
+ ]
193
+
194
+ _check_for_to = [
195
+ "_".join([c.norm_, c.dep_])
196
+ for c in spanroot.subtree
197
+ if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
198
+ ]
199
+ entire_cl = (
200
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
201
+ )
202
 
203
  ## Start with broad category, which is then re-evaluated for specific constructions.
204
+ if spanroot.dep_ in ["advcl", "mark", "acl", "pcomp"]:
205
  ## Adverbial clauses
206
  ### Finite-adverbial clauses
207
  ### Non-finite adverbial clauses
208
+ subjless = all(
209
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
210
+ for c in spanroot.children
211
+ )
212
+ entire_cl = (
213
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
214
+ )
215
+
216
+ if "mark" in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
217
  category = "Finite adverbial clause"
218
+ elif "mark" in span_dep and "aux" in span_dep:
219
  category = "Finite adverbial clause"
220
 
221
+ elif (
222
+ "mark" in span_dep
223
+ and spanroot.pos_ in ["VERB", "AUX"]
224
+ and "expl" in c_dep
225
+ ):
226
  category = "Finite adverbial clause"
227
 
228
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
229
+ if spanroot.pos_ in ["VERB", "AUX"]:
230
  category = "Finite adverbial clause"
231
 
232
+ elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
233
  category = "Non-finite adv clause 1"
234
 
235
  elif entire_cl:
236
  category = "Finite adverbial clause"
237
 
238
+ elif (
239
+ str(spanroot.morph)
240
+ in [
241
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
242
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
243
+ ]
244
+ and "aux" not in c_dep
245
+ ):
246
  # he doing his job
247
  if argmentless:
248
+ # e.g., frankly speaking, strictly speaking
249
  category = "Adverbial Phrase"
250
  else:
251
  category = "Non-finite adv clause 2"
252
 
253
+ elif (
254
+ spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
255
+ ):
256
  category = "Non-finite adv clause 3"
257
+
258
  elif "aux" in c_dep and "TO" in c_tag:
259
  category = "Adverbial Phrase"
260
 
261
+ elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 
262
  category = "Dependent Verb phrase"
 
 
 
 
 
 
263
 
264
+ elif not argmentless:
265
+ category = "Adverbial clause"
266
 
267
+ elif spanroot.dep_ == "advcl":
268
+ category = "Adverbial phrase"
269
 
270
+ if spanroot.dep_ in ["relcl", "ccomp", "acl"]:
271
  head = spanroot.head
272
  if ";" in [t.norm_ for t in head.children]:
273
  category = "Main verb 3"
 
275
  category = "Dependent verb 1"
276
  elif "mark" in span_dep:
277
  category = "Complement clause"
278
+ elif (
279
+ str(spanroot.morph)
280
+ in [
281
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
282
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
283
+ ]
284
+ and "aux" not in c_dep
285
+ ):
286
  category = "Non-finite complement clause"
287
+ elif spanroot.dep_ in ["relcl"]:
288
  category = "Relative clause"
289
+ elif spanroot.dep_ in ["ccomp"]:
290
  category = "Complement clause"
291
+ elif spanroot.dep_ in ["acl"]:
292
  category = "Noun Complement clause"
293
  else:
294
  # print(_check_for_to)
 
296
 
297
  ## Specific constructions
298
  # Extraposed that-clause or to-infinitives
299
+ if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
300
+ "VERB",
301
+ "AUX",
302
+ ]:
303
  print(c_dep)
304
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
305
+ # eg it seems odd (oprd) that X.
306
+ # eg it is certain (acomp) that X.
307
+ category = (
308
+ "Extraposed that-cl (adj-complement)" # e.g., it is certain that X.
309
+ )
310
 
311
  elif "xcomp" in c_dep or ("advcl" in c_dep):
312
  if "for_mark" in _check_for_to:
313
+ category = (
314
+ "Extraposed to-cl (explicit subj)" # eg It is possible to .
315
+ )
316
  elif _check_to:
317
+ category = "Extraposed to-cl 1" # eg It is possible to .
318
  elif _check_ing:
319
+ category = "Extraposed -ing 1" # eg It is possible to .
320
+ elif (
321
+ ("prep" in right_dep or "npadvmod" in right_dep)
322
+ and "ccomp" in right_dep
323
+ and spanroot.lemma_ == "be"
324
+ ):
325
+ category = "Cleft construction"
326
 
327
  elif "attr" in c_dep:
328
+ category = "Extraposed that-cl (copula)" # eg It is a wonder that X.
329
 
330
  else:
331
+ category = "Extraposed that-cl (VERB)"
332
 
333
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
334
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
335
+ elif (
336
+ "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
337
+ ) and "acomp" in c_dep:
338
  if "xcomp" in c_dep:
339
  if _check_to:
340
+ category = "Extraposed to-cl 2" # eg it is difficult to decide.
341
  elif _check_ing:
342
+ category = "Extraposed -ing 2" # eg it is difficult to decide.
343
+
344
  else:
345
  category = "Extraposed that-cl (adj-complement) 2"
346
 
347
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
348
+ category = (
349
+ "Extraposed that-cl (adj-complement) 3" # eg it seems odd that X.
350
+ )
351
 
 
 
 
352
  # something without dummy subject "it"
353
+ elif (
354
+ (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
355
+ and spanroot.pos_ in ["AUX", "VERB"]
356
+ and "it" not in c_norm
357
+ ):
358
  # store xcomp, if the head of the xcomp is acomp
359
+ _check_xcomp = [
360
+ c.dep_
361
+ for c in spanroot.subtree
362
+ if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
363
+ ]
364
+ _check_ccomp = [
365
+ c.dep_
366
+ for c in spanroot.subtree
367
+ if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
368
+ ]
369
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
370
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
371
 
372
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
373
  if any(root_before_ccomp):
 
381
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
382
  category = "Post-predicate to-cl"
383
 
384
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
385
  category = "Subject predicate to-cl"
386
 
387
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
388
  category = "Subject predicate to-cl (passive)"
389
 
390
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
391
  category = "Subject predicate -ing"
392
  elif "ccomp" in c_dep:
393
  category = "Subject predicate that-cl"
 
400
  category = "Main verb 1"
401
 
402
  ## without dummy subject it, and lexical verbs
403
+ elif (
404
+ ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep
405
+ and spanroot.pos_ in ["AUX", "VERB"]
406
+ and "it" not in c_norm
407
+ and spanroot.lemma_ not in ["be"]
408
+ ):
409
+ _check_wh = [
410
+ c.dep_
411
+ for c in spanroot.subtree
412
+ if (
413
+ c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
414
+ and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
415
+ )
416
+ and c.head.dep_ == "ccomp"
417
+ ]
418
+ _check_if = [
419
+ c.dep_
420
+ for c in spanroot.subtree
421
+ if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
422
+ and c.head.dep_ == "ccomp"
423
+ ]
424
 
425
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
426
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
443
  # Existential
444
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
445
  category = "There is/are NOUN"
 
 
 
446
 
447
+ elif (
448
+ "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
449
+ ):
450
+ category = "Cleft construction"
451
 
452
+ if spanroot.dep_ in ["parataxis"]:
453
+ if "_".join(span_dep) in [
454
+ "nsubj_parataxis",
455
+ "aux_parataxis",
456
+ "nsubj_aux_parataxis",
457
+ ]:
458
  category = "Comment clause"
459
  else:
460
  category = "parataxis (for now)"
 
461
 
462
  ## External comp
463
+ if spanroot.dep_ in ["xcomp"]:
464
+ if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
465
  category = "Adjective complement to-cl"
466
+ if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
467
  category = "Verb complement to-cl"
468
+
469
+ if spanroot.dep_ in ["pcomp"]:
470
+ if (
471
+ str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
472
+ and "ccomp" in c_dep
473
+ ):
474
  category = "Participle + that-cl"
475
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
476
  category = "Participle"
 
480
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
481
  # category = "Gerund"
482
 
483
+ if spanroot.dep_ in ["neg"]:
484
  category = "Negative particle"
485
+ if spanroot.dep_ in ["aux", "auxpass"]:
486
  category = "Auxiliary"
487
 
488
  # Modal verbs
489
  if spanroot.tag_ == "MD":
490
  category = "Modal auxiliary"
491
 
492
+ if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
493
+ if (
494
+ spanroot.head.dep_ in ["ROOT", "ccomp"]
495
+ and spanroot.head.pos_ in ["AUX", "VERB"]
496
+ and spanroot.pos_ in ["AUX", "VERB"]
497
+ ):
498
  if spanroot.morph == spanroot.head.morph:
499
  category = "Main verb 4"
500
  else:
501
  category = "Dependent verb 2"
502
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
503
  category = "Gerund"
504
+ elif spanroot.head.dep_ in ["conj", "acl", "relcl"]:
505
  if spanroot.morph == spanroot.head.morph:
506
  category = "Main verb 4"
507
  else:
 
510
  category = "Dependent verb 2"
511
 
512
  # Appositive phrases
513
+ if spanroot.dep_ in ["appos"]:
514
  if "nummod" in c_dep:
515
  category = "Apposition"
516
  elif spanroot.pos_ in ["PROPN"]:
 
518
  elif spanroot.pos_ in ["NOUN"]:
519
  category = "Appositive Noun Phrase"
520
  elif spanroot.pos_ in ["VERB", "AUX"]:
521
+ _check = any(
522
+ c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
523
+ for c in spanroot.children
524
+ )
525
  if _check:
526
  category = "Appositive Finite-clause"
527
+
528
+ if spanroot.dep_ in ["appos", "dep", "attr"]:
529
+ if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
530
  category = "Main verb 5"
531
 
532
  if spanroot.dep_ in ["dep", "mark"]:
533
  if spanroot.tag_ in ["RB", "IN", "CC"]:
534
  category = "Conjunction"
535
 
536
+ # sometimes the extra-clausal links are not accurate
537
+ if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]:
 
538
  if spanroot.head.dep_ == "ROOT":
539
  category = "Main verb"
540
  else:
 
542
 
543
  if span.label_ == "CITATION":
544
  if "NNP" in span_tag or "NNPS" in span_tag:
545
+ if span_dep[0] == "punct" and span_dep[-1] == "punct":
546
  category = "Parenthetical Citation"
547
  elif span_tag[0] in ["NNP", "NNPS"]:
548
  category = "Narrative Citation"
 
565
  span_token = [t.norm_ for t in span]
566
  span_tag = [t.tag_ for t in span]
567
 
 
568
  c = [c for c in spanroot.children]
569
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
570
 
 
575
 
576
  right_dep = [c.dep_ for c in spanroot.rights]
577
 
578
+ # conditionals
579
+ subjless = all(
580
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
581
+ for c in spanroot.children
582
+ )
583
+ argmentless = all(
584
+ c.dep_
585
+ not in [
586
+ "nsubj",
587
+ "nsubjpass",
588
+ "csubj",
589
+ "csubjpass",
590
+ "dobj",
591
+ "ccomp",
592
+ "xcomp",
593
+ "dative",
594
+ "attr",
595
+ "oprd",
596
+ "acomp",
597
+ ]
598
+ for c in spanroot.children
599
+ )
600
+ argless_span = all(
601
+ c.dep_
602
+ not in [
603
+ "nsubj",
604
+ "nsubjpass",
605
+ "csubj",
606
+ "csubjpass",
607
+ "dobj",
608
+ "ccomp",
609
+ "xcomp",
610
+ "dative",
611
+ "attr",
612
+ "oprd",
613
+ "acomp",
614
+ ]
615
+ for c in span
616
+ )
617
+ argless_span = all(
618
+ c.dep_
619
+ not in [
620
+ "nsubj",
621
+ "nsubjpass",
622
+ "csubj",
623
+ "csubjpass",
624
+ "dobj",
625
+ "ccomp",
626
+ "xcomp",
627
+ "dative",
628
+ "attr",
629
+ "oprd",
630
+ "acomp",
631
+ ]
632
+ for c in span
633
+ )
634
 
635
  ## nesting classifiers
636
  if spanroot.dep_ == "conj":
637
+ while spanroot.dep_ == "conj":
638
  spanroot = spanroot.head
639
 
640
  if spanroot.dep_ == "poss":
641
  head = spanroot.head
642
+ if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
643
  category = "Posessive Noun (Object)"
644
  elif head.dep_ in ["nsubj", "nsubjpass"]:
645
  category = "Posessive Noun (Subject)"
646
  else:
647
  category = "Posessive Noun (Other)"
648
 
649
+ ## Conjunctions
 
650
  # Preconjunctions
651
+ if spanroot.dep_ in ["preconj", "cc"]:
652
  category = "Conjunction"
653
 
654
  ## NOUN PHRASES
655
  # adverbial phrases
656
+ if spanroot.dep_ in ["amod"]:
657
  category = "Adjectival modifier"
658
  # adverbial phrases
659
+ if spanroot.dep_ in ["compound"]:
660
  category = "Compound noun"
661
 
662
  ## Nominal category
663
+ if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
664
  if "acl" in c_dep:
665
  category = "Noun + Complement (Object)"
666
  else:
 
674
 
675
  ## ADJUNCTS
676
  # prep phrases
677
+ if spanroot.dep_ in ["prep", "agent"]:
678
+ category = "Prepositional phrase"
679
 
680
  # adverbial phrases
681
+ if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]:
682
  category = "Adverbial phrase"
683
 
684
  ## Predication patterns
685
+ if spanroot.dep_ in ["acomp", "oprd"]:
686
  if "xcomp" in c_dep:
687
  category = "Subject predicate to-cl"
688
  else:
689
  category = "Adjectival complement"
690
 
691
+ if spanroot.dep_ in ["attr"]:
692
+ subjless = all(
693
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
694
+ for c in spanroot.children
695
+ )
696
 
697
  c_head = [c.dep_ for c in spanroot.head.children]
698
  if "expl" in c_head and "no_det" in span_t_dep_:
 
701
  category = "There is/are + Noun complement"
702
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
703
  category = "There is/are + Noun complement"
704
+
705
  elif spanroot.pos_ in ["NOUN", "PRON"]:
706
  if "acl" in c_dep:
707
  category = "Noun + Complement (attr)"
708
  else:
709
  category = "Nominal complement"
710
 
711
+ elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
712
  category = "Main verb 4"
713
 
714
+ elif spanroot.tag_ in ["NNP"]:
715
  category = "Nominal complement"
716
 
717
  ## External comp
718
+ if spanroot.dep_ in ["xcomp"]:
719
+ if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
720
  category = "Adjective complement to-cl"
721
+ if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
722
  category = "Verb complement to-cl"
723
+
724
+ if spanroot.dep_ in ["pcomp"]:
725
+ if (
726
+ str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
727
+ and "ccomp" in c_dep
728
+ ):
729
  category = "Participle + that-cl"
730
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
731
  category = "Participle"
 
735
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
736
  # category = "Gerund"
737
 
738
+ if spanroot.dep_ in ["neg"]:
739
  category = "Negative particle"
740
+ if spanroot.dep_ in ["aux", "auxpass"]:
741
  category = "Auxiliary"
742
 
743
  # Modal verbs
744
  if spanroot.tag_ == "MD":
745
  category = "Modal auxiliary"
746
 
 
747
  ####################################
748
  ### clausal ####
749
  ####################################
750
+ if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]:
751
+ _check_to = [
752
+ c.dep_
753
+ for c in spanroot.subtree
754
+ if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
755
+ and c.head.dep_ == "xcomp"
756
+ ]
757
+ _check_ing = [
758
+ c.dep_
759
+ for c in spanroot.subtree
760
+ if "Prog" in str(c.morph) and c.dep_ == "xcomp"
761
+ ]
762
+ root_before_ccomp = [
763
+ c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
764
+ ]
765
+
766
+ _check_for_to = [
767
+ "_".join([c.norm_, c.dep_])
768
+ for c in spanroot.subtree
769
+ if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
770
+ ]
771
+ entire_cl = (
772
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
773
+ )
774
 
775
  ## Start with broad category, which is then re-evaluated for specific constructions.
776
+ if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]: #'mark',
777
  ## Adverbial clauses
778
+ subjless = all(
779
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
780
+ for c in spanroot.children
781
+ )
782
+ entire_cl = (
783
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
784
+ )
785
 
786
  ### Finite-adverbial clauses
787
+ if "mark" in span_dep and (
788
+ spanroot.pos_ in ["VERB", "AUX"] or "aux" in span_dep
789
+ ):
790
  category = "Finite adverbial clause"
791
 
792
+ elif "mark" in span_dep and "aux" in span_dep:
793
  category = "Finite adverbial clause"
794
 
795
+ elif (
796
+ "mark" in span_dep
797
+ and spanroot.pos_ in ["VERB", "AUX"]
798
+ and "expl" in c_dep
799
+ ):
800
  category = "Finite adverbial clause"
801
 
802
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
803
+ if spanroot.pos_ in ["VERB", "AUX"]:
804
  category = "Finite adverbial clause"
805
 
806
+ elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
807
  category = "Non-finite adv clause 1"
808
 
809
  elif not argmentless:
810
+ category = "Finite adverbial clause"
811
 
812
  ## non-finite
813
+ elif (
814
+ str(spanroot.morph)
815
+ in [
816
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
817
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
818
+ ]
819
+ and "aux" not in c_dep
820
+ ):
821
  # he doing his job
822
  if argmentless:
823
+ # e.g., frankly speaking, strictly speaking
824
  category = "Adverbial Phrase"
825
  else:
826
  category = "Non-finite adv clause 2"
827
 
828
+ elif (
829
+ spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
830
+ ):
831
  category = "Non-finite adv clause 3"
832
+
833
  elif "aux" in c_dep and "TO" in c_tag:
834
  category = "Adverbial Phrase"
835
 
836
+ elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 
837
  category = "Dependent Verb phrase"
838
+
839
  elif not argmentless:
840
+ category = "Adverbial clause"
841
+
842
  elif spanroot.dep_ == "advcl":
843
+ category = "Adverbial phrase"
844
+
845
  else:
846
  category = "Finite adverbial clause "
847
 
848
+ if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]:
 
849
  head = spanroot.head
850
  if ";" in [t.norm_ for t in head.children]:
851
  category = "Main verb 3"
 
855
 
856
  elif "mark" in span_dep:
857
  category = "Complement clause"
858
+ elif (
859
+ str(spanroot.morph)
860
+ in [
861
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
862
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
863
+ ]
864
+ and "aux" not in c_dep
865
+ ):
866
  category = "Non-finite complement clause"
867
+ elif spanroot.dep_ in ["relcl"]:
868
  category = "Relative clause"
869
+ elif spanroot.dep_ in ["ccomp"]:
870
  category = "Complement clause"
871
+ elif spanroot.dep_ in ["acl"]:
872
  category = "Noun Complement clause"
873
 
874
  ## Specific constructions
875
  # Extraposed that-clause or to-infinitives
876
+ if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
877
+ "VERB",
878
+ "AUX",
879
+ ]:
880
  # print(c_dep)
881
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
882
+ # e.g., it seems odd (oprd) that X.
883
+ # e.g., it is certain (acomp) that X.
884
+ category = (
885
+ "Extraposed that-cl (adj-complement)" # e.g., it is certain that X.
886
+ )
887
 
888
  elif "xcomp" in c_dep or ("advcl" in c_dep):
889
  if "for_mark" in _check_for_to:
890
+ category = (
891
+ "Extraposed to-cl (explicit subj)" # eg It is possible to .
892
+ )
893
  elif _check_to:
894
+ category = "Extraposed to-cl 1" # eg It is possible to .
895
  elif _check_ing:
896
+ category = "Extraposed -ing 1" # eg It is possible to .
897
+ elif (
898
+ ("prep" in right_dep or "npadvmod" in right_dep)
899
+ and "ccomp" in right_dep
900
+ and spanroot.lemma_ == "be"
901
+ ):
902
+ category = "Cleft construction"
903
 
904
  elif "attr" in c_dep:
905
+ category = "Extraposed that-cl (copula)" # eg It is a wonder that X.
906
 
907
  else:
908
+ category = "Extraposed that-cl (VERB)"
909
 
910
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
911
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
912
+ elif (
913
+ "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
914
+ ) and "acomp" in c_dep:
915
  if "xcomp" in c_dep:
916
  if _check_to:
917
+ category = "Extraposed to-cl 2" # eg it is difficult to decide.
918
  elif _check_ing:
919
+ category = "Extraposed -ing 2" # eg it is difficult to decide.
920
+
921
  else:
922
  category = "Extraposed that-cl (adj-complement) 2"
923
 
924
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
925
+ category = (
926
+ "Extraposed that-cl (adj-complement) 3" # eg it seems odd that X.
927
+ )
928
 
 
 
 
929
  # something without dummy subject "it"
930
+ elif (
931
+ (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
932
+ and spanroot.pos_ in ["AUX", "VERB"]
933
+ and "it" not in c_norm
934
+ ):
935
  # store xcomp, if the head of the xcomp is acomp
936
+ _check_xcomp = [
937
+ c.dep_
938
+ for c in spanroot.subtree
939
+ if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
940
+ ]
941
+ _check_ccomp = [
942
+ c.dep_
943
+ for c in spanroot.subtree
944
+ if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
945
+ ]
946
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
947
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
948
 
949
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
950
  if any(root_before_ccomp):
 
958
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
959
  category = "Post-predicate to-cl"
960
 
961
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
962
  category = "Subject predicate to-cl"
963
 
964
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
965
  category = "Subject predicate to-cl (passive)"
966
 
967
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
968
  category = "Subject predicate -ing"
969
  elif "ccomp" in c_dep:
970
  category = "Subject predicate that-cl"
 
979
  category = "Main verb 1"
980
 
981
  ## without dummy subject it, and lexical verbs
982
+ elif (
983
+ ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep
984
+ and spanroot.pos_ in ["AUX", "VERB"]
985
+ and "it" not in c_norm
986
+ and spanroot.lemma_ not in ["be"]
987
+ ):
988
+ _check_wh = [
989
+ c.dep_
990
+ for c in spanroot.subtree
991
+ if (
992
+ c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
993
+ and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
994
+ )
995
+ and c.head.dep_ == "ccomp"
996
+ ]
997
+ _check_if = [
998
+ c.dep_
999
+ for c in spanroot.subtree
1000
+ if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
1001
+ and c.head.dep_ == "ccomp"
1002
+ ]
1003
 
1004
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
1005
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
1019
  elif _check_ing:
1020
  category = "Post-predicate -ing"
1021
 
 
 
1022
  # Existential
1023
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
1024
  category = "There is/are NOUN"
1025
+
1026
+ elif (
1027
+ "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
1028
+ ):
1029
  category = "Cleft construction"
1030
 
1031
  ### The end of clausal analysis
1032
+
1033
+ if spanroot.dep_ in ["parataxis"]:
1034
+ if "_".join(span_dep) in [
1035
+ "nsubj_parataxis",
1036
+ "aux_parataxis",
1037
+ "nsubj_aux_parataxis",
1038
+ ]:
1039
  category = "Comment clause"
1040
  else:
1041
  category = "Parataxis"
 
1042
 
1043
+ if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
1044
+ if (
1045
+ spanroot.head.dep_ in ["ROOT", "ccomp"]
1046
+ and spanroot.head.pos_ in ["AUX", "VERB"]
1047
+ and spanroot.pos_ in ["AUX", "VERB"]
1048
+ ):
1049
  if spanroot.morph == spanroot.head.morph:
1050
  category = "Main verb 4"
1051
  else:
1052
  category = "Dependent verb 2"
1053
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
1054
  category = "Gerund"
1055
+ elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
1056
+ spanroot.morph
1057
+ ):
1058
  category = "Dependent verb 2"
1059
+ elif spanroot.dep_ in ["csubj", "csubjpass"]:
1060
  category = "Dependent verb (csubj)"
1061
 
 
1062
  # Appositive phrases
1063
+ if spanroot.dep_ in ["appos"]:
1064
  if "nummod" in c_dep:
1065
  category = "Apposition"
1066
  if spanroot.pos_ in ["PROPN"]:
 
1068
  elif spanroot.pos_ in ["NOUN"]:
1069
  category = "Appositive Noun Phrase"
1070
  elif spanroot.pos_ in ["VERB", "AUX"]:
1071
+ _check = any(
1072
+ c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
1073
+ for c in spanroot.children
1074
+ )
1075
  if _check:
1076
  category = "Appositive Finite-clause"
 
1077
 
1078
+ if spanroot.dep_ in ["appos", "dep", "attr"]:
1079
+ if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
1080
  category = "Main verb (likely parsing error)"
1081
 
1082
+ # sometimes the dep label is on the conjunction
1083
  if spanroot.dep_ in ["dep", "mark"]:
1084
  if spanroot.tag_ in ["RB", "IN", "CC"]:
1085
  category = "Conjunction"
 
1087
  if spanroot.dep_ in ["intj"]:
1088
  category = "Introjection"
1089
 
1090
+ # sometimes the extra-clausal links are not accurate
1091
+ if (
1092
+ spanroot.dep_
1093
+ in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"]
1094
+ and category is None
1095
+ ):
1096
  if spanroot.head.dep_ == "ROOT":
1097
  category = "Main verb"
1098
  else:
 
1100
 
1101
  if span.label_ == "CITATION":
1102
  if "NNP" in span_tag or "NNPS" in span_tag:
1103
+ if span_dep[0] == "punct" and span_dep[-1] == "punct":
1104
  category = "Parenthetical Citation"
1105
  elif span_tag[0] in ["NNP", "NNPS"]:
1106
  category = "Narrative Citation"
 
1113
  return category
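For orientation, the branches above read spaCy's dependency labels (dep_), fine-grained tags (tag_), and morphology strings (morph) off the span root and its children. A minimal sketch of inspecting those cues on one sentence, assuming an English pipeline such as en_core_web_sm is installed (the exact parse can vary by model and version):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model; any English pipeline works
doc = nlp("It is certain that the results hold.")
for tok in doc:
    # these are the attributes the categorizer checks above
    print(tok.text, tok.dep_, tok.pos_, tok.tag_, str(tok.morph))
# Roughly: "It" -> nsubj, "certain" -> acomp, "hold" -> ccomp,
# i.e., the it_nsubj + acomp + ccomp pattern labelled
# "Extraposed that-cl (adj-complement)" above.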
1114
 
1115
 
1116
+ def const_table(
1117
+ doc: Union[spacy.tokens.Doc, Dict[str, str]],
1118
+ spans_key: str = "sc",
1119
+ attrs: List[str] = SPAN_ATTRS,
1120
+ ):
1121
+ columns = attrs + [
1122
+ "Conf. score",
1123
+ "sent no.",
1124
+ "grammatical realization",
1125
+ "span dep",
1126
+ "ner",
1127
+ "POS",
1128
+ "span dep seq",
1129
+ "TAG sequence",
1130
+ "POS sequence",
1131
+ "head",
1132
+ "head dep",
1133
+ "children",
1134
+ "morphology",
1135
+ "sent",
1136
+ ]
1137
  data = []
1138
  # data = span_info_aggregator(doc, columns)
1139
  sentences = {s: i for i, s in enumerate(doc.sents)}
1140
 
1141
+ for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]):
 
1142
  span_info = []
1143
  span_info.extend([str(getattr(span, attr)) for attr in attrs])
1144
 
 
1154
  span_info.append(span.root.head.norm_)
1155
  span_info.append(span.root.head.dep_)
1156
  span_info.append("_".join([c.dep_ for c in span.root.children]))
1157
+ span_info.append(str(span.root.morph))
1158
  span_info.append(span.sent.text.strip())
1159
 
1160
  data.append(span_info)
 
1162
  return data, columns
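A minimal usage sketch for const_table, assuming doc has already been processed by the engagement pipeline so that doc.spans["sc"] and its "scores" attribute are populated (this mirrors how utils/visualize.py builds its table):

import pandas as pd

data, cols = const_table(doc, spans_key="sc")
df = pd.DataFrame(data, columns=cols).astype({"start": int, "end": int})
df = df.sort_values(by="start")  # one row per predicted engagement span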
1163
 
1164
 
1165
+ def ngrammar(seq: list, n=2, concat=False, sep="-"):
1166
  result = []
1167
  n_item = len(seq)
1168
  for idx, item in enumerate(seq):
1169
  if idx + n <= n_item:
1170
  if concat:
1171
+ result.append(sep.join(seq[idx : idx + n]))
1172
  else:
1173
+ result.append(seq[idx : idx + n])
1174
  return result
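For reference, ngrammar slides a window of size n over a label sequence; with concat=True the items in each window are joined with sep:

labels = ["ENTERTAIN", "ATTRIBUTION", "COUNTER", "DENY"]
ngrammar(labels, n=2, concat=True)
# -> ['ENTERTAIN-ATTRIBUTION', 'ATTRIBUTION-COUNTER', 'COUNTER-DENY']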
1175
 
1176
 
1177
  def diversity_values(count_vec: list):
1178
  result = {}
1179
  if len(count_vec) == 0:
1180
+ count_vec = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1181
 
1182
+ result["shannon"] = dv.alpha.shannon(list(count_vec), base=2)
1183
+ result["brillouin_d"] = dv.alpha.brillouin_d(list(count_vec))
1184
+ result["simpson_d"] = 1 - dv.alpha.simpson(list(count_vec))
1185
+ result["simpson_e"] = dv.alpha.simpson_e(list(count_vec))
1186
  # result['gini_index'] = dv.alpha.gini_index(list(count_vec))
1187
  # result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
1188
 
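A short sketch of how diversity_values is consumed downstream; the input is the vector of per-category span counts, and the numbers below are made up for illustration:

counts = [3, 1, 0, 2, 0, 5, 0, 1, 2, 0]  # hypothetical counts over the 10 categories
div = diversity_values(counts)
# div holds "shannon", "brillouin_d", "simpson_d" and "simpson_e";
# utils/visualize.py turns it into a one-row DataFrame for display.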
utils/__pycache__/visualize.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/visualize.cpython-310.pyc and b/utils/__pycache__/visualize.cpython-310.pyc differ
 
utils/visualize.py CHANGED
@@ -17,7 +17,12 @@ import streamlit as st
17
  from spacy_streamlit import visualize_spans
18
  from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
19
 
20
- from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
 
 
 
 
 
21
  from skbio import diversity as dv
22
 
23
  SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
@@ -43,6 +48,9 @@ def visualize_spans(
43
  manual: bool = False,
44
  displacy_options: Optional[Dict] = None,
45
  simple: bool = True,
 
 
 
46
  ):
47
  """
48
  Visualizer for spans.
@@ -100,13 +108,15 @@ def visualize_spans(
100
  df = pd.DataFrame(data, columns=cols)
101
  df = df.astype({"start": int, "end": int})
102
  df = df.sort_values(by= ['start'])
103
- st.subheader("Span information")
 
104
  st.dataframe(
105
  df.style.highlight_between(subset='Conf. score', right=.7))
106
 
107
- if not simple:
 
 
108
  st.subheader("Label counts & Diagnostic confidence score summary")
109
- counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
110
 
111
  print(counts)
112
  print(list(counts))
@@ -119,7 +129,9 @@ def visualize_spans(
119
  st.dataframe(label_counts)
120
  # print(list(label_counts))
121
 
 
122
  sequences = list(df['label_'])
 
123
  # Engagement ngrams
124
  span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
125
  span_trigrams = ngrammar(seq=sequences, n=3, concat=True)
@@ -132,20 +144,26 @@ def visualize_spans(
132
  label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
133
  st.dataframe(label_dep)
134
 
135
- st.subheader('Quantitative results')
 
136
  # st.markdown(
137
  # f"Shannon's index: {dv.alpha.shannon(list(counts), base=2): .3f}")
138
  # st.markdown(
139
  # f"Simpson's e index: {1 - dv.alpha.simpson_e(list(counts)): .3f}")
140
 
 
 
 
 
141
  div = diversity_values(list(counts))
142
  div_data = pd.DataFrame.from_dict(div, orient='index')
143
- st.dataframe(div_data)
144
 
145
- doc_data = pd.concat([counts, div_data], axis = 0).T
146
  filename = "NA"
147
  doc_data.insert(0, "filename", filename, True)
148
  doc_data.insert(1, "nwords", len(doc), True)
149
  st.dataframe(doc_data)
 
150
  # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
151
  # print(dv.get_alpha_diversity_metrics())
 
17
  from spacy_streamlit import visualize_spans
18
  from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
19
 
20
+ from pipeline.post_processors import (
21
+ simple_table,
22
+ const_table,
23
+ ngrammar,
24
+ diversity_values,
25
+ )
26
  from skbio import diversity as dv
27
 
28
  SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
 
48
  manual: bool = False,
49
  displacy_options: Optional[Dict] = None,
50
  simple: bool = True,
51
+ show_confidence: bool = False,
52
+ show_diversity: bool = False,
53
+ show_ngrams: bool = False,
54
  ):
55
  """
56
  Visualizer for spans.
 
108
  df = pd.DataFrame(data, columns=cols)
109
  df = df.astype({"start": int, "end": int})
110
  df = df.sort_values(by= ['start'])
111
+ st.subheader("Engagement span information")
112
+
113
  st.dataframe(
114
  df.style.highlight_between(subset='Conf. score', right=.7))
115
 
116
+ counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
117
+
118
+ if show_confidence:
119
  st.subheader("Label counts & Diagnostic confidence score summary")
 
120
 
121
  print(counts)
122
  print(list(counts))
 
129
  st.dataframe(label_counts)
130
  # print(list(label_counts))
131
 
132
+ if show_ngrams:
133
  sequences = list(df['label_'])
134
+
135
  # Engagement ngrams
136
  span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
137
  span_trigrams = ngrammar(seq=sequences, n=3, concat=True)
 
144
  label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
145
  st.dataframe(label_dep)
146
 
147
+ if show_diversity:
148
+ st.subheader('Diversity of rhetorical features')
149
  # st.markdown(
150
  # f"Shannon's index: {dv.alpha.shannon(list(counts), base=2): .3f}")
151
  # st.markdown(
152
  # f"Simpson's e index: {1 - dv.alpha.simpson_e(list(counts)): .3f}")
153
 
154
+ st.markdown("##### Entropy based diversity measures")
155
+
156
+ filename = "NA"
157
+
158
  div = diversity_values(list(counts))
159
  div_data = pd.DataFrame.from_dict(div, orient='index')
160
+ # st.dataframe(div_data)
161
 
162
+ doc_data = pd.concat([div_data, counts], axis=0).T
163
  filename = "NA"
164
  doc_data.insert(0, "filename", filename, True)
165
  doc_data.insert(1, "nwords", len(doc), True)
166
  st.dataframe(doc_data)
167
+
168
  # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
169
  # print(dv.get_alpha_diversity_metrics())
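The keyword flags added to visualize_spans above gate the optional summaries. A hypothetical call, assuming the first argument is a processed Doc and that spans_key is accepted as in spacy_streamlit (other display arguments omitted):

visualize_spans(
    doc,                    # Doc with doc.spans["sc"] populated
    spans_key="sc",
    simple=False,
    show_confidence=False,  # hide the confidence-score summary
    show_ngrams=False,      # skip the label bigram/trigram tables
    show_diversity=True,    # show the entropy-based diversity summary
)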