egumasa committed on
Commit 866b9fc · 1 Parent(s): fe18b15

detailed summary

demo.py CHANGED
@@ -32,7 +32,7 @@ st.set_page_config(
  )


- @st.cache(allow_output_mutation=True)
+ @st.cache_resource()
  def load_model():
      # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
      nlp = spacy.load("en_engagement_LSTM")
@@ -123,7 +123,7 @@ TEXT_LIST = [
  ]


- @st.cache(suppress_st_warning=True)
+ @st.cache_resource()
  def preprocess(text):
      text = re.sub("\n\n", " &&&&&&&&#&#&#&#&", text)
      text = re.sub("\n", " ", text)
@@ -132,7 +132,7 @@ def preprocess(text):
      return text


- @st.cache(allow_output_mutation=True)
+ @st.cache_resource()
  def delete_span(span_sc: dict):
      id_del = []
      for n, spn in enumerate(span_sc, start=1):
@@ -297,6 +297,8 @@ visualize_spans(
      },
      },
      simple=False,
+     show_diversity=True,
+     show_confidence=False,
  )

  st.subheader("Bibliography")
@@ -307,3 +309,8 @@ st.markdown("""
  * Wu, S. M. (2007). The use of engagement resources in high- and low-rated undergraduate geography essays. _Journal of English for Academic Purposes, 6_ (3), 254–271. https://doi.org/10.1016/j.jeap.2007.09.006

  """)
+
+ st.subheader("Please cite the following papers:")
+ st.markdown("""* Eguchi, M., & Kyle, K. (2023). Span Identification of Epistemic Stance-Taking in Academic Written English. Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), 429–442. https://aclanthology.org/2023.bea-1.35
+ * Eguchi, M., & Kyle, K. (2024). Building custom NLP tools to annotate discourse-functional features for second language writing research: A tutorial. *Research Methods in Applied Linguistics, 3*(3), 100153. https://doi.org/10.1016/j.rmal.2024.100153
+ """)
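Note on the demo.py hunks above: they swap the deprecated @st.cache decorator for @st.cache_resource, the caching API that Streamlit 1.18+ recommends for unserializable objects such as spaCy pipelines. A minimal sketch of the pattern (not part of the commit; assumes a recent Streamlit and that the en_engagement_LSTM model package is installed):

import spacy
import streamlit as st

@st.cache_resource()  # previously: @st.cache(allow_output_mutation=True)
def load_model():
    # The loaded spaCy pipeline is created once and shared across reruns
    # and sessions, instead of being hashed and copied on every execution.
    return spacy.load("en_engagement_LSTM")

nlp = load_model()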
pipeline/__pycache__/post_processors.cpython-310.pyc CHANGED
Binary files a/pipeline/__pycache__/post_processors.cpython-310.pyc and b/pipeline/__pycache__/post_processors.cpython-310.pyc differ
 
pipeline/post_processors.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
3
  import pandas as pd
4
  import spacy
@@ -6,23 +5,38 @@ from spacy.language import Language
6
  from skbio import diversity as dv
7
 
8
  SPAN_ATTRS = ["text", "label_", "start", "end"]
9
- CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
10
-
11
-
12
- def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
13
- spans_key: str = "sc",
14
- attrs: List[str] = SPAN_ATTRS):
15
  columns = attrs + ["Conf. score"]
16
  data = [
17
- [str(getattr(span, attr))
18
- for attr in attrs] + [score] # [f'{score:.5f}']
19
- for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
 
20
  ]
21
  return data, columns
22
 
23
 
24
  # def span_info_aggregator()
25
 
 
26
  def construction_classifier(doc, span):
27
  category = None
28
  spanroot = span.root
@@ -33,7 +47,6 @@ def construction_classifier(doc, span):
33
  span_token = [t.norm_ for t in span]
34
  span_tag = [t.tag_ for t in span]
35
 
36
-
37
  c = [c for c in spanroot.children]
38
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
39
 
@@ -44,30 +57,65 @@ def construction_classifier(doc, span):
44
 
45
  right_dep = [c.dep_ for c in spanroot.rights]
46
 
47
- #conditionals
48
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
49
- argmentless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in spanroot.children)
50
- argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
51
 
52
  ## nesting classifiers
53
  if spanroot.dep_ == "conj":
54
- while spanroot.dep_ == 'conj':
55
  spanroot = spanroot.head
56
  # if spanroot.dep_ == "poss":
57
  # while spanroot.dep_ == 'poss':
58
  # spanroot = spanroot.head
59
 
60
- ## Conjunctions
61
  # Preconjunctions
62
- if spanroot.dep_ in ['preconj', 'cc']:
63
  category = "Conjunction"
64
 
65
  ## NOUN PHRASES
66
  # adverbial phrases
67
- if spanroot.dep_ in ['amod']:
68
  category = "Adjectival modifier"
69
  # adverbial phrases
70
- if spanroot.dep_ in ['compound']:
71
  category = "Compound noun"
72
 
73
  ## Nominal category
@@ -85,21 +133,24 @@ def construction_classifier(doc, span):
85
 
86
  ## ADJUNCTS
87
  # prep phrases
88
- if spanroot.dep_ in ['prep', 'agent']:
89
- category = 'Prepositional phrase'
90
  # adverbial phrases
91
- if spanroot.dep_ in ['advmod', "npadvmod", "nmod", "npmod", 'quantmod']:
92
  category = "Adverbial phrase"
93
 
94
  ## Predication patterns
95
- if spanroot.dep_ in ['acomp', 'oprd']:
96
  if "xcomp" in c_dep:
97
  category = "Subject predicate to-cl"
98
  else:
99
  category = "Adjectival complement"
100
 
101
- if spanroot.dep_ in ['attr']:
102
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
103
 
104
  c_head = [c.dep_ for c in spanroot.head.children]
105
  if "expl" in c_head and "no_det" in span_t_dep_:
@@ -108,86 +159,115 @@ def construction_classifier(doc, span):
108
  category = "There is/are + Noun complement"
109
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
110
  category = "There is/are + Noun complement"
111
-
112
  elif spanroot.pos_ in ["NOUN", "PRON"]:
113
  if "acl" in c_dep:
114
  category = "Noun + Complement (attr)"
115
  else:
116
  category = "Nominal complement"
117
 
118
- elif not subjless and spanroot.pos_ in ['VERB', "AUX"]:
119
  category = "Main verb 4"
120
 
121
- elif spanroot.tag_ in ['NNP']:
122
  category = "Nominal complement"
123
 
124
-
125
  ####################################
126
  ### clausal ####
127
  ####################################
128
- if spanroot.dep_ in ["ROOT", "advcl", "ccomp", 'acl', 'pcomp', 'relcl']:
129
-
130
- _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
131
- _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
132
- root_before_ccomp = [c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"]
133
-
134
- _check_for_to = ["_".join([c.norm_, c.dep_]) for c in spanroot.subtree if c.head.dep_ == "advcl" and (c.dep_=="mark" or c.dep_ == "aux")]
135
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
136
 
137
  ## Start with broad category, which is then re-evaluated for specific constructions.
138
- if spanroot.dep_ in ['advcl', 'mark', 'acl', 'pcomp']:
139
  ## Adverbial clauses
140
  ### Finite-adverbial clauses
141
  ### Non-finite adverbial clauses
142
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
143
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
144
-
145
- if "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
 
 
 
 
 
146
  category = "Finite adverbial clause"
147
- elif "mark" in span_dep and "aux" in span_dep :
148
  category = "Finite adverbial clause"
149
 
150
- elif "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"] and "expl" in c_dep:
 
 
 
 
151
  category = "Finite adverbial clause"
152
 
153
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
154
- if spanroot.pos_ in ['VERB', "AUX"]:
155
  category = "Finite adverbial clause"
156
 
157
- elif spanroot.pos_ not in ['VERB', "AUX"] and subjless:
158
  category = "Non-finite adv clause 1"
159
 
160
  elif entire_cl:
161
  category = "Finite adverbial clause"
162
 
163
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
164
  # he doing his job
165
  if argmentless:
166
- #e.g., frankly speaking, strictly speaking
167
  category = "Adverbial Phrase"
168
  else:
169
  category = "Non-finite adv clause 2"
170
 
171
- elif spanroot.pos_ not in ['VERB', "AUX"] and "mark" in span_dep and subjless:
172
-
 
173
  category = "Non-finite adv clause 3"
174
-
175
  elif "aux" in c_dep and "TO" in c_tag:
176
  category = "Adverbial Phrase"
177
 
178
-
179
- elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
180
  category = "Dependent Verb phrase"
181
-
182
- elif not argmentless:
183
- category = "Adverbial clause"
184
-
185
- elif spanroot.dep_ == "advcl":
186
- category = "Adverbial phrase"
187
 
 
 
188
 
189
- if spanroot.dep_ in ['relcl', 'ccomp', 'acl']:
 
190
 
 
191
  head = spanroot.head
192
  if ";" in [t.norm_ for t in head.children]:
193
  category = "Main verb 3"
@@ -195,13 +275,20 @@ def construction_classifier(doc, span):
195
  category = "Dependent verb 1"
196
  elif "mark" in span_dep:
197
  category = "Complement clause"
198
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
199
  category = "Non-finite complement clause"
200
- elif spanroot.dep_ in ['relcl']:
201
  category = "Relative clause"
202
- elif spanroot.dep_ in ['ccomp']:
203
  category = "Complement clause"
204
- elif spanroot.dep_ in ['acl']:
205
  category = "Noun Complement clause"
206
  else:
207
  # print(_check_for_to)
@@ -209,55 +296,78 @@ def construction_classifier(doc, span):
209
 
210
  ## Specific constructions
211
  # Extraposed that-clause or to-infinitives
212
- if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in ["VERB", "AUX"]:
 
 
 
213
  print(c_dep)
214
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
215
- #eg it seems odd (oprd) that X.
216
- #eg it is certain (acomp) that X.
217
- category = "Extraposed that-cl (adj-complement)" #e.g., it is certain that X.
 
 
218
 
219
  elif "xcomp" in c_dep or ("advcl" in c_dep):
220
  if "for_mark" in _check_for_to:
221
- category = "Extraposed to-cl (explicit subj)" #eg It is possible to .
 
 
222
  elif _check_to:
223
- category = "Extraposed to-cl 1" #eg It is possible to .
224
  elif _check_ing:
225
- category = "Extraposed -ing 1" #eg It is possible to .
226
- elif ("prep" in right_dep or "npadvmod" in right_dep) and "ccomp" in right_dep and spanroot.lemma_ == "be":
227
- category = "Cleft construction"
 
 
 
 
228
 
229
  elif "attr" in c_dep:
230
- category = "Extraposed that-cl (copula)" #eg It is a wonder that X.
231
 
232
  else:
233
- category = "Extraposed that-cl (VERB)"
234
 
235
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
236
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
237
- elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "acomp" in c_dep:
 
 
238
  if "xcomp" in c_dep:
239
  if _check_to:
240
- category = "Extraposed to-cl 2" #eg it is difficult to decide.
241
  elif _check_ing:
242
- category = "Extraposed -ing 2" #eg it is difficult to decide.
243
-
244
  else:
245
  category = "Extraposed that-cl (adj-complement) 2"
246
 
247
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
 
 
 
248
 
249
- category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
250
-
251
-
252
  # something without dummy subject "it"
253
- elif (("nsubj" in c_dep and spanroot.lemma_ in ['be']) or "nsubjpass" in c_dep) and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm:
254
-
 
 
 
255
  # store xcomp, if the head of the xcomp is acomp
256
- _check_xcomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"]
257
- _check_ccomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"]
 
 
 
 
 
 
 
 
258
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
259
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
260
-
261
 
262
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
263
  if any(root_before_ccomp):
@@ -271,13 +381,13 @@ def construction_classifier(doc, span):
271
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
272
  category = "Post-predicate to-cl"
273
 
274
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_to:
275
  category = "Subject predicate to-cl"
276
 
277
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
278
  category = "Subject predicate to-cl (passive)"
279
 
280
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_ing:
281
  category = "Subject predicate -ing"
282
  elif "ccomp" in c_dep:
283
  category = "Subject predicate that-cl"
@@ -290,9 +400,27 @@ def construction_classifier(doc, span):
290
  category = "Main verb 1"
291
 
292
  ## without dummy subject it, and lexical verbs
293
- elif ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm and spanroot.lemma_ not in ['be']:
294
- _check_wh = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["attr", "advmod", 'dobj', 'nsubj'] and c.tag_ in ["WP", "WRB", "WDT", "WP$"]) and c.head.dep_ == "ccomp"]
295
- _check_if = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) and c.head.dep_ == "ccomp"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
298
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
@@ -315,27 +443,34 @@ def construction_classifier(doc, span):
315
  # Existential
316
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
317
  category = "There is/are NOUN"
318
-
319
- elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
320
- category = "Cleft construction"
321
 
 
 
 
 
322
 
323
- if spanroot.dep_ in ['parataxis']:
324
- if "_".join(span_dep) in ["nsubj_parataxis", "aux_parataxis", "nsubj_aux_parataxis"]:
 
 
 
 
325
  category = "Comment clause"
326
  else:
327
  category = "parataxis (for now)"
328
-
329
 
330
  ## External comp
331
- if spanroot.dep_ in ['xcomp']:
332
- if spanroot.head.pos_ == 'ADJ' and "to_aux" in c_t_dep_:
333
  category = "Adjective complement to-cl"
334
- if spanroot.head.pos_ == 'VERB' and "to_aux" in c_t_dep_:
335
  category = "Verb complement to-cl"
336
-
337
- if spanroot.dep_ in ['pcomp']:
338
- if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] and 'ccomp' in c_dep:
 
 
 
339
  category = "Participle + that-cl"
340
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
341
  category = "Participle"
@@ -345,25 +480,28 @@ def construction_classifier(doc, span):
345
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
346
  # category = "Gerund"
347
 
348
- if spanroot.dep_ in ['neg']:
349
  category = "Negative particle"
350
- if spanroot.dep_ in ['aux', 'auxpass']:
351
  category = "Auxiliary"
352
 
353
  # Modal verbs
354
  if spanroot.tag_ == "MD":
355
  category = "Modal auxiliary"
356
 
357
-
358
- if spanroot.dep_ in ['dep', "csubj", 'csubjpass']:
359
- if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in ['AUX', 'VERB'] and spanroot.pos_ in ['AUX', 'VERB']:
 
 
 
360
  if spanroot.morph == spanroot.head.morph:
361
  category = "Main verb 4"
362
  else:
363
  category = "Dependent verb 2"
364
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
365
  category = "Gerund"
366
- elif spanroot.head.dep_ in ['conj', 'acl','relcl']:
367
  if spanroot.morph == spanroot.head.morph:
368
  category = "Main verb 4"
369
  else:
@@ -372,7 +510,7 @@ def construction_classifier(doc, span):
372
  category = "Dependent verb 2"
373
 
374
  # Appositive phrases
375
- if spanroot.dep_ in ['appos']:
376
  if "nummod" in c_dep:
377
  category = "Apposition"
378
  elif spanroot.pos_ in ["PROPN"]:
@@ -380,21 +518,23 @@ def construction_classifier(doc, span):
380
  elif spanroot.pos_ in ["NOUN"]:
381
  category = "Appositive Noun Phrase"
382
  elif spanroot.pos_ in ["VERB", "AUX"]:
383
- _check = any(c.dep_ in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
384
  if _check:
385
  category = "Appositive Finite-clause"
386
-
387
- if spanroot.dep_ in ['appos', "dep", "attr"]:
388
- if not subjless and spanroot.pos_ in ['VERB', "AUX"]:
389
  category = "Main verb 5"
390
 
391
  if spanroot.dep_ in ["dep", "mark"]:
392
  if spanroot.tag_ in ["RB", "IN", "CC"]:
393
  category = "Conjunction"
394
 
395
-
396
- #sometimes the extra-clausal links are not accurate
397
- if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp"]:
398
  if spanroot.head.dep_ == "ROOT":
399
  category = "Main verb"
400
  else:
@@ -402,7 +542,7 @@ def construction_classifier(doc, span):
402
 
403
  if span.label_ == "CITATION":
404
  if "NNP" in span_tag or "NNPS" in span_tag:
405
- if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
406
  category = "Parenthetical Citation"
407
  elif span_tag[0] in ["NNP", "NNPS"]:
408
  category = "Narrative Citation"
@@ -425,7 +565,6 @@ def construction_classifier2(doc, span):
425
  span_token = [t.norm_ for t in span]
426
  span_tag = [t.tag_ for t in span]
427
 
428
-
429
  c = [c for c in spanroot.children]
430
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
431
 
@@ -436,43 +575,92 @@ def construction_classifier2(doc, span):
436
 
437
  right_dep = [c.dep_ for c in spanroot.rights]
438
 
439
- #conditionals
440
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
441
- argmentless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in spanroot.children)
442
- argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
443
- argless_span = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass', "dobj", 'ccomp', 'xcomp', 'dative', "attr", "oprd", "acomp"] for c in span)
444
-
445
 
446
  ## nesting classifiers
447
  if spanroot.dep_ == "conj":
448
- while spanroot.dep_ == 'conj':
449
  spanroot = spanroot.head
450
 
451
  if spanroot.dep_ == "poss":
452
  head = spanroot.head
453
- if head.dep_ in ["pobj", "dobj", "obj", "iobj" , "dative"]:
454
  category = "Posessive Noun (Object)"
455
  elif head.dep_ in ["nsubj", "nsubjpass"]:
456
  category = "Posessive Noun (Subject)"
457
  else:
458
  category = "Posessive Noun (Other)"
459
 
460
-
461
- ## Conjunctions
462
  # Preconjunctions
463
- if spanroot.dep_ in ['preconj', 'cc']:
464
  category = "Conjunction"
465
 
466
  ## NOUN PHRASES
467
  # adverbial phrases
468
- if spanroot.dep_ in ['amod']:
469
  category = "Adjectival modifier"
470
  # adverbial phrases
471
- if spanroot.dep_ in ['compound']:
472
  category = "Compound noun"
473
 
474
  ## Nominal category
475
- if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj" , "dative"]:
476
  if "acl" in c_dep:
477
  category = "Noun + Complement (Object)"
478
  else:
@@ -486,22 +674,25 @@ def construction_classifier2(doc, span):
486
 
487
  ## ADJUNCTS
488
  # prep phrases
489
- if spanroot.dep_ in ['prep', 'agent']:
490
- category = 'Prepositional phrase'
491
 
492
  # adverbial phrases
493
- if spanroot.dep_ in ['advmod', "npadvmod", "nmod", "npmod", 'quantmod', 'nummod']:
494
  category = "Adverbial phrase"
495
 
496
  ## Predication patterns
497
- if spanroot.dep_ in ['acomp', 'oprd']:
498
  if "xcomp" in c_dep:
499
  category = "Subject predicate to-cl"
500
  else:
501
  category = "Adjectival complement"
502
 
503
- if spanroot.dep_ in ['attr']:
504
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
505
 
506
  c_head = [c.dep_ for c in spanroot.head.children]
507
  if "expl" in c_head and "no_det" in span_t_dep_:
@@ -510,28 +701,31 @@ def construction_classifier2(doc, span):
510
  category = "There is/are + Noun complement"
511
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
512
  category = "There is/are + Noun complement"
513
-
514
  elif spanroot.pos_ in ["NOUN", "PRON"]:
515
  if "acl" in c_dep:
516
  category = "Noun + Complement (attr)"
517
  else:
518
  category = "Nominal complement"
519
 
520
- elif not subjless and spanroot.pos_ in ['VERB', "AUX"]:
521
  category = "Main verb 4"
522
 
523
- elif spanroot.tag_ in ['NNP']:
524
  category = "Nominal complement"
525
 
526
  ## External comp
527
- if spanroot.dep_ in ['xcomp']:
528
- if spanroot.head.pos_ == 'ADJ' and "to_aux" in c_t_dep_:
529
  category = "Adjective complement to-cl"
530
- if spanroot.head.pos_ == 'VERB' and "to_aux" in c_t_dep_:
531
  category = "Verb complement to-cl"
532
-
533
- if spanroot.dep_ in ['pcomp']:
534
- if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] and 'ccomp' in c_dep:
 
 
 
535
  category = "Participle + that-cl"
536
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
537
  category = "Participle"
@@ -541,86 +735,117 @@ def construction_classifier2(doc, span):
541
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
542
  # category = "Gerund"
543
 
544
- if spanroot.dep_ in ['neg']:
545
  category = "Negative particle"
546
- if spanroot.dep_ in ['aux', 'auxpass']:
547
  category = "Auxiliary"
548
 
549
  # Modal verbs
550
  if spanroot.tag_ == "MD":
551
  category = "Modal auxiliary"
552
 
553
-
554
  ####################################
555
  ### clausal ####
556
  ####################################
557
- if spanroot.dep_ in ["ROOT", "advcl", "ccomp", 'acl', 'pcomp', 'relcl', 'punct']:
558
-
559
- _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
560
- _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
561
- root_before_ccomp = [c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"]
562
-
563
- _check_for_to = ["_".join([c.norm_, c.dep_]) for c in spanroot.subtree if c.head.dep_ == "advcl" and (c.dep_=="mark" or c.dep_ == "aux")]
564
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
565
-
566
 
567
  ## Start with broad category, which is then re-evaluated for specific constructions.
568
- if spanroot.dep_ in ['advcl', 'acl', 'punct', 'pcomp']: #'mark',
569
  ## Adverbial clauses
570
- subjless = all(c.dep_ not in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
571
- entire_cl = spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
 
 
 
 
 
572
 
573
  ### Finite-adverbial clauses
574
- if "mark" in span_dep and (spanroot.pos_ in ['VERB', "AUX"] or "aux" in span_dep ):
 
 
575
  category = "Finite adverbial clause"
576
 
577
- elif "mark" in span_dep and "aux" in span_dep :
578
  category = "Finite adverbial clause"
579
 
580
- elif "mark" in span_dep and spanroot.pos_ in ['VERB', "AUX"] and "expl" in c_dep:
 
 
 
 
581
  category = "Finite adverbial clause"
582
 
583
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
584
- if spanroot.pos_ in ['VERB', "AUX"]:
585
  category = "Finite adverbial clause"
586
 
587
- elif spanroot.pos_ not in ['VERB', "AUX"] and subjless:
588
  category = "Non-finite adv clause 1"
589
 
590
  elif not argmentless:
591
- category = "Finite adverbial clause"
592
 
593
  ## non-finite
594
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
595
  # he doing his job
596
  if argmentless:
597
- #e.g., frankly speaking, strictly speaking
598
  category = "Adverbial Phrase"
599
  else:
600
  category = "Non-finite adv clause 2"
601
 
602
- elif spanroot.pos_ not in ['VERB', "AUX"] and "mark" in span_dep and subjless:
603
-
 
604
  category = "Non-finite adv clause 3"
605
-
606
  elif "aux" in c_dep and "TO" in c_tag:
607
  category = "Adverbial Phrase"
608
 
609
-
610
- elif "mark" not in span_dep and spanroot.pos_ in ['VERB', "AUX"]:
611
  category = "Dependent Verb phrase"
612
-
613
  elif not argmentless:
614
- category = "Adverbial clause"
615
-
616
  elif spanroot.dep_ == "advcl":
617
- category = "Adverbial phrase"
618
-
619
  else:
620
  category = "Finite adverbial clause "
621
 
622
- if spanroot.dep_ in ['relcl', 'ccomp', 'acl', 'punct', "pcomp"]:
623
-
624
  head = spanroot.head
625
  if ";" in [t.norm_ for t in head.children]:
626
  category = "Main verb 3"
@@ -630,66 +855,96 @@ def construction_classifier2(doc, span):
630
 
631
  elif "mark" in span_dep:
632
  category = "Complement clause"
633
- elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part", "Aspect=Perf|Tense=Past|VerbForm=Part"] and "aux" not in c_dep:
 
 
 
 
 
 
 
634
  category = "Non-finite complement clause"
635
- elif spanroot.dep_ in ['relcl']:
636
  category = "Relative clause"
637
- elif spanroot.dep_ in ['ccomp']:
638
  category = "Complement clause"
639
- elif spanroot.dep_ in ['acl']:
640
  category = "Noun Complement clause"
641
 
642
  ## Specific constructions
643
  # Extraposed that-clause or to-infinitives
644
- if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in ["VERB", "AUX"]:
 
 
 
645
  # print(c_dep)
646
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
647
- #eg it seems odd (oprd) that X.
648
- #eg it is certain (acomp) that X.
649
- category = "Extraposed that-cl (adj-complement)" #e.g., it is certain that X.
 
 
650
 
651
  elif "xcomp" in c_dep or ("advcl" in c_dep):
652
  if "for_mark" in _check_for_to:
653
- category = "Extraposed to-cl (explicit subj)" #eg It is possible to .
 
 
654
  elif _check_to:
655
- category = "Extraposed to-cl 1" #eg It is possible to .
656
  elif _check_ing:
657
- category = "Extraposed -ing 1" #eg It is possible to .
658
- elif ("prep" in right_dep or "npadvmod" in right_dep) and "ccomp" in right_dep and spanroot.lemma_ == "be":
659
- category = "Cleft construction"
 
 
 
 
660
 
661
  elif "attr" in c_dep:
662
- category = "Extraposed that-cl (copula)" #eg It is a wonder that X.
663
 
664
  else:
665
- category = "Extraposed that-cl (VERB)"
666
 
667
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
668
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
669
- elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "acomp" in c_dep:
 
 
670
  if "xcomp" in c_dep:
671
  if _check_to:
672
- category = "Extraposed to-cl 2" #eg it is difficult to decide.
673
  elif _check_ing:
674
- category = "Extraposed -ing 2" #eg it is difficult to decide.
675
-
676
  else:
677
  category = "Extraposed that-cl (adj-complement) 2"
678
 
679
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
 
 
 
680
 
681
- category = "Extraposed that-cl (adj-complement) 3" #eg it seems odd that X.
682
-
683
-
684
  # something without dummy subject "it"
685
- elif (("nsubj" in c_dep and spanroot.lemma_ in ['be']) or "nsubjpass" in c_dep) and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm:
686
-
 
 
 
687
  # store xcomp, if the head of the xcomp is acomp
688
- _check_xcomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"]
689
- _check_ccomp = [c.dep_ for c in spanroot.subtree if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"]
 
 
 
 
 
 
 
 
690
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
691
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
692
-
693
 
694
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
695
  if any(root_before_ccomp):
@@ -703,13 +958,13 @@ def construction_classifier2(doc, span):
703
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
704
  category = "Post-predicate to-cl"
705
 
706
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_to:
707
  category = "Subject predicate to-cl"
708
 
709
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
710
  category = "Subject predicate to-cl (passive)"
711
 
712
- elif "xcomp" in c_dep and spanroot.lemma_ in ['be'] and _check_ing:
713
  category = "Subject predicate -ing"
714
  elif "ccomp" in c_dep:
715
  category = "Subject predicate that-cl"
@@ -724,9 +979,27 @@ def construction_classifier2(doc, span):
724
  category = "Main verb 1"
725
 
726
  ## without dummy subject it, and lexical verbs
727
- elif ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep and spanroot.pos_ in ["AUX", 'VERB'] and "it" not in c_norm and spanroot.lemma_ not in ['be']:
728
- _check_wh = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["attr", "advmod", 'dobj', 'nsubj'] and c.tag_ in ["WP", "WRB", "WDT", "WP$"]) and c.head.dep_ == "ccomp"]
729
- _check_if = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) and c.head.dep_ == "ccomp"]
730
 
731
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
732
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
@@ -746,40 +1019,48 @@ def construction_classifier2(doc, span):
746
  elif _check_ing:
747
  category = "Post-predicate -ing"
748
 
749
-
750
-
751
  # Existential
752
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
753
  category = "There is/are NOUN"
754
-
755
- elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]:
 
 
756
  category = "Cleft construction"
757
 
758
  ### The end of clausal analysis
759
-
760
- if spanroot.dep_ in ['parataxis']:
761
- if "_".join(span_dep) in ["nsubj_parataxis", "aux_parataxis", "nsubj_aux_parataxis"]:
 
 
 
 
762
  category = "Comment clause"
763
  else:
764
  category = "Parataxis"
765
-
766
 
767
- if spanroot.dep_ in ['dep', "csubj", 'csubjpass']:
768
- if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in ['AUX', 'VERB'] and spanroot.pos_ in ['AUX', 'VERB']:
 
 
 
 
769
  if spanroot.morph == spanroot.head.morph:
770
  category = "Main verb 4"
771
  else:
772
  category = "Dependent verb 2"
773
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
774
  category = "Gerund"
775
- elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(spanroot.morph):
 
 
776
  category = "Dependent verb 2"
777
- elif spanroot.dep_ in ["csubj", 'csubjpass']:
778
  category = "Dependent verb (csubj)"
779
 
780
-
781
  # Appositive phrases
782
- if spanroot.dep_ in ['appos']:
783
  if "nummod" in c_dep:
784
  category = "Apposition"
785
  if spanroot.pos_ in ["PROPN"]:
@@ -787,16 +1068,18 @@ def construction_classifier2(doc, span):
787
  elif spanroot.pos_ in ["NOUN"]:
788
  category = "Appositive Noun Phrase"
789
  elif spanroot.pos_ in ["VERB", "AUX"]:
790
- _check = any(c.dep_ in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass'] for c in spanroot.children)
 
 
 
791
  if _check:
792
  category = "Appositive Finite-clause"
793
-
794
 
795
- if spanroot.dep_ in ['appos', "dep", "attr"]:
796
- if not subjless and spanroot.pos_ in ['VERB', "AUX"]:
797
  category = "Main verb (likely parsing error)"
798
 
799
- #sometimes the dep are on the conjunctions
800
  if spanroot.dep_ in ["dep", "mark"]:
801
  if spanroot.tag_ in ["RB", "IN", "CC"]:
802
  category = "Conjunction"
@@ -804,9 +1087,12 @@ def construction_classifier2(doc, span):
804
  if spanroot.dep_ in ["intj"]:
805
  category = "Introjection"
806
 
807
-
808
- #sometimes the extra-clausal links are not accurate
809
- if spanroot.dep_ in ['aux', "auxpass", 'oprd', 'appos', "xcomp", "attr", 'dep', "meta", 'prt'] and category == None:
 
 
 
810
  if spanroot.head.dep_ == "ROOT":
811
  category = "Main verb"
812
  else:
@@ -814,7 +1100,7 @@ def construction_classifier2(doc, span):
814
 
815
  if span.label_ == "CITATION":
816
  if "NNP" in span_tag or "NNPS" in span_tag:
817
- if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
818
  category = "Parenthetical Citation"
819
  elif span_tag[0] in ["NNP", "NNPS"]:
820
  category = "Narrative Citation"
@@ -827,18 +1113,32 @@ def construction_classifier2(doc, span):
827
  return category
828
 
829
 
830
-
831
- def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
832
- spans_key: str = "sc",
833
- attrs: List[str] = SPAN_ATTRS):
834
- columns = attrs + ["Conf. score", "sent no.", "grammatical realization", 'span dep', "ner",
835
- "POS", 'span dep seq', "TAG sequence", "POS sequence", "head", "head dep", "children", "morphology", "sent"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  data = []
837
  # data = span_info_aggregator(doc, columns)
838
  sentences = {s: i for i, s in enumerate(doc.sents)}
839
 
840
- for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):
841
-
842
  span_info = []
843
  span_info.extend([str(getattr(span, attr)) for attr in attrs])
844
 
@@ -854,7 +1154,7 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
854
  span_info.append(span.root.head.norm_)
855
  span_info.append(span.root.head.dep_)
856
  span_info.append("_".join([c.dep_ for c in span.root.children]))
857
- span_info.append(span.root.morph)
858
  span_info.append(span.sent.text.strip())
859
 
860
  data.append(span_info)
@@ -862,27 +1162,27 @@ def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
862
  return data, columns
863
 
864
 
865
- def ngrammar(seq: list, n=2, concat = False, sep = "-"):
866
  result = []
867
  n_item = len(seq)
868
  for idx, item in enumerate(seq):
869
  if idx + n <= n_item:
870
  if concat:
871
- result.append(sep.join(seq[idx: idx + n]))
872
  else:
873
- result.append(seq[idx: idx + n])
874
  return result
875
 
876
 
877
  def diversity_values(count_vec: list):
878
  result = {}
879
  if len(count_vec) == 0:
880
- count_vec = [0,0,0,0,0,0,0,0,0,0]
881
 
882
- result['shannon'] = dv.alpha.shannon(list(count_vec), base=2)
883
- result['brillouin_d'] = dv.alpha.brillouin_d(list(count_vec))
884
- result["simpson_d"] = 1- dv.alpha.simpson(list(count_vec))
885
- result['simpson_e'] = dv.alpha.simpson_e(list(count_vec))
886
  # result['gini_index'] = dv.alpha.gini_index(list(count_vec))
887
  # result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
888
 
 
 
1
  from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
2
  import pandas as pd
3
  import spacy
 
5
  from skbio import diversity as dv
6
 
7
  SPAN_ATTRS = ["text", "label_", "start", "end"]
8
+ CATEGORIES = [
9
+ "ATTRIBUTION",
10
+ "CITATION",
11
+ "COUNTER",
12
+ "DENY",
13
+ "ENDOPHORIC",
14
+ "ENTERTAIN",
15
+ "JUSTIFYING",
16
+ "MONOGLOSS",
17
+ "PROCLAIM",
18
+ "SOURCES",
19
+ ]
20
+
21
+
22
+ def simple_table(
23
+ doc: Union[spacy.tokens.Doc, Dict[str, str]],
24
+ spans_key: str = "sc",
25
+ attrs: List[str] = SPAN_ATTRS,
26
+ ):
27
  columns = attrs + ["Conf. score"]
28
  data = [
29
+ [str(getattr(span, attr)) for attr in attrs] + [score] # [f'{score:.5f}']
30
+ for span, score in zip(
31
+ doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]
32
+ )
33
  ]
34
  return data, columns
35
 
36
 
37
  # def span_info_aggregator()
38
 
39
+
40
  def construction_classifier(doc, span):
41
  category = None
42
  spanroot = span.root
 
47
  span_token = [t.norm_ for t in span]
48
  span_tag = [t.tag_ for t in span]
49
 
 
50
  c = [c for c in spanroot.children]
51
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
52
 
 
57
 
58
  right_dep = [c.dep_ for c in spanroot.rights]
59
 
60
+ # conditionals
61
+ subjless = all(
62
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
63
+ for c in spanroot.children
64
+ )
65
+ argmentless = all(
66
+ c.dep_
67
+ not in [
68
+ "nsubj",
69
+ "nsubjpass",
70
+ "csubj",
71
+ "csubjpass",
72
+ "dobj",
73
+ "ccomp",
74
+ "xcomp",
75
+ "dative",
76
+ "attr",
77
+ "oprd",
78
+ "acomp",
79
+ ]
80
+ for c in spanroot.children
81
+ )
82
+ argless_span = all(
83
+ c.dep_
84
+ not in [
85
+ "nsubj",
86
+ "nsubjpass",
87
+ "csubj",
88
+ "csubjpass",
89
+ "dobj",
90
+ "ccomp",
91
+ "xcomp",
92
+ "dative",
93
+ "attr",
94
+ "oprd",
95
+ "acomp",
96
+ ]
97
+ for c in span
98
+ )
99
 
100
  ## nesting classifiers
101
  if spanroot.dep_ == "conj":
102
+ while spanroot.dep_ == "conj":
103
  spanroot = spanroot.head
104
  # if spanroot.dep_ == "poss":
105
  # while spanroot.dep_ == 'poss':
106
  # spanroot = spanroot.head
107
 
108
+ ## Conjunctions
109
  # Preconjunctions
110
+ if spanroot.dep_ in ["preconj", "cc"]:
111
  category = "Conjunction"
112
 
113
  ## NOUN PHRASES
114
  # adverbial phrases
115
+ if spanroot.dep_ in ["amod"]:
116
  category = "Adjectival modifier"
117
  # adverbial phrases
118
+ if spanroot.dep_ in ["compound"]:
119
  category = "Compound noun"
120
 
121
  ## Nominal category
 
133
 
134
  ## ADJUNCTS
135
  # prep phrases
136
+ if spanroot.dep_ in ["prep", "agent"]:
137
+ category = "Prepositional phrase"
138
  # adverbial phrases
139
+ if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]:
140
  category = "Adverbial phrase"
141
 
142
  ## Predication patterns
143
+ if spanroot.dep_ in ["acomp", "oprd"]:
144
  if "xcomp" in c_dep:
145
  category = "Subject predicate to-cl"
146
  else:
147
  category = "Adjectival complement"
148
 
149
+ if spanroot.dep_ in ["attr"]:
150
+ subjless = all(
151
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
152
+ for c in spanroot.children
153
+ )
154
 
155
  c_head = [c.dep_ for c in spanroot.head.children]
156
  if "expl" in c_head and "no_det" in span_t_dep_:
 
159
  category = "There is/are + Noun complement"
160
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
161
  category = "There is/are + Noun complement"
162
+
163
  elif spanroot.pos_ in ["NOUN", "PRON"]:
164
  if "acl" in c_dep:
165
  category = "Noun + Complement (attr)"
166
  else:
167
  category = "Nominal complement"
168
 
169
+ elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
170
  category = "Main verb 4"
171
 
172
+ elif spanroot.tag_ in ["NNP"]:
173
  category = "Nominal complement"
174
 
 
175
  ####################################
176
  ### clausal ####
177
  ####################################
178
+ if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]:
179
+ _check_to = [
180
+ c.dep_
181
+ for c in spanroot.subtree
182
+ if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
183
+ and c.head.dep_ == "xcomp"
184
+ ]
185
+ _check_ing = [
186
+ c.dep_
187
+ for c in spanroot.subtree
188
+ if "Prog" in str(c.morph) and c.dep_ == "xcomp"
189
+ ]
190
+ root_before_ccomp = [
191
+ c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
192
+ ]
193
+
194
+ _check_for_to = [
195
+ "_".join([c.norm_, c.dep_])
196
+ for c in spanroot.subtree
197
+ if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
198
+ ]
199
+ entire_cl = (
200
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
201
+ )
202
 
203
  ## Start with broad category, which is then re-evaluated for specific constructions.
204
+ if spanroot.dep_ in ["advcl", "mark", "acl", "pcomp"]:
205
  ## Adverbial clauses
206
  ### Finite-adverbial clauses
207
  ### Non-finite adverbial clauses
208
+ subjless = all(
209
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
210
+ for c in spanroot.children
211
+ )
212
+ entire_cl = (
213
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
214
+ )
215
+
216
+ if "mark" in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
217
  category = "Finite adverbial clause"
218
+ elif "mark" in span_dep and "aux" in span_dep:
219
  category = "Finite adverbial clause"
220
 
221
+ elif (
222
+ "mark" in span_dep
223
+ and spanroot.pos_ in ["VERB", "AUX"]
224
+ and "expl" in c_dep
225
+ ):
226
  category = "Finite adverbial clause"
227
 
228
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
229
+ if spanroot.pos_ in ["VERB", "AUX"]:
230
  category = "Finite adverbial clause"
231
 
232
+ elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
233
  category = "Non-finite adv clause 1"
234
 
235
  elif entire_cl:
236
  category = "Finite adverbial clause"
237
 
238
+ elif (
239
+ str(spanroot.morph)
240
+ in [
241
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
242
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
243
+ ]
244
+ and "aux" not in c_dep
245
+ ):
246
  # he doing his job
247
  if argmentless:
248
+ # e.g., frankly speaking, strictly speaking
249
  category = "Adverbial Phrase"
250
  else:
251
  category = "Non-finite adv clause 2"
252
 
253
+ elif (
254
+ spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
255
+ ):
256
  category = "Non-finite adv clause 3"
257
+
258
  elif "aux" in c_dep and "TO" in c_tag:
259
  category = "Adverbial Phrase"
260
 
261
+ elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 
262
  category = "Dependent Verb phrase"
 
 
 
 
 
 
263
 
264
+ elif not argmentless:
265
+ category = "Adverbial clause"
266
 
267
+ elif spanroot.dep_ == "advcl":
268
+ category = "Adverbial phrase"
269
 
270
+ if spanroot.dep_ in ["relcl", "ccomp", "acl"]:
271
  head = spanroot.head
272
  if ";" in [t.norm_ for t in head.children]:
273
  category = "Main verb 3"
 
275
  category = "Dependent verb 1"
276
  elif "mark" in span_dep:
277
  category = "Complement clause"
278
+ elif (
279
+ str(spanroot.morph)
280
+ in [
281
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
282
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
283
+ ]
284
+ and "aux" not in c_dep
285
+ ):
286
  category = "Non-finite complement clause"
287
+ elif spanroot.dep_ in ["relcl"]:
288
  category = "Relative clause"
289
+ elif spanroot.dep_ in ["ccomp"]:
290
  category = "Complement clause"
291
+ elif spanroot.dep_ in ["acl"]:
292
  category = "Noun Complement clause"
293
  else:
294
  # print(_check_for_to)
 
296
 
297
  ## Specific constructions
298
  # Extraposed that-clause or to-infinitives
299
+ if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
300
+ "VERB",
301
+ "AUX",
302
+ ]:
303
  print(c_dep)
304
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
305
+ # eg it seems odd (oprd) that X.
306
+ # eg it is certain (acomp) that X.
307
+ category = (
308
+ "Extraposed that-cl (adj-complement)" # e.g., it is certain that X.
309
+ )
310
 
311
  elif "xcomp" in c_dep or ("advcl" in c_dep):
312
  if "for_mark" in _check_for_to:
313
+ category = (
314
+ "Extraposed to-cl (explicit subj)" # eg It is possible to .
315
+ )
316
  elif _check_to:
317
+ category = "Extraposed to-cl 1" # eg It is possible to .
318
  elif _check_ing:
319
+ category = "Extraposed -ing 1" # eg It is possible to .
320
+ elif (
321
+ ("prep" in right_dep or "npadvmod" in right_dep)
322
+ and "ccomp" in right_dep
323
+ and spanroot.lemma_ == "be"
324
+ ):
325
+ category = "Cleft construction"
326
 
327
  elif "attr" in c_dep:
328
+ category = "Extraposed that-cl (copula)" # eg It is a wonder that X.
329
 
330
  else:
331
+ category = "Extraposed that-cl (VERB)"
332
 
333
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
334
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
335
+ elif (
336
+ "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
337
+ ) and "acomp" in c_dep:
338
  if "xcomp" in c_dep:
339
  if _check_to:
340
+ category = "Extraposed to-cl 2" # eg it is difficult to decide.
341
  elif _check_ing:
342
+ category = "Extraposed -ing 2" # eg it is difficult to decide.
343
+
344
  else:
345
  category = "Extraposed that-cl (adj-complement) 2"
346
 
347
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
348
+ category = (
349
+ "Extraposed that-cl (adj-complement) 3" # eg it seems odd that X.
350
+ )
351
 
 
 
 
352
  # something without dummy subject "it"
353
+ elif (
354
+ (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
355
+ and spanroot.pos_ in ["AUX", "VERB"]
356
+ and "it" not in c_norm
357
+ ):
358
  # store xcomp, if the head of the xcomp is acomp
359
+ _check_xcomp = [
360
+ c.dep_
361
+ for c in spanroot.subtree
362
+ if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
363
+ ]
364
+ _check_ccomp = [
365
+ c.dep_
366
+ for c in spanroot.subtree
367
+ if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
368
+ ]
369
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
370
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
371
 
372
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
373
  if any(root_before_ccomp):
 
381
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
382
  category = "Post-predicate to-cl"
383
 
384
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
385
  category = "Subject predicate to-cl"
386
 
387
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
388
  category = "Subject predicate to-cl (passive)"
389
 
390
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
391
  category = "Subject predicate -ing"
392
  elif "ccomp" in c_dep:
393
  category = "Subject predicate that-cl"
 
400
  category = "Main verb 1"
401
 
402
  ## without dummy subject it, and lexical verbs
403
+ elif (
404
+ ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep
405
+ and spanroot.pos_ in ["AUX", "VERB"]
406
+ and "it" not in c_norm
407
+ and spanroot.lemma_ not in ["be"]
408
+ ):
409
+ _check_wh = [
410
+ c.dep_
411
+ for c in spanroot.subtree
412
+ if (
413
+ c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
414
+ and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
415
+ )
416
+ and c.head.dep_ == "ccomp"
417
+ ]
418
+ _check_if = [
419
+ c.dep_
420
+ for c in spanroot.subtree
421
+ if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
422
+ and c.head.dep_ == "ccomp"
423
+ ]
424
 
425
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
426
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
443
  # Existential
444
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
445
  category = "There is/are NOUN"
 
 
 
446
 
447
+ elif (
448
+ "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
449
+ ):
450
+ category = "Cleft construction"
451
 
452
+ if spanroot.dep_ in ["parataxis"]:
453
+ if "_".join(span_dep) in [
454
+ "nsubj_parataxis",
455
+ "aux_parataxis",
456
+ "nsubj_aux_parataxis",
457
+ ]:
458
  category = "Comment clause"
459
  else:
460
  category = "parataxis (for now)"
 
461
 
462
  ## External comp
463
+ if spanroot.dep_ in ["xcomp"]:
464
+ if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
465
  category = "Adjective complement to-cl"
466
+ if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
467
  category = "Verb complement to-cl"
468
+
469
+ if spanroot.dep_ in ["pcomp"]:
470
+ if (
471
+ str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
472
+ and "ccomp" in c_dep
473
+ ):
474
  category = "Participle + that-cl"
475
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
476
  category = "Participle"
 
480
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
481
  # category = "Gerund"
482
 
483
+ if spanroot.dep_ in ["neg"]:
484
  category = "Negative particle"
485
+ if spanroot.dep_ in ["aux", "auxpass"]:
486
  category = "Auxiliary"
487
 
488
  # Modal verbs
489
  if spanroot.tag_ == "MD":
490
  category = "Modal auxiliary"
491
 
492
+ if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
493
+ if (
494
+ spanroot.head.dep_ in ["ROOT", "ccomp"]
495
+ and spanroot.head.pos_ in ["AUX", "VERB"]
496
+ and spanroot.pos_ in ["AUX", "VERB"]
497
+ ):
498
  if spanroot.morph == spanroot.head.morph:
499
  category = "Main verb 4"
500
  else:
501
  category = "Dependent verb 2"
502
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
503
  category = "Gerund"
504
+ elif spanroot.head.dep_ in ["conj", "acl", "relcl"]:
505
  if spanroot.morph == spanroot.head.morph:
506
  category = "Main verb 4"
507
  else:
 
510
  category = "Dependent verb 2"
511
 
512
  # Appositive phrases
513
+ if spanroot.dep_ in ["appos"]:
514
  if "nummod" in c_dep:
515
  category = "Apposition"
516
  elif spanroot.pos_ in ["PROPN"]:
 
518
  elif spanroot.pos_ in ["NOUN"]:
519
  category = "Appositive Noun Phrase"
520
  elif spanroot.pos_ in ["VERB", "AUX"]:
521
+ _check = any(
522
+ c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
523
+ for c in spanroot.children
524
+ )
525
  if _check:
526
  category = "Appositive Finite-clause"
527
+
528
+ if spanroot.dep_ in ["appos", "dep", "attr"]:
529
+ if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
530
  category = "Main verb 5"
531
 
532
  if spanroot.dep_ in ["dep", "mark"]:
533
  if spanroot.tag_ in ["RB", "IN", "CC"]:
534
  category = "Conjunction"
535
 
536
+ # sometimes the extra-clausal links are not accurate
537
+ if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]:
 
538
  if spanroot.head.dep_ == "ROOT":
539
  category = "Main verb"
540
  else:
 
542
 
543
  if span.label_ == "CITATION":
544
  if "NNP" in span_tag or "NNPS" in span_tag:
545
+ if span_dep[0] == "punct" and span_dep[-1] == "punct":
546
  category = "Parenthetical Citation"
547
  elif span_tag[0] in ["NNP", "NNPS"]:
548
  category = "Narrative Citation"
 
565
  span_token = [t.norm_ for t in span]
566
  span_tag = [t.tag_ for t in span]
567
 
 
568
  c = [c for c in spanroot.children]
569
  c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
570
 
 
575
 
576
  right_dep = [c.dep_ for c in spanroot.rights]
577
 
578
+ # conditionals
579
+ subjless = all(
580
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
581
+ for c in spanroot.children
582
+ )
583
+ argmentless = all(
584
+ c.dep_
585
+ not in [
586
+ "nsubj",
587
+ "nsubjpass",
588
+ "csubj",
589
+ "csubjpass",
590
+ "dobj",
591
+ "ccomp",
592
+ "xcomp",
593
+ "dative",
594
+ "attr",
595
+ "oprd",
596
+ "acomp",
597
+ ]
598
+ for c in spanroot.children
599
+ )
600
+ argless_span = all(
601
+ c.dep_
602
+ not in [
603
+ "nsubj",
604
+ "nsubjpass",
605
+ "csubj",
606
+ "csubjpass",
607
+ "dobj",
608
+ "ccomp",
609
+ "xcomp",
610
+ "dative",
611
+ "attr",
612
+ "oprd",
613
+ "acomp",
614
+ ]
615
+ for c in span
616
+ )
617
+ argless_span = all(
618
+ c.dep_
619
+ not in [
620
+ "nsubj",
621
+ "nsubjpass",
622
+ "csubj",
623
+ "csubjpass",
624
+ "dobj",
625
+ "ccomp",
626
+ "xcomp",
627
+ "dative",
628
+ "attr",
629
+ "oprd",
630
+ "acomp",
631
+ ]
632
+ for c in span
633
+ )
634
 
635
  ## nesting classifiers
636
  if spanroot.dep_ == "conj":
637
+ while spanroot.dep_ == "conj":
638
  spanroot = spanroot.head
639
 
640
  if spanroot.dep_ == "poss":
641
  head = spanroot.head
642
+ if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
643
  category = "Posessive Noun (Object)"
644
  elif head.dep_ in ["nsubj", "nsubjpass"]:
645
  category = "Posessive Noun (Subject)"
646
  else:
647
  category = "Posessive Noun (Other)"
648
 
649
+ ## Conjunctions
 
650
  # Preconjunctions
651
+ if spanroot.dep_ in ["preconj", "cc"]:
652
  category = "Conjunction"
653
 
654
  ## NOUN PHRASES
655
  # adverbial phrases
656
+ if spanroot.dep_ in ["amod"]:
657
  category = "Adjectival modifier"
658
  # adverbial phrases
659
+ if spanroot.dep_ in ["compound"]:
660
  category = "Compound noun"
661
 
662
  ## Nominal category
663
+ if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
664
  if "acl" in c_dep:
665
  category = "Noun + Complement (Object)"
666
  else:
 
674
 
675
  ## ADJUNCTS
676
  # prep phrases
677
+ if spanroot.dep_ in ["prep", "agent"]:
678
+ category = "Prepositional phrase"
679
 
680
  # adverbial phrases
681
+ if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]:
682
  category = "Adverbial phrase"
683
 
684
  ## Predication patterns
685
+ if spanroot.dep_ in ["acomp", "oprd"]:
686
  if "xcomp" in c_dep:
687
  category = "Subject predicate to-cl"
688
  else:
689
  category = "Adjectival complement"
690
 
691
+ if spanroot.dep_ in ["attr"]:
692
+ subjless = all(
693
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
694
+ for c in spanroot.children
695
+ )
696
 
697
  c_head = [c.dep_ for c in spanroot.head.children]
698
  if "expl" in c_head and "no_det" in span_t_dep_:
 
701
  category = "There is/are + Noun complement"
702
  elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
703
  category = "There is/are + Noun complement"
704
+
705
  elif spanroot.pos_ in ["NOUN", "PRON"]:
706
  if "acl" in c_dep:
707
  category = "Noun + Complement (attr)"
708
  else:
709
  category = "Nominal complement"
710
 
711
+ elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
712
  category = "Main verb 4"
713
 
714
+ elif spanroot.tag_ in ["NNP"]:
715
  category = "Nominal complement"
716
 
717
  ## External comp
718
+ if spanroot.dep_ in ["xcomp"]:
719
+ if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
720
  category = "Adjective complement to-cl"
721
+ if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
722
  category = "Verb complement to-cl"
723
+
724
+ if spanroot.dep_ in ["pcomp"]:
725
+ if (
726
+ str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
727
+ and "ccomp" in c_dep
728
+ ):
729
  category = "Participle + that-cl"
730
  elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
731
  category = "Participle"
 
735
  # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
736
  # category = "Gerund"
737
 
738
+ if spanroot.dep_ in ["neg"]:
739
  category = "Negative particle"
740
+ if spanroot.dep_ in ["aux", "auxpass"]:
741
  category = "Auxiliary"
742
 
743
  # Modal verbs
744
  if spanroot.tag_ == "MD":
745
  category = "Modal auxiliary"
746
 
 
747
  ####################################
748
  ### clausal ####
749
  ####################################
750
+ if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]:
751
+ _check_to = [
752
+ c.dep_
753
+ for c in spanroot.subtree
754
+ if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
755
+ and c.head.dep_ == "xcomp"
756
+ ]
757
+ _check_ing = [
758
+ c.dep_
759
+ for c in spanroot.subtree
760
+ if "Prog" in str(c.morph) and c.dep_ == "xcomp"
761
+ ]
762
+ root_before_ccomp = [
763
+ c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
764
+ ]
765
+
766
+ _check_for_to = [
767
+ "_".join([c.norm_, c.dep_])
768
+ for c in spanroot.subtree
769
+ if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
770
+ ]
771
+ entire_cl = (
772
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
773
+ )
774
 
775
  ## Start with broad category, which is then re-evaluated for specific constructions.
776
+ if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]: #'mark',
777
  ## Adverbial clauses
778
+ subjless = all(
779
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
780
+ for c in spanroot.children
781
+ )
782
+ entire_cl = (
783
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
784
+ )
785
 
786
  ### Finite-adverbial clauses
787
+ if "mark" in span_dep and (
788
+ spanroot.pos_ in ["VERB", "AUX"] or "aux" in span_dep
789
+ ):
790
  category = "Finite adverbial clause"
791
 
792
+ elif "mark" in span_dep and "aux" in span_dep:
793
  category = "Finite adverbial clause"
794
 
795
+ elif (
796
+ "mark" in span_dep
797
+ and spanroot.pos_ in ["VERB", "AUX"]
798
+ and "expl" in c_dep
799
+ ):
800
  category = "Finite adverbial clause"
801
 
802
  elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
803
+ if spanroot.pos_ in ["VERB", "AUX"]:
804
  category = "Finite adverbial clause"
805
 
806
+ elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
807
  category = "Non-finite adv clause 1"
808
 
809
  elif not argmentless:
810
+ category = "Finite adverbial clause"
811
 
812
  ## non-finite
813
+ elif (
814
+ str(spanroot.morph)
815
+ in [
816
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
817
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
818
+ ]
819
+ and "aux" not in c_dep
820
+ ):
821
  # he doing his job
822
  if argmentless:
823
+ # e.g., frankly speaking, strictly speaking
824
  category = "Adverbial Phrase"
825
  else:
826
  category = "Non-finite adv clause 2"
827
 
828
+ elif (
829
+ spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
830
+ ):
831
  category = "Non-finite adv clause 3"
832
+
833
  elif "aux" in c_dep and "TO" in c_tag:
834
  category = "Adverbial Phrase"
835
 
836
+ elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
 
837
  category = "Dependent Verb phrase"
838
+
839
  elif not argmentless:
840
+ category = "Adverbial clause"
841
+
842
  elif spanroot.dep_ == "advcl":
843
+ category = "Adverbial phrase"
844
+
845
  else:
846
  category = "Finite adverbial clause "
847
 
848
+ if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]:
 
849
  head = spanroot.head
850
  if ";" in [t.norm_ for t in head.children]:
851
  category = "Main verb 3"
 
855
 
856
  elif "mark" in span_dep:
857
  category = "Complement clause"
858
+ elif (
859
+ str(spanroot.morph)
860
+ in [
861
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
862
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
863
+ ]
864
+ and "aux" not in c_dep
865
+ ):
866
  category = "Non-finite complement clause"
867
+ elif spanroot.dep_ in ["relcl"]:
868
  category = "Relative clause"
869
+ elif spanroot.dep_ in ["ccomp"]:
870
  category = "Complement clause"
871
+ elif spanroot.dep_ in ["acl"]:
872
  category = "Noun Complement clause"
873
 
874
  ## Specific constructions
875
  # Extraposed that-clause or to-infinitives
876
+ if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
877
+ "VERB",
878
+ "AUX",
879
+ ]:
880
  # print(c_dep)
881
  if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
882
+ # e.g., it seems odd (oprd) that X.
883
+ # e.g., it is certain (acomp) that X.
884
+ category = (
885
+ "Extraposed that-cl (adj-complement)" # e.g., it is certain that X.
886
+ )
887
 
888
  elif "xcomp" in c_dep or ("advcl" in c_dep):
889
  if "for_mark" in _check_for_to:
890
+ category = (
891
+ "Extraposed to-cl (explicit subj)" # eg It is possible to .
892
+ )
893
  elif _check_to:
894
+ category = "Extraposed to-cl 1" # eg It is possible to .
895
  elif _check_ing:
896
+ category = "Extraposed -ing 1" # eg It is possible to .
897
+ elif (
898
+ ("prep" in right_dep or "npadvmod" in right_dep)
899
+ and "ccomp" in right_dep
900
+ and spanroot.lemma_ == "be"
901
+ ):
902
+ category = "Cleft construction"
903
 
904
  elif "attr" in c_dep:
905
+ category = "Extraposed that-cl (copula)" # eg It is a wonder that X.
906
 
907
  else:
908
+ category = "Extraposed that-cl (VERB)"
909
 
910
  # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
911
  # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
912
+ elif (
913
+ "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
914
+ ) and "acomp" in c_dep:
915
  if "xcomp" in c_dep:
916
  if _check_to:
917
+ category = "Extraposed to-cl 2" # eg it is difficult to decide.
918
  elif _check_ing:
919
+ category = "Extraposed -ing 2" # eg it is difficult to decide.
920
+
921
  else:
922
  category = "Extraposed that-cl (adj-complement) 2"
923
 
924
  elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
925
+ category = (
926
+ "Extraposed that-cl (adj-complement) 3" # eg it seems odd that X.
927
+ )
928
 
 
 
 
929
  # something without dummy subject "it"
930
+ elif (
931
+ (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
932
+ and spanroot.pos_ in ["AUX", "VERB"]
933
+ and "it" not in c_norm
934
+ ):
935
  # store xcomp, if the head of the xcomp is acomp
936
+ _check_xcomp = [
937
+ c.dep_
938
+ for c in spanroot.subtree
939
+ if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
940
+ ]
941
+ _check_ccomp = [
942
+ c.dep_
943
+ for c in spanroot.subtree
944
+ if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
945
+ ]
946
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
947
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
948
 
949
  if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
950
  if any(root_before_ccomp):
 
958
  elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
959
  category = "Post-predicate to-cl"
960
 
961
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
962
  category = "Subject predicate to-cl"
963
 
964
  elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
965
  category = "Subject predicate to-cl (passive)"
966
 
967
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
968
  category = "Subject predicate -ing"
969
  elif "ccomp" in c_dep:
970
  category = "Subject predicate that-cl"
 
979
  category = "Main verb 1"
980
 
981
  ## without dummy subject it, and lexical verbs
982
+ elif (
983
+ ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep
984
+ and spanroot.pos_ in ["AUX", "VERB"]
985
+ and "it" not in c_norm
986
+ and spanroot.lemma_ not in ["be"]
987
+ ):
988
+ _check_wh = [
989
+ c.dep_
990
+ for c in spanroot.subtree
991
+ if (
992
+ c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
993
+ and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
994
+ )
995
+ and c.head.dep_ == "ccomp"
996
+ ]
997
+ _check_if = [
998
+ c.dep_
999
+ for c in spanroot.subtree
1000
+ if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
1001
+ and c.head.dep_ == "ccomp"
1002
+ ]
1003
 
1004
  # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
1005
  # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
 
1019
  elif _check_ing:
1020
  category = "Post-predicate -ing"
1021
 
 
 
1022
  # Existential
1023
  elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
1024
  category = "There is/are NOUN"
1025
+
1026
+ elif (
1027
+ "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
1028
+ ):
1029
  category = "Cleft construction"
1030
 
1031
  ### The end of clausal analysis
1032
+
1033
+ if spanroot.dep_ in ["parataxis"]:
1034
+ if "_".join(span_dep) in [
1035
+ "nsubj_parataxis",
1036
+ "aux_parataxis",
1037
+ "nsubj_aux_parataxis",
1038
+ ]:
1039
  category = "Comment clause"
1040
  else:
1041
  category = "Parataxis"
 
1042
 
1043
+ if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
1044
+ if (
1045
+ spanroot.head.dep_ in ["ROOT", "ccomp"]
1046
+ and spanroot.head.pos_ in ["AUX", "VERB"]
1047
+ and spanroot.pos_ in ["AUX", "VERB"]
1048
+ ):
1049
  if spanroot.morph == spanroot.head.morph:
1050
  category = "Main verb 4"
1051
  else:
1052
  category = "Dependent verb 2"
1053
  elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
1054
  category = "Gerund"
1055
+ elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
1056
+ spanroot.morph
1057
+ ):
1058
  category = "Dependent verb 2"
1059
+ elif spanroot.dep_ in ["csubj", "csubjpass"]:
1060
  category = "Dependent verb (csubj)"
1061
 
 
1062
  # Appositive phrases
1063
+ if spanroot.dep_ in ["appos"]:
1064
  if "nummod" in c_dep:
1065
  category = "Apposition"
1066
  if spanroot.pos_ in ["PROPN"]:
 
1068
  elif spanroot.pos_ in ["NOUN"]:
1069
  category = "Appositive Noun Phrase"
1070
  elif spanroot.pos_ in ["VERB", "AUX"]:
1071
+ _check = any(
1072
+ c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
1073
+ for c in spanroot.children
1074
+ )
1075
  if _check:
1076
  category = "Appositive Finite-clause"
 
1077
 
1078
+ if spanroot.dep_ in ["appos", "dep", "attr"]:
1079
+ if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
1080
  category = "Main verb (likely parsing error)"
1081
 
1082
+ # sometimes the dep label is on the conjunction
1083
  if spanroot.dep_ in ["dep", "mark"]:
1084
  if spanroot.tag_ in ["RB", "IN", "CC"]:
1085
  category = "Conjunction"
 
1087
  if spanroot.dep_ in ["intj"]:
1088
  category = "Introjection"
1089
 
1090
+ # sometimes the extra-clausal links are not accurate
1091
+ if (
1092
+ spanroot.dep_
1093
+ in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"]
1094
+ and category is None
1095
+ ):
1096
  if spanroot.head.dep_ == "ROOT":
1097
  category = "Main verb"
1098
  else:
 
1100
 
1101
  if span.label_ == "CITATION":
1102
  if "NNP" in span_tag or "NNPS" in span_tag:
1103
+ if span_dep[0] == "punct" and span_dep[-1] == "punct":
1104
  category = "Parenthetical Citation"
1105
  elif span_tag[0] in ["NNP", "NNPS"]:
1106
  category = "Narrative Citation"
 
1113
  return category
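For orientation, the branches above read spaCy's dependency labels (dep_), fine-grained tags (tag_), and morphology strings (morph) off the span root and its children. A minimal sketch of inspecting those cues on one sentence, assuming an English pipeline such as en_core_web_sm is installed (the exact parse can vary by model and version):

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed model; any English pipeline works
doc = nlp("It is certain that the results hold.")
for tok in doc:
    # these are the attributes the categorizer checks above
    print(tok.text, tok.dep_, tok.pos_, tok.tag_, str(tok.morph))
# Roughly: "It" -> nsubj, "certain" -> acomp, "hold" -> ccomp,
# i.e., the it_nsubj + acomp + ccomp pattern labelled
# "Extraposed that-cl (adj-complement)" above.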
1114
 
1115
 
1116
+ def const_table(
1117
+ doc: Union[spacy.tokens.Doc, Dict[str, str]],
1118
+ spans_key: str = "sc",
1119
+ attrs: List[str] = SPAN_ATTRS,
1120
+ ):
1121
+ columns = attrs + [
1122
+ "Conf. score",
1123
+ "sent no.",
1124
+ "grammatical realization",
1125
+ "span dep",
1126
+ "ner",
1127
+ "POS",
1128
+ "span dep seq",
1129
+ "TAG sequence",
1130
+ "POS sequence",
1131
+ "head",
1132
+ "head dep",
1133
+ "children",
1134
+ "morphology",
1135
+ "sent",
1136
+ ]
1137
  data = []
1138
  # data = span_info_aggregator(doc, columns)
1139
  sentences = {s: i for i, s in enumerate(doc.sents)}
1140
 
1141
+ for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]):
 
1142
  span_info = []
1143
  span_info.extend([str(getattr(span, attr)) for attr in attrs])
1144
 
 
1154
  span_info.append(span.root.head.norm_)
1155
  span_info.append(span.root.head.dep_)
1156
  span_info.append("_".join([c.dep_ for c in span.root.children]))
1157
+ span_info.append(str(span.root.morph))
1158
  span_info.append(span.sent.text.strip())
1159
 
1160
  data.append(span_info)
 
1162
  return data, columns
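A minimal usage sketch for const_table, assuming doc has already been processed by the engagement pipeline so that doc.spans["sc"] and its "scores" attribute are populated (this mirrors how utils/visualize.py builds its table):

import pandas as pd

data, cols = const_table(doc, spans_key="sc")
df = pd.DataFrame(data, columns=cols).astype({"start": int, "end": int})
df = df.sort_values(by="start")  # one row per predicted engagement span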
1163
 
1164
 
1165
+ def ngrammar(seq: list, n=2, concat=False, sep="-"):
1166
  result = []
1167
  n_item = len(seq)
1168
  for idx, item in enumerate(seq):
1169
  if idx + n <= n_item:
1170
  if concat:
1171
+ result.append(sep.join(seq[idx : idx + n]))
1172
  else:
1173
+ result.append(seq[idx : idx + n])
1174
  return result
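For reference, ngrammar slides a window of size n over a label sequence; with concat=True the items in each window are joined with sep:

labels = ["ENTERTAIN", "ATTRIBUTION", "COUNTER", "DENY"]
ngrammar(labels, n=2, concat=True)
# -> ['ENTERTAIN-ATTRIBUTION', 'ATTRIBUTION-COUNTER', 'COUNTER-DENY']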
1175
 
1176
 
1177
  def diversity_values(count_vec: list):
1178
  result = {}
1179
  if len(count_vec) == 0:
1180
+ count_vec = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1181
 
1182
+ result["shannon"] = dv.alpha.shannon(list(count_vec), base=2)
1183
+ result["brillouin_d"] = dv.alpha.brillouin_d(list(count_vec))
1184
+ result["simpson_d"] = 1 - dv.alpha.simpson(list(count_vec))
1185
+ result["simpson_e"] = dv.alpha.simpson_e(list(count_vec))
1186
  # result['gini_index'] = dv.alpha.gini_index(list(count_vec))
1187
  # result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
1188
 
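A short sketch of how diversity_values is consumed downstream; the input is the vector of per-category span counts, and the numbers below are made up for illustration:

counts = [3, 1, 0, 2, 0, 5, 0, 1, 2, 0]  # hypothetical counts over the 10 categories
div = diversity_values(counts)
# div holds "shannon", "brillouin_d", "simpson_d" and "simpson_e";
# utils/visualize.py turns it into a one-row DataFrame for display.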
utils/__pycache__/visualize.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/visualize.cpython-310.pyc and b/utils/__pycache__/visualize.cpython-310.pyc differ
 
utils/visualize.py CHANGED
@@ -17,7 +17,12 @@ import streamlit as st
17
  from spacy_streamlit import visualize_spans
18
  from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
19
 
20
- from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
 
 
 
 
 
21
  from skbio import diversity as dv
22
 
23
  SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
@@ -43,6 +48,9 @@ def visualize_spans(
43
  manual: bool = False,
44
  displacy_options: Optional[Dict] = None,
45
  simple: bool = True,
 
 
 
46
  ):
47
  """
48
  Visualizer for spans.
@@ -100,13 +108,15 @@ def visualize_spans(
100
  df = pd.DataFrame(data, columns=cols)
101
  df = df.astype({"start": int, "end": int})
102
  df = df.sort_values(by= ['start'])
103
- st.subheader("Span information")
 
104
  st.dataframe(
105
  df.style.highlight_between(subset='Conf. score', right=.7))
106
 
107
- if not simple:
 
 
108
  st.subheader("Label counts & Diagnostic confidence score summary")
109
- counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
110
 
111
  print(counts)
112
  print(list(counts))
@@ -119,7 +129,9 @@ def visualize_spans(
119
  st.dataframe(label_counts)
120
  # print(list(label_counts))
121
 
 
122
  sequences = list(df['label_'])
 
123
  # Engagement ngrams
124
  span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
125
  span_trigrams = ngrammar(seq=sequences, n=3, concat=True)
@@ -132,20 +144,26 @@ def visualize_spans(
132
  label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
133
  st.dataframe(label_dep)
134
 
135
- st.subheader('Quantitative results')
 
136
  # st.markdown(
137
  # f"Shannon's index: {dv.alpha.shannon(list(counts), base=2): .3f}")
138
  # st.markdown(
139
  # f"Simpson's e index: {1 - dv.alpha.simpson_e(list(counts)): .3f}")
140
 
 
 
 
 
141
  div = diversity_values(list(counts))
142
  div_data = pd.DataFrame.from_dict(div, orient='index')
143
- st.dataframe(div_data)
144
 
145
- doc_data = pd.concat([counts, div_data], axis = 0).T
146
  filename = "NA"
147
  doc_data.insert(0, "filename", filename, True)
148
  doc_data.insert(1, "nwords", len(doc), True)
149
  st.dataframe(doc_data)
 
150
  # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
151
  # print(dv.get_alpha_diversity_metrics())
 
17
  from spacy_streamlit import visualize_spans
18
  from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
19
 
20
+ from pipeline.post_processors import (
21
+ simple_table,
22
+ const_table,
23
+ ngrammar,
24
+ diversity_values,
25
+ )
26
  from skbio import diversity as dv
27
 
28
  SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))
 
48
  manual: bool = False,
49
  displacy_options: Optional[Dict] = None,
50
  simple: bool = True,
51
+ show_confidence: bool = False,
52
+ show_diversity: bool = False,
53
+ show_ngrams: bool = False,
54
  ):
55
  """
56
  Visualizer for spans.
 
108
  df = pd.DataFrame(data, columns=cols)
109
  df = df.astype({"start": int, "end": int})
110
  df = df.sort_values(by= ['start'])
111
+ st.subheader("Engagement span information")
112
+
113
  st.dataframe(
114
  df.style.highlight_between(subset='Conf. score', right=.7))
115
 
116
+ counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
117
+
118
+ if show_confidence:
119
  st.subheader("Label counts & Diagnostic confidence score summary")
 
120
 
121
  print(counts)
122
  print(list(counts))
 
129
  st.dataframe(label_counts)
130
  # print(list(label_counts))
131
 
132
+ if show_ngrams:
133
  sequences = list(df['label_'])
134
+
135
  # Engagement ngrams
136
  span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
137
  span_trigrams = ngrammar(seq=sequences, n=3, concat=True)
 
144
  label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
145
  st.dataframe(label_dep)
146
 
147
+ if show_diversity:
148
+ st.subheader('Diversity of rhetorical features')
149
  # st.markdown(
150
  # f"Shannon's index: {dv.alpha.shannon(list(counts), base=2): .3f}")
151
  # st.markdown(
152
  # f"Simpson's e index: {1 - dv.alpha.simpson_e(list(counts)): .3f}")
153
 
154
+ st.markdown("##### Entropy based diversity measures")
155
+
156
+ filename = "NA"
157
+
158
  div = diversity_values(list(counts))
159
  div_data = pd.DataFrame.from_dict(div, orient='index')
160
+ # st.dataframe(div_data)
161
 
162
+ doc_data = pd.concat([div_data, counts], axis=0).T
163
  filename = "NA"
164
  doc_data.insert(0, "filename", filename, True)
165
  doc_data.insert(1, "nwords", len(doc), True)
166
  st.dataframe(doc_data)
167
+
168
  # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
169
  # print(dv.get_alpha_diversity_metrics())
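The keyword flags added to visualize_spans above gate the optional summaries. A hypothetical call, assuming the first argument is a processed Doc and that spans_key is accepted as in spacy_streamlit (other display arguments omitted):

visualize_spans(
    doc,                    # Doc with doc.spans["sc"] populated
    spans_key="sc",
    simple=False,
    show_confidence=False,  # hide the confidence-score summary
    show_ngrams=False,      # skip the label bigram/trigram tables
    show_diversity=True,    # show the entropy-based diversity summary
)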