# engagement-analyzer-demo2 / pipeline/post_processors.py
# Web-viewer header captured with the file: author "egumasa", commit 866b9fc
# ("detailed summary"), file size 41.3 kB.
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language
from skbio import diversity as dv
# Span attributes read with ``getattr`` to build the leading table columns;
# this list is the default ``attrs`` argument of ``simple_table`` and
# ``const_table``.
SPAN_ATTRS = ["text", "label_", "start", "end"]
# Upper-case category names. Only "CITATION" is compared against
# ``span.label_`` in the code visible in this file, so these are presumably the
# span labels emitted by the upstream span categorizer -- TODO confirm.
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a flat table of the spans stored under ``doc.spans[spans_key]``.

    Each row holds the stringified value of every attribute named in ``attrs``
    followed by the span's score, which is read from the ``"scores"`` entry of
    the span group's ``attrs`` mapping.

    Returns:
        A ``(data, columns)`` tuple: the list of rows and the column names
        (``attrs`` plus ``"Conf. score"``).
    """
    columns = attrs + ["Conf. score"]
    span_group = doc.spans[spans_key]
    data = []
    # zip() pairs every span with its score; iteration stops at the shorter one.
    for span, score in zip(span_group, span_group.attrs["scores"]):
        row = [str(getattr(span, name)) for name in attrs]
        row.append(score)
        data.append(row)
    return data, columns
# def span_info_aggregator()
def construction_classifier(doc, span):
    """Return a label for the grammatical construction realised by ``span``.

    The label is derived from dependency-parse attributes of the span's
    syntactic root (after climbing out of any ``conj`` chain), of that root's
    children, and of the tokens inside the span. Rules are applied in
    sequence, so a later matching rule overrides an earlier one.

    Args:
        doc: Not used by this function.
        span: A spaCy-style span. The function iterates over its tokens and
            reads ``span.root``, ``span.start``, ``span.end`` and
            ``span.label_``.

    Returns:
        A construction label string; when no rule matched, the ``dep_`` of the
        (possibly climbed) root token.
    """
    # NOTE(review): the reviewed copy of this function had lost its
    # indentation; nesting was rebuilt from the branch logic. The few places
    # where more than one nesting was syntactically possible are marked below.
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass")
    argument_deps = subject_deps + (
        "dobj",
        "ccomp",
        "xcomp",
        "dative",
        "attr",
        "oprd",
        "acomp",
    )
    participle_morphs = (
        "Aspect=Prog|Tense=Pres|VerbForm=Part",
        "Aspect=Perf|Tense=Past|VerbForm=Part",
    )
    verbal_pos = ("VERB", "AUX")

    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    # Child features describe the span's own root, i.e. they are collected
    # *before* the climb out of a conj chain below.
    children = list(spanroot.children)
    c_t_dep_ = ["_".join([c.norm_, c.dep_]) for c in children]
    c_norm = [c.norm_ for c in children]
    c_dep = [c.dep_ for c in children]
    c_pos = [c.pos_ for c in children]
    c_tag = [c.tag_ for c in children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(dep not in subject_deps for dep in c_dep)
    argumentless = all(dep not in argument_deps for dep in c_dep)
    has_it_subject = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_

    ## nesting classifiers
    # A conjunct is classified by the role of the first conjunct in its chain.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head
    root_dep = spanroot.dep_

    ## Conjunctions
    if root_dep in ("preconj", "cc"):
        category = "Conjunction"

    ## NOUN PHRASES
    if root_dep == "amod":
        category = "Adjectival modifier"
    if root_dep == "compound":
        category = "Compound noun"

    ## Nominal category
    if root_dep in ("pobj", "dobj", "obj", "iobj", "dative"):
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"
    if root_dep in ("nsubj", "nsubjpass"):
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    if root_dep in ("prep", "agent"):
        category = "Prepositional phrase"
    if root_dep in ("advmod", "npadvmod", "nmod", "npmod", "quantmod"):
        category = "Adverbial phrase"

    ## Predication patterns
    if root_dep in ("acomp", "oprd"):
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if root_dep == "attr":
        # Re-evaluated on the (possibly climbed) root.
        subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif "expl" in c_head and (
            spanroot.pos_ == "NOUN" or spanroot.tag_ in ("NN", "NNS")
        ):
            category = "There is/are + Noun complement"
        elif spanroot.pos_ in ("NOUN", "PRON"):
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"
        elif not subjless and spanroot.pos_ in verbal_pos:
            category = "Main verb 4"
        elif spanroot.tag_ == "NNP":
            category = "Nominal complement"

    ####################################
    ### clausal                      ####
    ####################################
    if root_dep in ("ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"):
        subtree = list(spanroot.subtree)
        # "to" (or similar particle) attached to an xcomp somewhere in the clause
        _check_to = [
            c.dep_
            for c in subtree
            if c.dep_ == "aux"
            and c.pos_ in ("PART", "SCONJ")
            and c.head.dep_ == "xcomp"
        ]
        # progressive-form xcomp somewhere in the clause
        _check_ing = [
            c.dep_ for c in subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        # one flag per ccomp child: True when the ccomp follows the root
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]
        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in subtree
            if c.head.dep_ == "advcl" and c.dep_ in ("mark", "aux")
        ]
        # NOTE(review): span.end is an exclusive index in spaCy while
        # right_edge.i is inclusive, so this is only True when the subtree
        # reaches one token past the span. Left unchanged because it alters
        # which clauses are labelled finite -- confirm the intent.
        entire_cl = (
            spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
        )

        ## Start with broad category, which is then re-evaluated for specific constructions.
        if root_dep in ("advcl", "mark", "acl", "pcomp"):
            ## Adverbial clauses
            subjless = all(c.dep_ not in subject_deps for c in spanroot.children)

            ### Finite-adverbial clauses
            # (a former "mark + verb + expl" branch was dropped: it could never
            # be reached after the "mark + verb" test.)
            if "mark" in span_dep and (
                spanroot.pos_ in verbal_pos or "aux" in span_dep
            ):
                category = "Finite adverbial clause"
            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if spanroot.pos_ in verbal_pos:
                    category = "Finite adverbial clause"
                elif subjless:
                    category = "Non-finite adv clause 1"
            # NOTE(review): nesting reconstructed -- this branch could also
            # belong to the inner WH-adverb chain above.
            elif entire_cl:
                category = "Finite adverbial clause"
            ### Non-finite adverbial clauses
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                # he doing his job
                if argumentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"
            elif (
                spanroot.pos_ not in verbal_pos and "mark" in span_dep and subjless
            ):
                category = "Non-finite adv clause 3"
            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"
            elif "mark" not in span_dep and spanroot.pos_ in verbal_pos:
                category = "Dependent Verb phrase"
            elif not argumentless:
                category = "Adverbial clause"
            elif root_dep == "advcl":
                category = "Adverbial phrase"

        if root_dep in ("relcl", "ccomp", "acl"):
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif root_dep == "relcl":
                category = "Relative clause"
            elif root_dep == "ccomp":
                category = "Complement clause"
            elif root_dep == "acl":
                category = "Noun Complement clause"
            else:
                # NOTE(review): nesting reconstructed. With this nesting the
                # three dep tests above are exhaustive, so this debugging
                # marker is never produced.
                category = "this one"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if has_it_subject and spanroot.pos_ in verbal_pos:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = "Extraposed that-cl (adj-complement)"
            elif "xcomp" in c_dep or "advcl" in c_dep:
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"
            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.
            else:
                category = "Extraposed that-cl (VERB)"  # e.g., it has been shown that X.
        elif has_it_subject and "acomp" in c_dep:
            # NOTE(review): nesting reconstructed -- the ``else`` is paired
            # with the xcomp test rather than with the to/-ing test.
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"
            else:
                category = "Extraposed that-cl (adj-complement) 2"
        elif has_it_subject and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.
        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ == "be") or "nsubjpass" in c_dep)
            and spanroot.pos_ in verbal_pos
            and "it" not in c_norm
        ):
            # xcomp / ccomp whose head is an adjectival complement
            _check_xcomp = [
                c.dep_
                for c in subtree
                if c.dep_ == "xcomp" and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in subtree
                if c.dep_ == "ccomp" and c.head.dep_ == "acomp"
            ]
            has_predicative = "attr" in c_dep or "acomp" in c_dep
            if has_predicative and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif has_predicative and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"
            elif has_predicative and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_to:
                category = "Subject predicate to-cl"
            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"
            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"
        ## without dummy subject it, and lexical verbs
        # The subject test used to be wrapped in a second ``in c_dep``, which
        # compared a bool with the dep strings and therefore never matched.
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep)
            and spanroot.pos_ in verbal_pos
            and "it" not in c_norm
            and spanroot.lemma_ != "be"
        ):
            _check_wh = [
                c.dep_
                for c in subtree
                if c.dep_ in ("attr", "advmod", "dobj", "nsubj")
                and c.tag_ in ("WP", "WRB", "WDT", "WP$")
                and c.head.dep_ == "ccomp"
            ]
            _check_if = [
                c.dep_
                for c in subtree
                if c.dep_ == "mark"
                and c.norm_ in ("whether", "if")
                and c.head.dep_ == "ccomp"
            ]
            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"
            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"
        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"
        elif (
            "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ == "AUX"
        ):
            category = "Cleft construction"

    if root_dep == "parataxis":
        if "_".join(span_dep) in (
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ):
            category = "Comment clause"
        else:
            category = "parataxis (for now)"

    ## External comp
    if root_dep == "xcomp" and "to_aux" in c_t_dep_:
        if spanroot.head.pos_ == "ADJ":
            category = "Adjective complement to-cl"
        elif spanroot.head.pos_ == "VERB":
            category = "Verb complement to-cl"

    if root_dep == "pcomp":
        if str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            if "ccomp" in c_dep:
                category = "Participle + that-cl"
            else:
                category = "Participle"

    ## Simple classifier
    if root_dep == "neg":
        category = "Negative particle"
    if root_dep in ("aux", "auxpass"):
        category = "Auxiliary"
    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    if root_dep in ("dep", "csubj", "csubjpass"):
        if (
            spanroot.head.dep_ in ("ROOT", "ccomp")
            and spanroot.head.pos_ in verbal_pos
            and spanroot.pos_ in verbal_pos
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            category = "Gerund"
        elif spanroot.head.dep_ in ("conj", "acl", "relcl"):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif "VerbForm=Fin" in str(spanroot.morph):
            category = "Dependent verb 2"

    # Appositive phrases
    if root_dep == "appos":
        if "nummod" in c_dep:
            category = "Apposition"
        elif spanroot.pos_ == "PROPN":
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ == "NOUN":
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in verbal_pos:
            if any(c.dep_ in subject_deps for c in spanroot.children):
                category = "Appositive Finite-clause"

    if root_dep in ("appos", "dep", "attr"):
        if not subjless and spanroot.pos_ in verbal_pos:
            category = "Main verb 5"

    if root_dep in ("dep", "mark"):
        if spanroot.tag_ in ("RB", "IN", "CC"):
            category = "Conjunction"

    # sometimes the extra-clausal links are not accurate
    # Fallback only: guarded with ``category is None`` (as in
    # construction_classifier2) so it no longer clobbers the labels assigned
    # above for aux/auxpass/oprd/appos/xcomp roots.
    if (
        root_dep in ("aux", "auxpass", "oprd", "appos", "xcomp")
        and category is None
    ):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    # Citation spans are labelled by their surface shape, overriding the above.
    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ("NNP", "NNPS"):
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    if category is None:
        category = root_dep
    return category
def construction_classifier2(doc, span):
    """Return a label for the grammatical construction realised by ``span``.

    The label is derived from dependency-parse attributes of the span's
    syntactic root (after climbing out of any ``conj`` chain), of that root's
    children, and of the tokens inside the span. Rules are applied in
    sequence, so a later matching rule overrides an earlier one; only the
    final "Main verb" / "dependent verb 5" fallback is restricted to spans
    that are still unlabelled.

    Args:
        doc: Not used by this function.
        span: A spaCy-style span. The function iterates over its tokens and
            reads ``span.root`` and ``span.label_``.

    Returns:
        A construction label string; when no rule matched, the ``dep_`` of the
        (possibly climbed) root token.
    """
    # NOTE(review): the reviewed copy of this function had lost its
    # indentation; nesting was rebuilt from the branch logic. The few places
    # where more than one nesting was syntactically possible are marked below.
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass")
    argument_deps = subject_deps + (
        "dobj",
        "ccomp",
        "xcomp",
        "dative",
        "attr",
        "oprd",
        "acomp",
    )
    object_deps = ("pobj", "dobj", "obj", "iobj", "dative")
    participle_morphs = (
        "Aspect=Prog|Tense=Pres|VerbForm=Part",
        "Aspect=Perf|Tense=Past|VerbForm=Part",
    )
    verbal_pos = ("VERB", "AUX")

    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    # Child features describe the span's own root, i.e. they are collected
    # *before* the climb out of a conj chain below.
    children = list(spanroot.children)
    c_t_dep_ = ["_".join([c.norm_, c.dep_]) for c in children]
    c_norm = [c.norm_ for c in children]
    c_dep = [c.dep_ for c in children]
    c_pos = [c.pos_ for c in children]
    c_tag = [c.tag_ for c in children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(dep not in subject_deps for dep in c_dep)
    argumentless = all(dep not in argument_deps for dep in c_dep)
    has_it_subject = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_

    ## nesting classifiers
    # A conjunct is classified by the role of the first conjunct in its chain.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head
    root_dep = spanroot.dep_

    # Possessives are labelled by the role of the noun they modify.
    if root_dep == "poss":
        head = spanroot.head
        if head.dep_ in object_deps:
            category = "Posessive Noun (Object)"
        elif head.dep_ in ("nsubj", "nsubjpass"):
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    ## Conjunctions
    if root_dep in ("preconj", "cc"):
        category = "Conjunction"

    ## NOUN PHRASES
    if root_dep == "amod":
        category = "Adjectival modifier"
    if root_dep == "compound":
        category = "Compound noun"

    ## Nominal category
    if root_dep in object_deps:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"
    if root_dep in ("nsubj", "nsubjpass"):
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    if root_dep in ("prep", "agent"):
        category = "Prepositional phrase"
    if root_dep in ("advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"):
        category = "Adverbial phrase"

    ## Predication patterns
    if root_dep in ("acomp", "oprd"):
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if root_dep == "attr":
        # Re-evaluated on the (possibly climbed) root.
        subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif "expl" in c_head and (
            spanroot.pos_ == "NOUN" or spanroot.tag_ in ("NN", "NNS")
        ):
            category = "There is/are + Noun complement"
        elif spanroot.pos_ in ("NOUN", "PRON"):
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"
        elif not subjless and spanroot.pos_ in verbal_pos:
            category = "Main verb 4"
        elif spanroot.tag_ == "NNP":
            category = "Nominal complement"

    ## External comp
    if root_dep == "xcomp" and "to_aux" in c_t_dep_:
        if spanroot.head.pos_ == "ADJ":
            category = "Adjective complement to-cl"
        elif spanroot.head.pos_ == "VERB":
            category = "Verb complement to-cl"

    if root_dep == "pcomp":
        if str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            if "ccomp" in c_dep:
                category = "Participle + that-cl"
            else:
                category = "Participle"

    ## Simple classifier
    if root_dep == "neg":
        category = "Negative particle"
    if root_dep in ("aux", "auxpass"):
        category = "Auxiliary"
    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    ####################################
    ### clausal                      ####
    ####################################
    if root_dep in ("ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"):
        subtree = list(spanroot.subtree)
        # "to" (or similar particle) attached to an xcomp somewhere in the clause
        _check_to = [
            c.dep_
            for c in subtree
            if c.dep_ == "aux"
            and c.pos_ in ("PART", "SCONJ")
            and c.head.dep_ == "xcomp"
        ]
        # progressive-form xcomp somewhere in the clause
        _check_ing = [
            c.dep_ for c in subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        # one flag per ccomp child: True when the ccomp follows the root
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]
        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in subtree
            if c.head.dep_ == "advcl" and c.dep_ in ("mark", "aux")
        ]

        ## Start with broad category, which is then re-evaluated for specific constructions.
        if root_dep in ("advcl", "acl", "punct", "pcomp"):
            ## Adverbial clauses
            subjless = all(c.dep_ not in subject_deps for c in spanroot.children)

            ### Finite-adverbial clauses
            # (two former "mark + aux" / "mark + verb + expl" branches were
            # dropped: the test below already covers both.)
            if "mark" in span_dep and (
                spanroot.pos_ in verbal_pos or "aux" in span_dep
            ):
                category = "Finite adverbial clause"
            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if spanroot.pos_ in verbal_pos:
                    category = "Finite adverbial clause"
                elif subjless:
                    category = "Non-finite adv clause 1"
            # NOTE(review): nesting reconstructed -- this branch could also
            # belong to the inner WH-adverb chain above. Placed here it
            # pre-empts the "Non-finite adv clause 2" and "Adverbial clause"
            # outcomes further down; confirm against version control.
            elif not argumentless:
                category = "Finite adverbial clause"
            ## non-finite
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                # he doing his job
                if argumentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"
            elif (
                spanroot.pos_ not in verbal_pos and "mark" in span_dep and subjless
            ):
                category = "Non-finite adv clause 3"
            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"
            elif "mark" not in span_dep and spanroot.pos_ in verbal_pos:
                category = "Dependent Verb phrase"
            elif not argumentless:
                category = "Adverbial clause"
            elif root_dep == "advcl":
                category = "Adverbial phrase"
            else:
                # The trailing space is part of the original label.
                category = "Finite adverbial clause "

        if root_dep in ("relcl", "ccomp", "acl", "punct", "pcomp"):
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif root_dep == "relcl":
                category = "Relative clause"
            elif root_dep == "ccomp":
                category = "Complement clause"
            elif root_dep == "acl":
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if has_it_subject and spanroot.pos_ in verbal_pos:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = "Extraposed that-cl (adj-complement)"
            elif "xcomp" in c_dep or "advcl" in c_dep:
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"
            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.
            else:
                category = "Extraposed that-cl (VERB)"  # e.g., it has been shown that X.
        elif has_it_subject and "acomp" in c_dep:
            # NOTE(review): nesting reconstructed -- the ``else`` is paired
            # with the xcomp test rather than with the to/-ing test.
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"
            else:
                category = "Extraposed that-cl (adj-complement) 2"
        elif has_it_subject and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.
        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ == "be") or "nsubjpass" in c_dep)
            and spanroot.pos_ in verbal_pos
            and "it" not in c_norm
        ):
            # xcomp / ccomp whose head is an adjectival complement
            _check_xcomp = [
                c.dep_
                for c in subtree
                if c.dep_ == "xcomp" and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in subtree
                if c.dep_ == "ccomp" and c.head.dep_ == "acomp"
            ]
            has_predicative = "attr" in c_dep or "acomp" in c_dep
            if has_predicative and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif has_predicative and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"
            elif has_predicative and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_to:
                category = "Subject predicate to-cl"
            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"
            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            elif not argumentless and "SCONJ" in c_pos:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"
        ## without dummy subject it, and lexical verbs
        # The subject test used to be wrapped in a second ``in c_dep``, which
        # compared a bool with the dep strings and therefore never matched.
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep)
            and spanroot.pos_ in verbal_pos
            and "it" not in c_norm
            and spanroot.lemma_ != "be"
        ):
            _check_wh = [
                c.dep_
                for c in subtree
                if c.dep_ in ("attr", "advmod", "dobj", "nsubj")
                and c.tag_ in ("WP", "WRB", "WDT", "WP$")
                and c.head.dep_ == "ccomp"
            ]
            _check_if = [
                c.dep_
                for c in subtree
                if c.dep_ == "mark"
                and c.norm_ in ("whether", "if")
                and c.head.dep_ == "ccomp"
            ]
            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"
            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"
        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"
        elif (
            "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ == "AUX"
        ):
            category = "Cleft construction"
    ### The end of clausal analysis

    if root_dep == "parataxis":
        if "_".join(span_dep) in (
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ):
            category = "Comment clause"
        else:
            category = "Parataxis"

    if root_dep in ("dep", "csubj", "csubjpass"):
        morph_text = str(spanroot.morph)
        if (
            spanroot.head.dep_ in ("ROOT", "ccomp")
            and spanroot.head.pos_ in verbal_pos
            and spanroot.pos_ in verbal_pos
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif morph_text == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            category = "Gerund"
        elif "VerbForm=Fin" in morph_text or "VerbForm=Inf" in morph_text:
            category = "Dependent verb 2"
        elif root_dep in ("csubj", "csubjpass"):
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if root_dep == "appos":
        if "nummod" in c_dep:
            category = "Apposition"
        # Separate ``if`` (as in the source): a PROPN/NOUN root overrides
        # the "Apposition" label set just above.
        if spanroot.pos_ == "PROPN":
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ == "NOUN":
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in verbal_pos:
            if any(c.dep_ in subject_deps for c in spanroot.children):
                category = "Appositive Finite-clause"

    if root_dep in ("appos", "dep", "attr"):
        if not subjless and spanroot.pos_ in verbal_pos:
            category = "Main verb (likely parsing error)"

    # sometimes the dep are on the conjunctions
    if root_dep in ("dep", "mark"):
        if spanroot.tag_ in ("RB", "IN", "CC"):
            category = "Conjunction"

    if root_dep == "intj":
        category = "Introjection"

    # sometimes the extra-clausal links are not accurate
    if (
        root_dep
        in ("aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt")
        and category is None
    ):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    # Citation spans are labelled by their surface shape, overriding the above.
    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ("NNP", "NNPS"):
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    if category is None:
        category = root_dep
    return category
def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a detailed table of the spans stored under ``doc.spans[spans_key]``.

    Every row starts with the stringified ``attrs`` of the span and its score
    (taken from the span group's ``attrs["scores"]``), followed by the index of
    the sentence containing the span, the label returned by
    ``construction_classifier2`` and a set of parse features of the span and
    of its root token.

    Returns:
        A ``(data, columns)`` tuple: the list of rows and the column names.
    """
    columns = attrs + [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]

    # Map each sentence span to its running index within the document.
    sentence_number = {}
    for position, sentence in enumerate(doc.sents):
        sentence_number[sentence] = position

    span_group = doc.spans[spans_key]
    data = []
    for span, score in zip(span_group, span_group.attrs["scores"]):
        root = span.root
        row = [str(getattr(span, name)) for name in attrs]
        row += [
            score,
            int(sentence_number[span.sent]),
            construction_classifier2(doc, span),
            root.dep_,
            root.ent_type_,
            root.tag_,  # reported under the "POS" column
            "_".join(token.dep_ for token in span),
            "_".join(token.tag_ for token in span),
            "_".join(token.pos_ for token in span),
            root.head.norm_,
            root.head.dep_,
            "_".join(child.dep_ for child in root.children),
            str(root.morph),
            span.sent.text.strip(),
        ]
        data.append(row)
    return data, columns
def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return every contiguous window of ``n`` items from ``seq``, in order.

    Args:
        seq: Sequence to slide over (must support ``len`` and slicing).
        n: Window size.
        concat: When true, each window is joined into one string with ``sep``;
            otherwise each window is returned as a slice of ``seq``.
        sep: Separator used when ``concat`` is true.

    Returns:
        A list of windows; empty when ``seq`` holds fewer than ``n`` items.
    """
    total = len(seq)
    # Only start positions whose window fits entirely inside the sequence.
    windows = (
        seq[start : start + n] for start in range(total) if start + n <= total
    )
    if concat:
        return [sep.join(window) for window in windows]
    return list(windows)
def diversity_values(count_vec: list):
    """Compute alpha-diversity measures for a vector of counts.

    An empty ``count_vec`` is replaced by ten zeros before the measures are
    computed.

    Returns:
        A dict with the keys ``"shannon"`` (base 2), ``"brillouin_d"``,
        ``"simpson_d"`` (one minus the value returned by
        ``dv.alpha.simpson``) and ``"simpson_e"``.
    """
    counts = list(count_vec) if len(count_vec) != 0 else [0] * 10
    alpha = dv.alpha
    return {
        "shannon": alpha.shannon(counts, base=2),
        "brillouin_d": alpha.brillouin_d(counts),
        "simpson_d": 1 - alpha.simpson(counts),
        "simpson_e": alpha.simpson_e(counts),
    }