|
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable |
|
import pandas as pd |
|
import spacy |
|
from spacy.language import Language |
|
from skbio import diversity as dv |
|
|
|
# Span attributes read with getattr() for the leading columns of the tables;
# used as the default ``attrs`` of simple_table() and const_table().
SPAN_ATTRS = ["text", "label_", "start", "end"]

# Upper-case category names. Presumably the span label inventory ("CITATION"
# is the label the construction classifiers test for) -- TODO confirm.
CATEGORIES = [
    "ATTRIBUTION", "CITATION", "COUNTER", "DENY", "ENDOPHORIC",
    "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES",
]
|
|
|
|
|
def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a flat table of the spans stored under ``doc.spans[spans_key]``.

    Each row holds the string form of every attribute named in ``attrs``,
    followed by the span's score taken from the span group's
    ``attrs["scores"]`` (paired with the spans positionally via ``zip``).

    Returns:
        A ``(rows, header)`` tuple; ``header`` is ``attrs`` plus
        ``"Conf. score"``.
    """
    header = attrs + ["Conf. score"]
    rows = []
    for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]):
        row = [str(getattr(span, name)) for name in attrs]
        row.append(score)
        rows.append(row)
    return rows, header
|
|
|
|
|
|
|
|
|
|
|
def construction_classifier(doc, span): |
# Return a label naming the grammatical construction realised by `span`.
# The label comes from a long cascade of checks on the dependency relation,
# POS, tag and morphology of the span root, of its children and of the span's
# own tokens. Several checks test the same relation more than once, so a later
# assignment may replace an earlier one.
#   doc  -- not referenced anywhere in this body.
#   span -- object exposing .root, .label_, .start, .end and token iteration
#           (presumably a spaCy Span -- confirm with the callers).
# Returns the label string; when no check assigned one, the root's dep_.
# NOTE(review): this copy of the file has lost its leading indentation, so the
# nesting of the if/elif/else cascade cannot be read off the text. Restore the
# original layout before changing any logic here.
|
category = None |
|
spanroot = span.root |
|
|
|
|
|
# Per-token views of the span: "norm_dep" pairs, dep labels, norms, tags.
span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span] |
|
span_dep = [t.dep_ for t in span] |
|
# NOTE(review): span_token is never read again in this function.
span_token = [t.norm_ for t in span] |
|
span_tag = [t.tag_ for t in span] |
|
|
|
# NOTE(review): the list bound to `c` is never read; later uses of `c` are
# comprehension/generator loop variables.
c = [c for c in spanroot.children] |
|
# The same kinds of views for the direct children of the span root.
c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children] |
|
|
|
c_norm = [c.norm_ for c in spanroot.children] |
|
c_dep = [c.dep_ for c in spanroot.children] |
|
c_pos = [c.pos_ for c in spanroot.children] |
|
c_tag = [c.tag_ for c in spanroot.children] |
|
|
|
# Dependency labels of the children iterated by spanroot.rights.
right_dep = [c.dep_ for c in spanroot.rights] |
|
|
|
|
|
|
|
# True when no child of the root carries a subject relation.
subjless = all( |
|
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
# True when no child of the root carries any of the listed subject, object
# or complement relations.
argmentless = all( |
|
c.dep_ |
|
not in [ |
|
"nsubj", |
|
"nsubjpass", |
|
"csubj", |
|
"csubjpass", |
|
"dobj", |
|
"ccomp", |
|
"xcomp", |
|
"dative", |
|
"attr", |
|
"oprd", |
|
"acomp", |
|
] |
|
for c in spanroot.children |
|
) |
|
# Same test over the span's own tokens.
# NOTE(review): argless_span is never read again in this function.
argless_span = all( |
|
c.dep_ |
|
not in [ |
|
"nsubj", |
|
"nsubjpass", |
|
"csubj", |
|
"csubjpass", |
|
"dobj", |
|
"ccomp", |
|
"xcomp", |
|
"dative", |
|
"attr", |
|
"oprd", |
|
"acomp", |
|
] |
|
for c in span |
|
) |
|
|
|
|
|
|
|
# For a conjunct, climb through .head until the relation is no longer "conj".
# The child/span lists built above came from the original root and are not
# recomputed after this re-rooting.
if spanroot.dep_ == "conj": |
|
while spanroot.dep_ == "conj": |
|
spanroot = spanroot.head |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Coordinators.
if spanroot.dep_ in ["preconj", "cc"]: |
|
category = "Conjunction" |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Noun-phrase internal modifiers.
if spanroot.dep_ in ["amod"]: |
|
category = "Adjectival modifier" |
|
|
|
|
|
if spanroot.dep_ in ["compound"]: |
|
category = "Compound noun" |
|
|
|
|
|
|
|
|
|
# Objects and subjects; an "acl" child selects the "Noun + Complement" label.
if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]: |
|
if "acl" in c_dep: |
|
category = "Noun + Complement (Object)" |
|
else: |
|
category = "Object" |
|
|
|
|
|
if spanroot.dep_ in ["nsubj", "nsubjpass"]: |
|
if "acl" in c_dep: |
|
category = "Noun + Complement (Subject)" |
|
else: |
|
category = "Subject" |
|
|
|
|
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["prep", "agent"]: |
|
category = "Prepositional phrase" |
|
|
|
|
|
if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]: |
|
category = "Adverbial phrase" |
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["acomp", "oprd"]: |
|
if "xcomp" in c_dep: |
|
category = "Subject predicate to-cl" |
|
else: |
|
category = "Adjectival complement" |
|
|
|
|
|
# attr roots: existential ("expl" among the head's children) and nominal
# complements.
if spanroot.dep_ in ["attr"]: |
|
# Recomputed here: spanroot may have changed in the conj climb above.
subjless = all( |
|
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
|
|
|
|
c_head = [c.dep_ for c in spanroot.head.children] |
|
if "expl" in c_head and "no_det" in span_t_dep_: |
|
category = "There is/are no NOUN" |
|
elif "expl" in c_head and spanroot.pos_ in ["NOUN"]: |
|
category = "There is/are + Noun complement" |
|
elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]: |
|
category = "There is/are + Noun complement" |
|
|
|
|
|
elif spanroot.pos_ in ["NOUN", "PRON"]: |
|
if "acl" in c_dep: |
|
category = "Noun + Complement (attr)" |
|
else: |
|
category = "Nominal complement" |
|
|
|
|
|
elif not subjless and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Main verb 4" |
|
|
|
|
|
elif spanroot.tag_ in ["NNP"]: |
|
category = "Nominal complement" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Clause-level relations. The helper lists bound right below are read by the
# "it"-subject and predicate tests further down, so those tests presumably sit
# inside this check -- confirm against the original indentation.
if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]: |
|
# dep labels of "aux" tokens with POS PART/SCONJ whose head is an xcomp.
_check_to = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) |
|
and c.head.dep_ == "xcomp" |
|
] |
|
# dep labels of xcomp tokens whose morphology string contains "Prog".
_check_ing = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if "Prog" in str(c.morph) and c.dep_ == "xcomp" |
|
] |
|
# One boolean per ccomp child: True when that child follows the root.
root_before_ccomp = [ |
|
c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp" |
|
] |
|
|
|
|
|
# "norm_dep" strings of mark/aux tokens whose head is an advcl.
_check_for_to = [ |
|
"_".join([c.norm_, c.dep_]) |
|
for c in spanroot.subtree |
|
if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux") |
|
] |
|
# NOTE(review): compares right_edge.i with span.end; if span is a spaCy Span,
# .end is exclusive, which would make this an off-by-one -- confirm.
entire_cl = ( |
|
spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end |
|
) |
|
|
|
|
|
|
|
|
|
# Broad adverbial-clause labels.
# NOTE(review): "mark" is not among the relations accepted by the clause-level
# check above; if this test is nested inside it, "mark" can never match here.
if spanroot.dep_ in ["advcl", "mark", "acl", "pcomp"]: |
|
|
|
|
|
|
|
|
|
|
|
|
|
subjless = all( |
|
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
entire_cl = ( |
|
spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end |
|
) |
|
|
|
|
|
if "mark" in span_dep and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Finite adverbial clause" |
|
elif "mark" in span_dep and "aux" in span_dep: |
|
category = "Finite adverbial clause" |
|
|
|
|
|
# NOTE(review): this condition is a special case of the first test of the
# chain, so it cannot be reached.
elif ( |
|
"mark" in span_dep |
|
and spanroot.pos_ in ["VERB", "AUX"] |
|
and "expl" in c_dep |
|
): |
|
category = "Finite adverbial clause" |
|
|
|
|
|
elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag): |
|
if spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Finite adverbial clause" |
|
|
|
|
|
# NOTE(review): without indentation it is unclear whether this elif continues
# the inner pos_ test just above or the outer chain.
elif spanroot.pos_ not in ["VERB", "AUX"] and subjless: |
|
category = "Non-finite adv clause 1" |
|
|
|
|
|
elif entire_cl: |
|
category = "Finite adverbial clause" |
|
|
|
|
|
# Bare present/past participle root (no aux child).
elif ( |
|
str(spanroot.morph) |
|
in [ |
|
"Aspect=Prog|Tense=Pres|VerbForm=Part", |
|
"Aspect=Perf|Tense=Past|VerbForm=Part", |
|
] |
|
and "aux" not in c_dep |
|
): |
|
|
|
|
|
if argmentless: |
|
|
|
|
|
category = "Adverbial Phrase" |
|
else: |
|
category = "Non-finite adv clause 2" |
|
|
|
|
|
elif ( |
|
spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless |
|
): |
|
category = "Non-finite adv clause 3" |
|
|
|
|
|
elif "aux" in c_dep and "TO" in c_tag: |
|
category = "Adverbial Phrase" |
|
|
|
|
|
elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Dependent Verb phrase" |
|
|
|
|
|
elif not argmentless: |
|
category = "Adverbial clause" |
|
|
|
|
|
elif spanroot.dep_ == "advcl": |
|
category = "Adverbial phrase" |
|
|
|
|
|
# Broad relative / complement clause labels.
if spanroot.dep_ in ["relcl", "ccomp", "acl"]: |
|
head = spanroot.head |
|
if ";" in [t.norm_ for t in head.children]: |
|
category = "Main verb 3" |
|
elif "nsubj" not in span_dep: |
|
category = "Dependent verb 1" |
|
elif "mark" in span_dep: |
|
category = "Complement clause" |
|
elif ( |
|
str(spanroot.morph) |
|
in [ |
|
"Aspect=Prog|Tense=Pres|VerbForm=Part", |
|
"Aspect=Perf|Tense=Past|VerbForm=Part", |
|
] |
|
and "aux" not in c_dep |
|
): |
|
category = "Non-finite complement clause" |
|
elif spanroot.dep_ in ["relcl"]: |
|
category = "Relative clause" |
|
elif spanroot.dep_ in ["ccomp"]: |
|
category = "Complement clause" |
|
elif spanroot.dep_ in ["acl"]: |
|
category = "Noun Complement clause" |
# NOTE(review): "this one" looks like a debugging placeholder label.
else: |
|
|
|
category = "this one" |
|
|
|
|
|
|
|
|
|
|
|
# Specific constructions with an "it" subject child (extraposition, clefts).
if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [ |
|
"VERB", |
|
"AUX", |
|
]: |
|
# NOTE(review): leftover debugging output; writes c_dep to stdout.
print(c_dep) |
|
if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep: |
|
|
|
|
|
|
|
category = ( |
|
"Extraposed that-cl (adj-complement)" |
|
) |
|
|
|
|
|
elif "xcomp" in c_dep or ("advcl" in c_dep): |
|
if "for_mark" in _check_for_to: |
|
category = ( |
|
"Extraposed to-cl (explicit subj)" |
|
) |
|
elif _check_to: |
|
category = "Extraposed to-cl 1" |
|
elif _check_ing: |
|
category = "Extraposed -ing 1" |
|
elif ( |
|
("prep" in right_dep or "npadvmod" in right_dep) |
|
and "ccomp" in right_dep |
|
and spanroot.lemma_ == "be" |
|
): |
|
category = "Cleft construction" |
|
|
|
|
|
elif "attr" in c_dep: |
|
category = "Extraposed that-cl (copula)" |
|
|
|
|
|
else: |
|
category = "Extraposed that-cl (VERB)" |
|
|
|
|
|
|
|
|
|
|
|
|
|
elif ( |
|
"it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_ |
|
) and "acomp" in c_dep: |
|
if "xcomp" in c_dep: |
|
if _check_to: |
|
category = "Extraposed to-cl 2" |
|
elif _check_ing: |
|
category = "Extraposed -ing 2" |
|
|
|
|
|
else: |
|
category = "Extraposed that-cl (adj-complement) 2" |
|
|
|
|
|
elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep: |
|
category = ( |
|
"Extraposed that-cl (adj-complement) 3" |
|
) |
|
|
|
|
|
|
|
|
|
# "be" with an nsubj child, or any root with an nsubjpass child, when no child
# has the norm "it".
elif ( |
|
(("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep) |
|
and spanroot.pos_ in ["AUX", "VERB"] |
|
and "it" not in c_norm |
|
): |
|
|
|
|
|
# xcomp / ccomp tokens in the subtree whose head is an acomp.
_check_xcomp = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp" |
|
] |
|
_check_ccomp = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp" |
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep: |
|
if any(root_before_ccomp): |
|
category = "Post-predicate that-cl" |
|
else: |
|
category = "Comment clause" |
|
|
|
|
|
elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp: |
|
category = "Post-predicate that-cl 2" |
|
|
|
|
|
elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp: |
|
category = "Post-predicate to-cl" |
|
|
|
|
|
elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to: |
|
category = "Subject predicate to-cl" |
|
|
|
|
|
elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to: |
|
category = "Subject predicate to-cl (passive)" |
|
|
|
|
|
elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing: |
|
category = "Subject predicate -ing" |
|
elif "ccomp" in c_dep: |
|
category = "Subject predicate that-cl" |
|
elif "acomp" in c_dep: |
|
category = "Adjectival predicate" |
|
|
|
|
|
elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep): |
|
category = "Finite-adverbial clause" |
|
else: |
|
category = "Main verb 1" |
|
|
|
|
|
|
|
|
|
# NOTE(review): `(... or ...) in c_dep` asks whether a bool is an element of
# c_dep, a list of dep_ values, which is never true for string labels -- so
# this branch cannot run as written. The trailing `in c_dep` looks like a
# leftover; confirm the intent before fixing, since it changes the output.
elif ( |
|
("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep |
|
and spanroot.pos_ in ["AUX", "VERB"] |
|
and "it" not in c_norm |
|
and spanroot.lemma_ not in ["be"] |
|
): |
|
# wh-tagged tokens / "whether"/"if" markers whose head is a ccomp.
_check_wh = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if ( |
|
c.dep_ in ["attr", "advmod", "dobj", "nsubj"] |
|
and c.tag_ in ["WP", "WRB", "WDT", "WP$"] |
|
) |
|
and c.head.dep_ == "ccomp" |
|
] |
|
_check_if = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) |
|
and c.head.dep_ == "ccomp" |
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "ccomp" in c_dep and (_check_wh or _check_if): |
|
category = "Post-predicate wh-cl" |
|
|
|
|
|
elif "ccomp" in c_dep: |
|
if any(root_before_ccomp): |
|
category = "Post-predicate that-cl" |
|
else: |
|
category = "Comment clause" |
|
|
|
|
|
elif "xcomp" in c_dep: |
|
if _check_to: |
|
category = "Post-predicate to-cl" |
|
elif _check_ing: |
|
category = "Post-predicate -ing" |
|
|
|
|
|
|
|
|
|
elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep: |
|
category = "There is/are NOUN" |
|
|
|
|
|
elif ( |
|
"ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"] |
|
): |
|
category = "Cleft construction" |
|
|
|
|
|
# Parataxis: three short dep sequences count as comment clauses.
if spanroot.dep_ in ["parataxis"]: |
|
if "_".join(span_dep) in [ |
|
"nsubj_parataxis", |
|
"aux_parataxis", |
|
"nsubj_aux_parataxis", |
|
]: |
|
category = "Comment clause" |
|
else: |
|
category = "parataxis (for now)" |
|
|
|
|
|
|
|
|
|
# xcomp with a "to" aux child, labelled by the POS of its head.
if spanroot.dep_ in ["xcomp"]: |
|
if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_: |
|
category = "Adjective complement to-cl" |
|
if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_: |
|
category = "Verb complement to-cl" |
|
|
|
|
|
if spanroot.dep_ in ["pcomp"]: |
|
if ( |
|
str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] |
|
and "ccomp" in c_dep |
|
): |
|
category = "Participle + that-cl" |
|
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]: |
|
category = "Participle" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["neg"]: |
|
category = "Negative particle" |
|
if spanroot.dep_ in ["aux", "auxpass"]: |
|
category = "Auxiliary" |
|
|
|
|
|
|
|
|
|
if spanroot.tag_ == "MD": |
|
category = "Modal auxiliary" |
|
|
|
|
|
# dep / clausal-subject roots, labelled from head relation and morphology.
if spanroot.dep_ in ["dep", "csubj", "csubjpass"]: |
|
if ( |
|
spanroot.head.dep_ in ["ROOT", "ccomp"] |
|
and spanroot.head.pos_ in ["AUX", "VERB"] |
|
and spanroot.pos_ in ["AUX", "VERB"] |
|
): |
|
if spanroot.morph == spanroot.head.morph: |
|
category = "Main verb 4" |
|
else: |
|
category = "Dependent verb 2" |
|
elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part": |
|
category = "Gerund" |
|
elif spanroot.head.dep_ in ["conj", "acl", "relcl"]: |
|
if spanroot.morph == spanroot.head.morph: |
|
category = "Main verb 4" |
|
else: |
|
category = "Dependent verb 2" |
|
elif "VerbForm=Fin" in str(spanroot.morph): |
|
category = "Dependent verb 2" |
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["appos"]: |
|
if "nummod" in c_dep: |
|
category = "Apposition" |
|
elif spanroot.pos_ in ["PROPN"]: |
|
category = "Appositive Proper Nouns" |
|
elif spanroot.pos_ in ["NOUN"]: |
|
category = "Appositive Noun Phrase" |
|
elif spanroot.pos_ in ["VERB", "AUX"]: |
|
_check = any( |
|
c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
if _check: |
|
category = "Appositive Finite-clause" |
|
|
|
|
|
if spanroot.dep_ in ["appos", "dep", "attr"]: |
|
if not subjless and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Main verb 5" |
|
|
|
|
|
if spanroot.dep_ in ["dep", "mark"]: |
|
if spanroot.tag_ in ["RB", "IN", "CC"]: |
|
category = "Conjunction" |
|
|
|
|
|
|
|
|
|
# NOTE(review): if this check runs unconditionally it replaces labels assigned
# above for aux/auxpass/oprd/appos/xcomp roots; construction_classifier2 in
# this file guards the equivalent check with `category == None`.
if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]: |
|
if spanroot.head.dep_ == "ROOT": |
|
category = "Main verb" |
|
else: |
|
category = "dependent verb 5" |
|
|
|
|
|
# Spans labelled CITATION are sub-typed as parenthetical, narrative or other.
if span.label_ == "CITATION": |
|
if "NNP" in span_tag or "NNPS" in span_tag: |
|
if span_dep[0] == "punct" and span_dep[-1] == "punct": |
|
category = "Parenthetical Citation" |
|
elif span_tag[0] in ["NNP", "NNPS"]: |
|
category = "Narrative Citation" |
|
else: |
|
category = "Other Citation" |
|
|
|
|
|
# Fall back to the raw dependency label of the (possibly re-rooted) root.
# NOTE(review): `is None` is the idiomatic comparison.
if category == None: |
|
category = spanroot.dep_ |
|
|
|
|
|
return category |
|
|
|
|
|
def construction_classifier2(doc, span): |
# Return a label naming the grammatical construction realised by `span`;
# this is the classifier that const_table() in this file calls.
# The label comes from a long cascade of checks on the dependency relation,
# POS, tag and morphology of the span root, of its children and of the span's
# own tokens. Several checks test the same relation more than once, so a later
# assignment may replace an earlier one.
#   doc  -- not referenced anywhere in this body.
#   span -- object exposing .root, .label_, .start, .end and token iteration
#           (presumably a spaCy Span -- confirm with the callers).
# Returns the label string; when no check assigned one, the root's dep_.
# NOTE(review): this copy of the file has lost its leading indentation, so the
# nesting of the if/elif/else cascade cannot be read off the text. Restore the
# original layout before changing any logic here.
|
category = None |
|
spanroot = span.root |
|
|
|
|
|
# Per-token views of the span: "norm_dep" pairs, dep labels, norms, tags.
span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span] |
|
span_dep = [t.dep_ for t in span] |
|
# NOTE(review): span_token is never read again in this function.
span_token = [t.norm_ for t in span] |
|
span_tag = [t.tag_ for t in span] |
|
|
|
# NOTE(review): the list bound to `c` is never read; later uses of `c` are
# comprehension/generator loop variables.
c = [c for c in spanroot.children] |
|
# The same kinds of views for the direct children of the span root.
c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children] |
|
|
|
c_norm = [c.norm_ for c in spanroot.children] |
|
c_dep = [c.dep_ for c in spanroot.children] |
|
c_pos = [c.pos_ for c in spanroot.children] |
|
c_tag = [c.tag_ for c in spanroot.children] |
|
|
|
# Dependency labels of the children iterated by spanroot.rights.
right_dep = [c.dep_ for c in spanroot.rights] |
|
|
|
|
|
|
|
# True when no child of the root carries a subject relation.
subjless = all( |
|
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
# True when no child of the root carries any of the listed subject, object
# or complement relations.
argmentless = all( |
|
c.dep_ |
|
not in [ |
|
"nsubj", |
|
"nsubjpass", |
|
"csubj", |
|
"csubjpass", |
|
"dobj", |
|
"ccomp", |
|
"xcomp", |
|
"dative", |
|
"attr", |
|
"oprd", |
|
"acomp", |
|
] |
|
for c in spanroot.children |
|
) |
|
# Same test over the span's own tokens.
# NOTE(review): argless_span is computed twice in a row with the identical
# expression and is never read afterwards.
argless_span = all( |
|
c.dep_ |
|
not in [ |
|
"nsubj", |
|
"nsubjpass", |
|
"csubj", |
|
"csubjpass", |
|
"dobj", |
|
"ccomp", |
|
"xcomp", |
|
"dative", |
|
"attr", |
|
"oprd", |
|
"acomp", |
|
] |
|
for c in span |
|
) |
|
argless_span = all( |
|
c.dep_ |
|
not in [ |
|
"nsubj", |
|
"nsubjpass", |
|
"csubj", |
|
"csubjpass", |
|
"dobj", |
|
"ccomp", |
|
"xcomp", |
|
"dative", |
|
"attr", |
|
"oprd", |
|
"acomp", |
|
] |
|
for c in span |
|
) |
|
|
|
|
|
|
|
# For a conjunct, climb through .head until the relation is no longer "conj".
# The child/span lists built above came from the original root and are not
# recomputed after this re-rooting.
if spanroot.dep_ == "conj": |
|
while spanroot.dep_ == "conj": |
|
spanroot = spanroot.head |
|
|
|
|
|
# Possessives, labelled by the relation of the possessed head.
# NOTE(review): the labels are spelled "Posessive"; they are runtime output,
# so correcting the spelling changes the returned values.
if spanroot.dep_ == "poss": |
|
head = spanroot.head |
|
if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]: |
|
category = "Posessive Noun (Object)" |
|
elif head.dep_ in ["nsubj", "nsubjpass"]: |
|
category = "Posessive Noun (Subject)" |
|
else: |
|
category = "Posessive Noun (Other)" |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Coordinators.
if spanroot.dep_ in ["preconj", "cc"]: |
|
category = "Conjunction" |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Noun-phrase internal modifiers.
if spanroot.dep_ in ["amod"]: |
|
category = "Adjectival modifier" |
|
|
|
|
|
if spanroot.dep_ in ["compound"]: |
|
category = "Compound noun" |
|
|
|
|
|
|
|
|
|
# Objects and subjects; an "acl" child selects the "Noun + Complement" label.
if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]: |
|
if "acl" in c_dep: |
|
category = "Noun + Complement (Object)" |
|
else: |
|
category = "Object" |
|
|
|
|
|
if spanroot.dep_ in ["nsubj", "nsubjpass"]: |
|
if "acl" in c_dep: |
|
category = "Noun + Complement (Subject)" |
|
else: |
|
category = "Subject" |
|
|
|
|
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["prep", "agent"]: |
|
category = "Prepositional phrase" |
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]: |
|
category = "Adverbial phrase" |
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["acomp", "oprd"]: |
|
if "xcomp" in c_dep: |
|
category = "Subject predicate to-cl" |
|
else: |
|
category = "Adjectival complement" |
|
|
|
|
|
# attr roots: existential ("expl" among the head's children) and nominal
# complements.
if spanroot.dep_ in ["attr"]: |
|
# Recomputed here: spanroot may have changed in the conj climb above.
subjless = all( |
|
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
|
|
|
|
c_head = [c.dep_ for c in spanroot.head.children] |
|
if "expl" in c_head and "no_det" in span_t_dep_: |
|
category = "There is/are no NOUN" |
|
elif "expl" in c_head and spanroot.pos_ in ["NOUN"]: |
|
category = "There is/are + Noun complement" |
|
elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]: |
|
category = "There is/are + Noun complement" |
|
|
|
|
|
elif spanroot.pos_ in ["NOUN", "PRON"]: |
|
if "acl" in c_dep: |
|
category = "Noun + Complement (attr)" |
|
else: |
|
category = "Nominal complement" |
|
|
|
|
|
elif not subjless and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Main verb 4" |
|
|
|
|
|
elif spanroot.tag_ in ["NNP"]: |
|
category = "Nominal complement" |
|
|
|
|
|
|
|
|
|
# xcomp with a "to" aux child, labelled by the POS of its head.
if spanroot.dep_ in ["xcomp"]: |
|
if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_: |
|
category = "Adjective complement to-cl" |
|
if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_: |
|
category = "Verb complement to-cl" |
|
|
|
|
|
if spanroot.dep_ in ["pcomp"]: |
|
if ( |
|
str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] |
|
and "ccomp" in c_dep |
|
): |
|
category = "Participle + that-cl" |
|
elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]: |
|
category = "Participle" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["neg"]: |
|
category = "Negative particle" |
|
if spanroot.dep_ in ["aux", "auxpass"]: |
|
category = "Auxiliary" |
|
|
|
|
|
|
|
|
|
if spanroot.tag_ == "MD": |
|
category = "Modal auxiliary" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Clause-level relations. The helper lists bound right below are read by the
# "it"-subject and predicate tests further down, so those tests presumably sit
# inside this check -- confirm against the original indentation.
if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]: |
|
# dep labels of "aux" tokens with POS PART/SCONJ whose head is an xcomp.
_check_to = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) |
|
and c.head.dep_ == "xcomp" |
|
] |
|
# dep labels of xcomp tokens whose morphology string contains "Prog".
_check_ing = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if "Prog" in str(c.morph) and c.dep_ == "xcomp" |
|
] |
|
# One boolean per ccomp child: True when that child follows the root.
root_before_ccomp = [ |
|
c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp" |
|
] |
|
|
|
|
|
# "norm_dep" strings of mark/aux tokens whose head is an advcl.
_check_for_to = [ |
|
"_".join([c.norm_, c.dep_]) |
|
for c in spanroot.subtree |
|
if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux") |
|
] |
|
# NOTE(review): entire_cl is assigned here and again below but never read in
# this function.
entire_cl = ( |
|
spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end |
|
) |
|
|
|
|
|
|
|
|
|
# Broad adverbial-clause labels.
if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]: |
|
|
|
|
|
subjless = all( |
|
c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
entire_cl = ( |
|
spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end |
|
) |
|
|
|
|
|
|
|
|
|
# NOTE(review): the two elif tests that follow are special cases of this
# condition, so they cannot be reached.
if "mark" in span_dep and ( |
|
spanroot.pos_ in ["VERB", "AUX"] or "aux" in span_dep |
|
): |
|
category = "Finite adverbial clause" |
|
|
|
|
|
elif "mark" in span_dep and "aux" in span_dep: |
|
category = "Finite adverbial clause" |
|
|
|
|
|
elif ( |
|
"mark" in span_dep |
|
and spanroot.pos_ in ["VERB", "AUX"] |
|
and "expl" in c_dep |
|
): |
|
category = "Finite adverbial clause" |
|
|
|
|
|
elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag): |
|
if spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Finite adverbial clause" |
|
|
|
|
|
# NOTE(review): without indentation it is unclear whether this elif continues
# the inner pos_ test just above or the outer chain.
elif spanroot.pos_ not in ["VERB", "AUX"] and subjless: |
|
category = "Non-finite adv clause 1" |
|
|
|
|
|
elif not argmentless: |
|
category = "Finite adverbial clause" |
|
|
|
|
|
|
|
|
|
# Bare present/past participle root (no aux child).
elif ( |
|
str(spanroot.morph) |
|
in [ |
|
"Aspect=Prog|Tense=Pres|VerbForm=Part", |
|
"Aspect=Perf|Tense=Past|VerbForm=Part", |
|
] |
|
and "aux" not in c_dep |
|
): |
|
|
|
|
|
if argmentless: |
|
|
|
|
|
category = "Adverbial Phrase" |
|
else: |
|
category = "Non-finite adv clause 2" |
|
|
|
|
|
elif ( |
|
spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless |
|
): |
|
category = "Non-finite adv clause 3" |
|
|
|
|
|
elif "aux" in c_dep and "TO" in c_tag: |
|
category = "Adverbial Phrase" |
|
|
|
|
|
elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Dependent Verb phrase" |
|
|
|
|
|
# NOTE(review): repeats the `not argmentless` test made earlier; if both sit
# in the same chain this one cannot be reached.
elif not argmentless: |
|
category = "Adverbial clause" |
|
|
|
|
|
elif spanroot.dep_ == "advcl": |
|
category = "Adverbial phrase" |
|
|
|
|
|
# NOTE(review): this label ends with a trailing space, unlike the other
# "Finite adverbial clause" labels.
else: |
|
category = "Finite adverbial clause " |
|
|
|
|
|
# Broad relative / complement clause labels.
if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]: |
|
head = spanroot.head |
|
if ";" in [t.norm_ for t in head.children]: |
|
category = "Main verb 3" |
|
|
|
|
|
elif "nsubj" not in span_dep: |
|
category = "Dependent verb 1" |
|
|
|
|
|
elif "mark" in span_dep: |
|
category = "Complement clause" |
|
elif ( |
|
str(spanroot.morph) |
|
in [ |
|
"Aspect=Prog|Tense=Pres|VerbForm=Part", |
|
"Aspect=Perf|Tense=Past|VerbForm=Part", |
|
] |
|
and "aux" not in c_dep |
|
): |
|
category = "Non-finite complement clause" |
|
elif spanroot.dep_ in ["relcl"]: |
|
category = "Relative clause" |
|
elif spanroot.dep_ in ["ccomp"]: |
|
category = "Complement clause" |
|
elif spanroot.dep_ in ["acl"]: |
|
category = "Noun Complement clause" |
|
|
|
|
|
|
|
|
|
|
|
# Specific constructions with an "it" subject child (extraposition, clefts).
if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [ |
|
"VERB", |
|
"AUX", |
|
]: |
|
|
|
|
|
if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep: |
|
|
|
|
|
|
|
category = ( |
|
"Extraposed that-cl (adj-complement)" |
|
) |
|
|
|
|
|
elif "xcomp" in c_dep or ("advcl" in c_dep): |
|
if "for_mark" in _check_for_to: |
|
category = ( |
|
"Extraposed to-cl (explicit subj)" |
|
) |
|
elif _check_to: |
|
category = "Extraposed to-cl 1" |
|
elif _check_ing: |
|
category = "Extraposed -ing 1" |
|
elif ( |
|
("prep" in right_dep or "npadvmod" in right_dep) |
|
and "ccomp" in right_dep |
|
and spanroot.lemma_ == "be" |
|
): |
|
category = "Cleft construction" |
|
|
|
|
|
elif "attr" in c_dep: |
|
category = "Extraposed that-cl (copula)" |
|
|
|
|
|
else: |
|
category = "Extraposed that-cl (VERB)" |
|
|
|
|
|
|
|
|
|
|
|
|
|
elif ( |
|
"it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_ |
|
) and "acomp" in c_dep: |
|
if "xcomp" in c_dep: |
|
if _check_to: |
|
category = "Extraposed to-cl 2" |
|
elif _check_ing: |
|
category = "Extraposed -ing 2" |
|
|
|
|
|
else: |
|
category = "Extraposed that-cl (adj-complement) 2" |
|
|
|
|
|
elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep: |
|
category = ( |
|
"Extraposed that-cl (adj-complement) 3" |
|
) |
|
|
|
|
|
|
|
|
|
# "be" with an nsubj child, or any root with an nsubjpass child, when no child
# has the norm "it".
elif ( |
|
(("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep) |
|
and spanroot.pos_ in ["AUX", "VERB"] |
|
and "it" not in c_norm |
|
): |
|
|
|
|
|
# xcomp / ccomp tokens in the subtree whose head is an acomp.
_check_xcomp = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp" |
|
] |
|
_check_ccomp = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp" |
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep: |
|
if any(root_before_ccomp): |
|
category = "Post-predicate that-cl" |
|
else: |
|
category = "Comment clause" |
|
|
|
|
|
elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp: |
|
category = "Post-predicate that-cl 2" |
|
|
|
|
|
elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp: |
|
category = "Post-predicate to-cl" |
|
|
|
|
|
elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to: |
|
category = "Subject predicate to-cl" |
|
|
|
|
|
elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to: |
|
category = "Subject predicate to-cl (passive)" |
|
|
|
|
|
elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing: |
|
category = "Subject predicate -ing" |
|
elif "ccomp" in c_dep: |
|
category = "Subject predicate that-cl" |
|
elif "acomp" in c_dep: |
|
category = "Adjectival predicate" |
|
|
|
|
|
elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep): |
|
category = "Finite-adverbial clause" |
|
elif not argmentless and "SCONJ" in c_pos: |
|
category = "Finite-adverbial clause" |
|
else: |
|
category = "Main verb 1" |
|
|
|
|
|
|
|
|
|
# NOTE(review): `(... or ...) in c_dep` asks whether a bool is an element of
# c_dep, a list of dep_ values, which is never true for string labels -- so
# this branch cannot run as written. The trailing `in c_dep` looks like a
# leftover; confirm the intent before fixing, since it changes the output.
elif ( |
|
("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep |
|
and spanroot.pos_ in ["AUX", "VERB"] |
|
and "it" not in c_norm |
|
and spanroot.lemma_ not in ["be"] |
|
): |
|
# wh-tagged tokens / "whether"/"if" markers whose head is a ccomp.
_check_wh = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if ( |
|
c.dep_ in ["attr", "advmod", "dobj", "nsubj"] |
|
and c.tag_ in ["WP", "WRB", "WDT", "WP$"] |
|
) |
|
and c.head.dep_ == "ccomp" |
|
] |
|
_check_if = [ |
|
c.dep_ |
|
for c in spanroot.subtree |
|
if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"]) |
|
and c.head.dep_ == "ccomp" |
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "ccomp" in c_dep and (_check_wh or _check_if): |
|
category = "Post-predicate wh-cl" |
|
|
|
|
|
elif "ccomp" in c_dep: |
|
if any(root_before_ccomp): |
|
category = "Post-predicate that-cl" |
|
else: |
|
category = "Comment clause" |
|
|
|
|
|
elif "xcomp" in c_dep: |
|
if _check_to: |
|
category = "Post-predicate to-cl" |
|
elif _check_ing: |
|
category = "Post-predicate -ing" |
|
|
|
|
|
|
|
|
|
elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep: |
|
category = "There is/are NOUN" |
|
|
|
|
|
elif ( |
|
"ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"] |
|
): |
|
category = "Cleft construction" |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Parataxis: three short dep sequences count as comment clauses.
if spanroot.dep_ in ["parataxis"]: |
|
if "_".join(span_dep) in [ |
|
"nsubj_parataxis", |
|
"aux_parataxis", |
|
"nsubj_aux_parataxis", |
|
]: |
|
category = "Comment clause" |
|
else: |
|
category = "Parataxis" |
|
|
|
|
|
# dep / clausal-subject roots, labelled from head relation and morphology.
if spanroot.dep_ in ["dep", "csubj", "csubjpass"]: |
|
if ( |
|
spanroot.head.dep_ in ["ROOT", "ccomp"] |
|
and spanroot.head.pos_ in ["AUX", "VERB"] |
|
and spanroot.pos_ in ["AUX", "VERB"] |
|
): |
|
if spanroot.morph == spanroot.head.morph: |
|
category = "Main verb 4" |
|
else: |
|
category = "Dependent verb 2" |
|
elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part": |
|
category = "Gerund" |
|
elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str( |
|
spanroot.morph |
|
): |
|
category = "Dependent verb 2" |
|
elif spanroot.dep_ in ["csubj", "csubjpass"]: |
|
category = "Dependent verb (csubj)" |
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["appos"]: |
|
if "nummod" in c_dep: |
|
category = "Apposition" |
|
# NOTE(review): plain `if` here (construction_classifier uses `elif`), so the
# "Apposition" label set just above can be replaced by the POS-based labels.
if spanroot.pos_ in ["PROPN"]: |
|
category = "Appositive Proper Nouns" |
|
elif spanroot.pos_ in ["NOUN"]: |
|
category = "Appositive Noun Phrase" |
|
elif spanroot.pos_ in ["VERB", "AUX"]: |
|
_check = any( |
|
c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"] |
|
for c in spanroot.children |
|
) |
|
if _check: |
|
category = "Appositive Finite-clause" |
|
|
|
|
|
if spanroot.dep_ in ["appos", "dep", "attr"]: |
|
if not subjless and spanroot.pos_ in ["VERB", "AUX"]: |
|
category = "Main verb (likely parsing error)" |
|
|
|
|
|
|
|
|
|
if spanroot.dep_ in ["dep", "mark"]: |
|
if spanroot.tag_ in ["RB", "IN", "CC"]: |
|
category = "Conjunction" |
|
|
|
|
|
if spanroot.dep_ in ["intj"]: |
|
category = "Introjection" |
|
|
|
|
|
|
|
|
|
# Last-resort verb labels, applied only while no label has been assigned.
# NOTE(review): `is None` is the idiomatic comparison.
if ( |
|
spanroot.dep_ |
|
in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"] |
|
and category == None |
|
): |
|
if spanroot.head.dep_ == "ROOT": |
|
category = "Main verb" |
|
else: |
|
category = "dependent verb 5" |
|
|
|
|
|
# Spans labelled CITATION are sub-typed as parenthetical, narrative or other.
if span.label_ == "CITATION": |
|
if "NNP" in span_tag or "NNPS" in span_tag: |
|
if span_dep[0] == "punct" and span_dep[-1] == "punct": |
|
category = "Parenthetical Citation" |
|
elif span_tag[0] in ["NNP", "NNPS"]: |
|
category = "Narrative Citation" |
|
else: |
|
category = "Other Citation" |
|
|
|
|
|
# Fall back to the raw dependency label of the (possibly re-rooted) root.
# NOTE(review): `is None` is the idiomatic comparison.
if category == None: |
|
category = spanroot.dep_ |
|
|
|
|
|
return category |
|
|
|
|
|
def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a detailed table of the spans stored under ``doc.spans[spans_key]``.

    Every row starts with the string form of each attribute named in
    ``attrs`` and the span's score (taken positionally from the span group's
    ``attrs["scores"]``), followed by the sentence index, the label returned
    by ``construction_classifier2`` and dependency / tag / POS / morphology
    details of the span and its root.

    Returns:
        A ``(rows, header)`` tuple whose header names the columns in row order.
    """
    header = attrs + [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]
    rows = []

    # Sentence -> position in doc.sents, so each span can report its sentence.
    sent_number = {}
    for position, sent in enumerate(doc.sents):
        sent_number[sent] = position

    for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs["scores"]):
        row = [str(getattr(span, name)) for name in attrs]
        row += [
            score,
            int(sent_number[span.sent]),
            construction_classifier2(doc, span),
            span.root.dep_,
            span.root.ent_type_,
            # The "POS" column carries the root's fine-grained tag_.
            span.root.tag_,
            "_".join([tok.dep_ for tok in span]),
            "_".join([tok.tag_ for tok in span]),
            "_".join([tok.pos_ for tok in span]),
            span.root.head.norm_,
            span.root.head.dep_,
            "_".join([child.dep_ for child in span.root.children]),
            str(span.root.morph),
            span.sent.text.strip(),
        ]
        rows.append(row)

    return rows, header
|
|
|
|
|
def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return every contiguous run of ``n`` items of ``seq``, left to right.

    Args:
        seq: Sliceable sequence to window over.
        n: Window length; must be at least 1.
        concat: When true, each window is joined into a single string with
            ``sep`` (the items must then be strings); otherwise each window
            is returned as a slice of ``seq``.
        sep: Separator used when ``concat`` is true.

    Returns:
        A list of ``len(seq) - n + 1`` windows, or an empty list when ``seq``
        is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Such values used to yield
            empty or partial slices instead of n-grams.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    # range() is empty when the sequence is shorter than the window.
    windows = [seq[start : start + n] for start in range(len(seq) - n + 1)]
    if concat:
        return [sep.join(window) for window in windows]
    return windows
|
|
|
|
|
def diversity_values(count_vec: list):
    """Compute alpha-diversity indices for a vector of counts.

    An empty ``count_vec`` is replaced by ten zero counts before the indices
    are computed.

    Returns:
        A dict with the keys ``shannon`` (base 2), ``brillouin_d``,
        ``simpson_d`` (one minus ``dv.alpha.simpson``) and ``simpson_e``.
    """
    # len() rather than truthiness, so array-like inputs are handled the same.
    counts = [0] * 10 if len(count_vec) == 0 else count_vec

    return {
        "shannon": dv.alpha.shannon(list(counts), base=2),
        "brillouin_d": dv.alpha.brillouin_d(list(counts)),
        "simpson_d": 1 - dv.alpha.simpson(list(counts)),
        "simpson_e": dv.alpha.simpson_e(list(counts)),
    }
|
|