|
import re |
|
from collections import Counter |
|
|
|
|
|
def preprocess(text):
    """Collapse whitespace inside paragraphs while keeping paragraph breaks.

    Paragraphs are delimited by a blank line (two consecutive newlines).
    Within each paragraph every run of whitespace, including single
    newlines, is replaced by one space. Paragraphs are then re-joined
    with ``"\\n\\n"``.

    For backward compatibility with the previous sentinel-based
    implementation, every paragraph that is followed by a break ends in
    exactly one space (e.g. ``"a\\n\\nb"`` becomes ``"a \\n\\nb"``).

    Args:
        text: The raw input string.

    Returns:
        The normalised string.
    """
    paragraphs = text.split("\n\n")
    last = len(paragraphs) - 1

    # Splitting on the break (instead of swapping it for a magic marker
    # string and back) means input that happens to contain the marker is
    # no longer corrupted.
    cleaned = [
        # The appended space reproduces the separator the old code placed
        # in front of every paragraph break; it merges with any trailing
        # whitespace of the paragraph during the collapse.
        re.sub(r"\s+", " ", para if pos == last else para + " ")
        for pos, para in enumerate(paragraphs)
    ]
    return "\n\n".join(cleaned)
|
|
|
|
|
def delete_overlapping_span(span_sc):
    """Remove redundant spans from ``span_sc`` in place.

    A span is removed when either of the following holds:

    * It starts at the same token as a span seen earlier and does not
      replace it. A later span replaces the currently kept span for that
      start only if it has the same label and a strictly higher score;
      in that case the previously kept span is removed instead.
    * It covers more than one sentence (``len(list(span.sents)) > 1``).

    Args:
        span_sc: A mutable, index-deletable sequence of spans that also
            exposes ``span_sc.attrs['scores']``, an iterable of scores
            aligned with the spans. Each span must provide ``start``,
            ``label_`` and ``sents``.
            NOTE(review): presumably a spaCy ``SpanGroup`` -- confirm
            against the caller.

    Returns:
        None. ``span_sc`` is modified in place. ``span_sc.attrs['scores']``
        is not touched, so it no longer lines up with the remaining spans
        after a deletion.
    """
    # start token -> (index, label, score) of the span currently kept there
    kept_by_start = {}
    # 0-based indices to delete; a set because a span can qualify for
    # removal for more than one reason.
    doomed = set()

    for idx, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        kept = kept_by_start.get(spn.start)
        if kept is None:
            kept_by_start[spn.start] = (idx, spn.label_, score)
        else:
            kept_idx, kept_label, kept_score = kept
            if spn.label_ == kept_label and score > kept_score:
                # Better-scoring span with the same label wins the slot.
                doomed.add(kept_idx)
                kept_by_start[spn.start] = (idx, spn.label_, score)
            else:
                doomed.add(idx)

        if len(list(spn.sents)) > 1:
            doomed.add(idx)

    # Delete from the back so that earlier indices stay valid; this
    # replaces the old "index minus deletions so far" arithmetic, which
    # was off by one and required a sorted, duplicate-free list.
    for idx in sorted(doomed, reverse=True):
        del span_sc[idx]