egumasa's picture
new UI
a937724
raw
history blame
1.84 kB
import re
from collections import Counter
def preprocess(text):
    r"""Collapse whitespace in *text* while keeping paragraph breaks.

    The text is cut at every ``"\n\n"``. Inside each piece, every run of
    whitespace (including single newlines) becomes one space. The pieces are
    then re-joined so that each paragraph break is rendered as a single space
    followed by ``"\n\n"``.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    pieces = [re.sub(r"\s+", " ", piece) for piece in text.split("\n\n")]

    # Every piece except the last is followed by a paragraph break. Any
    # trailing space is stripped first so that exactly one space precedes
    # the break, regardless of how the piece ended.
    leading = [piece.rstrip(" ") + " \n\n" for piece in pieces[:-1]]
    return "".join(leading) + pieces[-1]
def delete_overlapping_span(span_sc):
    """Remove competing and multi-sentence spans from ``span_sc`` in place.

    Two rules decide which spans are dropped:

    * Among spans that share a start token, the first one seen is the
      incumbent. A later span replaces it (and the incumbent is dropped)
      only if it has the same label and a strictly higher score; otherwise
      the later span is dropped.
    * Any span that covers more than one sentence is dropped.

    Args:
        span_sc: Mutable sequence of spans (presumably a spaCy ``SpanGroup``
            -- confirm against the caller). Each span must expose ``start``,
            ``label_`` and ``sents``; the container must support
            ``del span_sc[i]`` and provide per-span scores, in span order,
            under ``span_sc.attrs['scores']``.

    Returns:
        None. ``span_sc`` is modified in place. This function does not
        update ``span_sc.attrs['scores']``.
    """
    start_counts = Counter(spn.start for spn in span_sc)

    incumbents = {}  # start token -> (position, label, score) of current keeper
    doomed = set()   # 0-based positions to delete; a set avoids double deletes

    for pos, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        # Only spans whose start token is shared with another span compete.
        if start_counts[spn.start] > 1:
            held = incumbents.get(spn.start)
            if held is None:
                incumbents[spn.start] = (pos, spn.label_, score)
            else:
                held_pos, held_label, held_score = held
                if spn.label_ == held_label and score > held_score:
                    doomed.add(held_pos)
                    incumbents[spn.start] = (pos, spn.label_, score)
                else:
                    doomed.add(pos)

        # Drop spans that run across a sentence boundary.
        if len(list(spn.sents)) > 1:
            doomed.add(pos)

    # Delete from the back so the remaining positions stay valid.
    for pos in sorted(doomed, reverse=True):
        del span_sc[pos]