import re
from collections import Counter

# Placeholder that temporarily stands in for a paragraph break ("\n\n") while
# all other whitespace is being collapsed.
_PARAGRAPH_SENTINEL = "&&&&&&&&#&#&#&#&"
_WHITESPACE_RUN = re.compile(r"\s+")


def preprocess(text):
    """Collapse whitespace runs to single spaces but keep paragraph breaks.

    Every run of whitespace (single newlines included) becomes one space.
    Each blank-line paragraph break ("\\n\\n") is preserved; a single space is
    left in front of it, e.g. ``"a\\n\\nb"`` becomes ``"a \\n\\nb"``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.

    Note:
        A literal occurrence of the internal sentinel
        ``&&&&&&&&#&#&#&#&`` in ``text`` is also turned into "\\n\\n".
    """
    # Protect paragraph breaks before whitespace is flattened.
    text = text.replace("\n\n", " " + _PARAGRAPH_SENTINEL)
    # Single newlines are whitespace too, so this one pass handles them.
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.replace(_PARAGRAPH_SENTINEL, "\n\n")


def delete_overlapping_span(span_sc) -> None:
    """Remove competing and multi-sentence spans from ``span_sc`` in place.

    Two kinds of spans are removed:

    * Among spans that share the same ``start`` token only one is kept. The
      first span seen is kept initially; a later span replaces it only when
      it has the same ``label_`` and a strictly higher score. In every other
      case (lower or equal score, or a different label) the later span is
      the one removed.
    * Any span covering more than one sentence (``len(list(span.sents)) > 1``).

    Args:
        span_sc: A mutable, index-deletable sequence of spans (presumably a
            spaCy ``SpanGroup`` -- confirm against the caller). Each span must
            expose ``start``, ``label_`` and ``sents``, and the container must
            expose ``attrs["scores"]`` with one score per span, in span order.

    Returns:
        None. ``span_sc`` is modified in place.

    Note:
        ``span_sc.attrs["scores"]`` is not modified here, so after spans are
        removed it no longer lines up positionally with the remaining spans.
        NOTE(review): confirm whether callers expect the scores to be pruned.
    """
    start_counts = Counter(spn.start for spn in span_sc)
    contested_starts = {
        start for start, count in start_counts.items() if count > 1
    }

    # start token -> (position, label, score) of the span currently kept.
    kept_by_start = {}
    # 0-based positions to delete; a set so that a span that is both
    # out-scored and multi-sentence is only removed once.
    doomed = set()

    scored_spans = zip(span_sc, span_sc.attrs["scores"])
    for position, (spn, score) in enumerate(scored_spans):
        if spn.start in contested_starts:
            incumbent = kept_by_start.get(spn.start)
            if incumbent is None:
                kept_by_start[spn.start] = (position, spn.label_, score)
            else:
                kept_position, kept_label, kept_score = incumbent
                if spn.label_ == kept_label and score > kept_score:
                    doomed.add(kept_position)
                    kept_by_start[spn.start] = (position, spn.label_, score)
                else:
                    doomed.add(position)

        # Spans that cross a sentence boundary are always dropped.
        if len(list(spn.sents)) > 1:
            doomed.add(position)

    # Delete from the back so earlier positions stay valid while removing.
    for position in sorted(doomed, reverse=True):
        del span_sc[position]