File size: 4,039 Bytes
5edd591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import re
from collections import Counter
from spacy.tokens import SpanGroup


def preprocess(text):
    """Flatten line breaks and whitespace in *text*, keeping paragraph breaks.

    The substitutions run in a fixed order:

    1. every ``--- Para SEP ---`` marker becomes a single newline;
    2. every pair of consecutive newlines is swapped for a space followed by
       a placeholder token, so it survives the next two steps;
    3. each remaining newline becomes a space;
    4. every run of whitespace collapses to one space;
    5. the placeholder token is turned back into two newlines.

    Because step 2 inserts a space before the placeholder, a paragraph break
    in the result reads as ``" \\n\\n"``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # (pattern, replacement) pairs; the order matters, see the docstring.
    rewrites = (
        ("--- Para SEP ---", '\n'),
        ("\n\n", ' &&&&&&&&#&#&#&#&'),
        ('\n', ' '),
        (r'\s+', " "),
        ('&&&&&&&&#&#&#&#&', '\n\n'),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text


def del_spans(span_sc, indexes: list):
    """Remove entries from *span_sc* for the positions listed in *indexes*.

    Positions are handled from highest to lowest, so a removal never shifts a
    position that is still waiting to be handled. Each distinct position is
    handled exactly once even if it is listed several times; a repeated
    position would otherwise trigger a second deletion that removes an
    unrelated element. *indexes* itself is not modified.

    NOTE(review): for a position ``idx`` the element deleted is
    ``span_sc[idx + 1]``, and positions for which ``idx + 1`` is out of range
    are skipped silently. Presumably this offset compensates for how the
    container's ``__delitem__`` resolves positions -- confirm against the
    spaCy version in use before changing it.

    Args:
        span_sc: Container supporting ``len()`` and ``del span_sc[i]``;
            modified in place.
        indexes: Positions to process, in any order.
    """
    for idx in sorted(set(indexes), reverse=True):
        if idx + 1 < len(span_sc):
            del span_sc[idx + 1]


def delete_overlapping_span(span_sc: "SpanGroup"):
    """Prune spans that share a start token or cover more than one sentence.

    Spans are scanned in group order together with
    ``span_sc.attrs['scores']``. Among spans that share a start token, the
    first one seen is kept provisionally. A later span replaces it only when
    it has the same label and a strictly higher score; otherwise the later
    span is discarded, which includes later spans carrying a different label.
    Independently of that, every span whose ``sents`` yields more than one
    sentence is discarded.

    The collected positions are handed to ``del_spans``, which performs the
    actual removal on *span_sc* in place.

    NOTE(review): ``span_sc.attrs['scores']`` is not pruned here, so after a
    removal it no longer lines up with the remaining spans -- confirm that no
    caller relies on that alignment afterwards.

    Args:
        span_sc: Expected to be a spaCy ``SpanGroup``: it is iterated for
            spans and must expose ``attrs['scores']`` with one score per
            span, in the same order.
    """
    start_counts = Counter(spn.start for spn in span_sc)
    contested = {start for start, count in start_counts.items() if count > 1}

    # A set, so a span that is both out-scored and multi-sentence is queued
    # for deletion only once.
    doomed = set()
    # start token -> (position, score, label) of the span currently kept.
    kept = {}

    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        if spn.start in contested:
            if spn.start not in kept:
                kept[spn.start] = (n, score, spn.label_)
            else:
                kept_n, kept_score, kept_label = kept[spn.start]
                if score > kept_score and spn.label_ == kept_label:
                    # The newcomer wins: drop the previous holder.
                    doomed.add(kept_n)
                    kept[spn.start] = (n, score, spn.label_)
                else:
                    doomed.add(n)

        # delete span beyond sentences
        if len(list(spn.sents)) > 1:
            doomed.add(n)

    del_spans(span_sc, sorted(doomed, reverse=True))


def cleanup_justify(doc, span_sc: "SpanGroup"):
    """Re-anchor JUSTIFYING spans on the full subtree of their root token.

    Every span labelled ``JUSTIFYING`` is indexed by its root token. Then, for
    each indexed root:

    * if the root's head is itself an indexed root, the span is treated as
      dependent on another JUSTIFYING span and is queued for deletion;
    * otherwise a candidate span running from the root's ``left_edge`` to its
      ``right_edge`` (inclusive) is built and labelled ``JUSTIFYING``. If that
      candidate is not already in *span_sc*, it is queued for insertion and
      the original span is queued for deletion; if it is already present, the
      original span is left alone.

    Queued deletions go through ``del_spans``; the new spans are then appended
    to *span_sc* as a ``SpanGroup``.

    NOTE(review): when two JUSTIFYING spans share a root token, only the later
    one is tracked. A root whose head is itself also satisfies the dependency
    test, so its span is removed without a replacement -- confirm both are
    intended.

    Args:
        doc: The document the spans belong to; sliced to build the new spans.
        span_sc: Expected to be a spaCy ``SpanGroup``; modified in place.
    """
    # Root token -> position in span_sc of the JUSTIFYING span it anchors.
    justifies = {}
    for idx, span in enumerate(span_sc):
        if span.label_ == 'JUSTIFYING':
            justifies[span.root] = idx

    stale = []
    new_spans = []
    for spanroot, span_idx in justifies.items():
        # flagging the dependency
        if spanroot.head in justifies:
            stale.append(span_idx)
            continue

        new_span = doc[spanroot.left_edge.i:spanroot.right_edge.i + 1]
        new_span.label_ = "JUSTIFYING"

        # Only swap when the widened span is not already in the group.
        if new_span not in span_sc:
            new_spans.append(new_span)
            stale.append(span_idx)

    del_spans(span_sc, stale)

    span_grp = SpanGroup(doc, spans=new_spans)
    span_sc.extend(span_grp)