SHSH0819 committed on
Commit
10058cd
·
1 Parent(s): 9db14d3

Upload event_detection_dataclean.py

Browse files
Files changed (1) hide show
  1. event_detection_dataclean.py +118 -0
event_detection_dataclean.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ from collections import Counter
4
+
5
+
6
def load_texttag_file(texttag_filename):
    """Load a token/tag file and drop tokens that contain noisy characters.

    The file is read as sentences separated by an empty line (the separator
    line may contain a single tab). Every non-empty line inside a sentence
    must be ``token<TAB>tag``. A token is discarded, together with its tag,
    when it matches the noise pattern below (URLs, ``@``, ``?``, ``!``,
    ``+``, ``:``, ``|``, brackets, slashes, backslashes); kept tokens are
    lower-cased.

    Args:
        texttag_filename: Path of the tab-separated token/tag file.

    Returns:
        A tuple ``(texts_selected, tags_selected, tags_all)``:
        ``texts_selected`` holds one list of kept tokens per sentence,
        ``tags_selected`` holds the matching list of tags per sentence, and
        ``tags_all`` is the flat list of every kept tag. A sentence whose
        tokens were all discarded still contributes an empty list to the
        first two results. If the file does not exist, a message is printed
        and three empty lists are returned.

    Raises:
        ValueError: If a non-empty line does not split into exactly two
            tab-separated fields.
    """
    # Created up front so the "file not found" path can return them too.
    texts_selected = []
    tags_selected = []
    tags_all = []

    # Compiled once instead of being re-parsed for every token.
    noise_pattern = re.compile(
        r"[@|?|!+?|:|(|)]|\\|\.*?\|-|/|/|/.*?/|http\S+|www\S+"
    )

    # Only the read can raise FileNotFoundError, so keep the try body small.
    try:
        # Explicit encoding so the result does not depend on the locale.
        with open(texttag_filename, "r", encoding="utf-8") as data_file:
            data_all = data_file.read()
    except FileNotFoundError as error:
        print("Sorry, the file " + str(texttag_filename) + " does not exist.")
        print("error: " + str(error))
        return texts_selected, tags_selected, tags_all

    for sentence in re.split(r'\n\t?\n', data_all):
        if not sentence:
            continue
        texts_line = []
        tags_line = []
        for item in sentence.split("\n"):
            if not item:
                continue
            text, tag = item.split("\t")
            if noise_pattern.search(text) is None:
                texts_line.append(text.lower())
                tags_line.append(tag)
                tags_all.append(tag)

        texts_selected.append(texts_line)
        tags_selected.append(tags_line)

    return texts_selected, tags_selected, tags_all
34
+
35
+
36
def tag_ids_map(tags_all, tags2ids_name, ids2tags_name):
    """Build tag<->id lookup tables and write both to JSON files.

    The distinct tags are sorted and numbered consecutively from 0.

    Args:
        tags_all: Iterable of tag strings; duplicates are allowed.
        tags2ids_name: Path of the JSON file receiving the tag -> id mapping.
        ids2tags_name: Path of the JSON file receiving the id -> tag mapping
            (JSON stores the integer ids as string keys).

    Returns:
        A tuple ``(tags2ids, ids2tags)`` of dictionaries.
    """
    ordered_tags = sorted(set(tags_all))
    tags2ids = {tag: index for index, tag in enumerate(ordered_tags)}
    ids2tags = dict(enumerate(ordered_tags))

    # Written in the same order as before: tag -> id first, then id -> tag.
    for path, mapping in ((tags2ids_name, tags2ids), (ids2tags_name, ids2tags)):
        with open(path, "w") as handle:
            json.dump(mapping, handle)

    return tags2ids, ids2tags
52
+
53
+
54
def add_tagids(tags_selected, tags2ids, ids2tags):
    """Translate every tag of every sentence into its integer id.

    Args:
        tags_selected: List of sentences, each a list of tag strings.
        tags2ids: Mapping from tag string to id.
        ids2tags: Unused; accepted so the signature matches the other
            tagging helpers.

    Returns:
        A list with the same nesting as ``tags_selected`` holding the ids.

    Raises:
        KeyError: If a tag is missing from ``tags2ids``.
    """
    return [
        [tags2ids[tag] for tag in sentence_tags]
        for sentence_tags in tags_selected
    ]
63
+
64
+
65
def add_text_tagid(tags_selected, tags2ids, ids2tags):
    """Pick one sentence-level tag (and its id) for every sentence.

    Selection rule per sentence:

    * exactly one distinct tag -> that tag;
    * several distinct tags -> ``"O"`` is dropped (if present) and the
      lexicographically greatest remaining tag is chosen;
    * no tags at all -> the sentence gets empty tag/id lists, so the
      results stay aligned with ``tags_selected`` instead of raising on
      ``max()`` of an empty collection.

    Args:
        tags_selected: List of sentences, each a list of tag strings.
        tags2ids: Mapping from tag string to id.
        ids2tags: Unused; accepted so the signature matches the other
            tagging helpers.

    Returns:
        A tuple ``(tags_chunk, tagids_chunk)``. Each has one entry per
        sentence: a one-element list holding the chosen tag (or its id), or
        an empty list for a sentence without tags.

    Raises:
        KeyError: If the chosen tag is missing from ``tags2ids``.
    """
    tags_chunk = []
    tagids_chunk = []

    for tags_line in tags_selected:
        distinct_tags = set(tags_line)
        if len(distinct_tags) > 1:
            # "O" only wins when it is the sole tag of the sentence.
            distinct_tags.discard("O")

        if distinct_tags:
            # NOTE(review): this picks the lexicographically greatest tag and
            # ignores how often each tag occurs, as the original code did;
            # confirm whether "most frequent" was intended.
            chosen = max(distinct_tags)
            tags_chunk.append([chosen])
            tagids_chunk.append([tags2ids[chosen]])
        else:
            tags_chunk.append([])
            tagids_chunk.append([])

    return tags_chunk, tagids_chunk
84
+
85
def save_json(json_filename, texts_selected, tags_selected, tagids_selected,
              tags_chunk, tagids_chunk, max_items=None):
    """Write the cleaned sentences and their tags to a JSON file.

    Every sentence becomes one dictionary with the keys ``text``,
    ``word_tag``, ``word_tag_id``, ``text_tag`` and ``text_tag_id``; the
    file contains the list of these dictionaries.

    All sentences are written. The previous hard-coded limit of 32 records
    silently truncated larger datasets and raised ``IndexError`` on smaller
    ones; pass ``max_items`` to cap the output deliberately.

    Args:
        json_filename: Path of the JSON file to write.
        texts_selected: Tokens per sentence.
        tags_selected: Word-level tags per sentence.
        tagids_selected: Word-level tag ids per sentence.
        tags_chunk: Sentence-level tag per sentence.
        tagids_chunk: Sentence-level tag id per sentence.
        max_items: Optional upper bound on the number of sentences written;
            ``None`` (the default) writes all of them.

    Returns:
        None.

    Raises:
        IndexError: If any of the other lists is shorter than the number of
            sentences to be written.
    """
    count = len(texts_selected)
    if max_items is not None:
        count = min(count, max(max_items, 0))

    columns = (texts_selected, tags_selected, tagids_selected,
               tags_chunk, tagids_chunk)
    # Fail loudly rather than let zip() drop sentences on a length mismatch.
    if any(len(column) < count for column in columns):
        raise IndexError(
            "every input list needs at least as many entries as are written"
        )

    save_datalist = [
        {
            "text": text,
            "word_tag": word_tag,
            "word_tag_id": word_tag_id,
            "text_tag": text_tag,
            "text_tag_id": text_tag_id,
        }
        for text, word_tag, word_tag_id, text_tag, text_tag_id
        in zip(*(column[:count] for column in columns))
    ]

    with open(json_filename, 'w') as file:
        json.dump(save_datalist, file)

    return
102
+
103
def main(data_filename, json_filename, tags2ids_name, ids2tags_name):
    """Run the cleaning pipeline from a raw token/tag file to JSON output.

    Steps, in order: load and filter the raw file, build and save the
    tag/id mappings, compute word-level tag ids, compute sentence-level
    tags and ids, then save everything to ``json_filename``.

    Args:
        data_filename: Path of the raw tab-separated token/tag file.
        json_filename: Path of the JSON file receiving the cleaned records.
        tags2ids_name: Path of the JSON file receiving the tag -> id mapping.
        ids2tags_name: Path of the JSON file receiving the id -> tag mapping.
    """
    texts, word_tags, every_tag = load_texttag_file(data_filename)
    tag_to_id, id_to_tag = tag_ids_map(every_tag, tags2ids_name, ids2tags_name)

    word_tag_ids = add_tagids(word_tags, tag_to_id, id_to_tag)
    text_tags, text_tag_ids = add_text_tagid(word_tags, tag_to_id, id_to_tag)

    save_json(
        json_filename,
        texts,
        word_tags,
        word_tag_ids,
        text_tags,
        text_tag_ids,
    )
111
+
112
+
113
if __name__ == "__main__":
    # Script entry point: clean the dev split and write its JSON output
    # together with both tag/id mapping files.
    test_raw = "../data/raw_EDT/Event_detection/dev.txt"
    test_save = '../data/raw_EDT/Event_detection/dev.json'
    tags2ids_name = "../data/raw_EDT/Event_detection/tags2ids.json"
    ids2tags_name = "../data/raw_EDT/Event_detection/ids2tags.json"
    main(
        data_filename=test_raw,
        json_filename=test_save,
        tags2ids_name=tags2ids_name,
        ids2tags_name=ids2tags_name,
    )