ValadisCERTH commited on
Commit
4d76748
·
1 Parent(s): 65f09a7

Create datesIdentification

Browse files
Files changed (1) hide show
  1. datesIdentification +213 -0
datesIdentification ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import re
3
+
4
+ from datetime import datetime
5
+
6
+ # Load the large English spaCy pipeline once at import time (used below for DATE entity recognition)
7
+ nlp = spacy.load("en_core_web_lg")
8
+
9
+
10
+ # Define a function to extract dates from text
11
def extract_dates(text):
    """
    Identify date references in ``text``, both numeric and free-text.

    Candidates come from two sources: a fixed list of regular expressions for
    common numeric / month-name formats, and the entities that the
    module-level spaCy pipeline ``nlp`` labels as ``DATE``. A candidate is
    dropped when it is a substring of a candidate collected before it.

    Args:
        text: The string to scan.

    Returns:
        A list of unique date strings in the order they were found (regex
        matches first, pattern by pattern, then spaCy entities).
    """

    # Regex patterns for common date formats. ``\b`` anchors a pattern on word
    # boundaries so it does not start or end in the middle of a longer token.
    date_patterns = [
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # "January 2022" or "Feb 22"
        # Month + 4-digit year: "05/2018", "5-2018", or (two-digit month only)
        # "05 2018". Not anchored with \b, so it can also match inside a
        # longer run of digits.
        r'\d{1,2}[/-]\d{4}|\d{2}\s\d{4}',
    ]

    matches = []
    for pattern in date_patterns:
        for match in re.findall(pattern, text):
            # Skip a match that is contained in (or equal to) an earlier one.
            if all(match not in m for m in matches):
                matches.append(match)

    # Let spaCy contribute dates the regexes cannot express.
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ != 'DATE':
            continue
        date_str = ent.text
        # Same containment check as above, against everything found so far.
        if all(date_str not in m for m in matches):
            matches.append(date_str)

    # The containment checks already reject exact duplicates, so the list is
    # unique as built; returning it directly keeps the order deterministic
    # (a round trip through ``set`` would only shuffle it between runs).
    return matches
51
+
52
def helper_fix_format_date_sf(input_list):
    """
    Turn the first tagged date string of ``input_list`` into the output structure.

    The string is expected to look like ``'day:9, month:5, year:2018'``
    (optionally with further ``key:value`` pairs such as ``time:12:30:45``),
    or to start with ``'INVALID FORMAT'``. Only the first element of
    ``input_list`` is read; any further elements are ignored.

    Args:
        input_list: Non-empty list of tagged date strings.

    Returns:
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]``. Tags present
        in the string keep their string value; missing tags are the integer 0.
        An ``INVALID FORMAT`` string therefore yields all zeros.

    Raises:
        IndexError: If ``input_list`` is empty.
        ValueError: If a pair in a tagged string has no ``:`` separator.
    """
    input_str = input_list[0]

    pairs_dict = {}
    # An invalid marker carries no tags, and its free text may itself contain
    # ", " or ":" (e.g. "the 5th of May, 2018"), so it must not be parsed.
    if not input_str.startswith("INVALID FORMAT"):
        for pair in input_str.split(", "):
            # Split on the first colon only: a time value such as "12:30:45"
            # contains colons of its own.
            key, value = pair.split(":", 1)
            pairs_dict[key] = value

    date_fields = {
        "day": pairs_dict.get("day", 0),
        "month": pairs_dict.get("month", 0),
        "year": pairs_dict.get("year", 0),
    }
    return [{"date": date_fields}]


# Tag templates filled in with ``str.format(dt=<datetime>)``.
_TAGS_DMY = 'day:{dt.day}, month:{dt.month}, year:{dt.year}'
_TAGS_DM = 'day:{dt.day}, month:{dt.month}'
_TAGS_MD = 'month:{dt.month}, day:{dt.day}'
_TAGS_MY = 'month:{dt.month}, year:{dt.year}'
_TAGS_Y = 'year:{dt.year}'
# ``str.format`` cannot call methods, so the time is rendered through the
# datetime format spec instead of ``dt.strftime(...)``.
_TAGS_DMY_TIME = _TAGS_DMY + ', time:{dt:%H:%M:%S}'

# (strptime format, tag template) pairs, tried in this order; the first format
# that parses the whole string wins, so the order decides how ambiguous inputs
# such as "01-02-22" are read.
# NOTE(review): dash-separated dates are read month-first with a 2-digit year
# but day-first with a 4-digit year — confirm this is intended.
_DATE_FORMATS = (
    ('%B %d, %Y', _TAGS_DMY),
    ('%m-%d-%y', _TAGS_DMY),
    ('%d/%m', _TAGS_DM),
    ('%B %d', _TAGS_DM),
    ('%b %d', _TAGS_DM),
    ('%B %Y', _TAGS_MY),
    ('%Y', _TAGS_Y),
    ('%d/%m/%y', _TAGS_DMY),
    ('%B %d, %y', _TAGS_DMY),
    ('%b %d, %y', _TAGS_DMY),
    ('%d-%m-%Y', _TAGS_DMY),
    ('%d/%m/%Y', _TAGS_DMY),
    ('%d-%m-%y', _TAGS_DMY),
    ('%m/%d/%y', _TAGS_DMY),
    ('%m/%d/%Y', _TAGS_DMY),
    ('%m-%d-%Y', _TAGS_DMY),
    ('%d/%m/%Y %H:%M:%S', _TAGS_DMY_TIME),
    ('%d/%m/%y %H:%M:%S', _TAGS_DMY_TIME),
    ('%m/%d/%Y %H:%M:%S', _TAGS_DMY_TIME),
    ('%m/%d/%y %H:%M:%S', _TAGS_DMY_TIME),
    ('%Y-%m-%d', _TAGS_DMY),
    ('%y-%m-%d', _TAGS_DMY),
    ('%m-%d-%Y %H:%M:%S', _TAGS_DMY_TIME),
    ('%m-%d-%y %H:%M:%S', _TAGS_DMY_TIME),
    ('%m-%d', _TAGS_MD),
    ('%d %b %y', _TAGS_DMY),
    ('%d %B %Y', _TAGS_DMY),
    ('%b %Y', _TAGS_MY),
    ('%b %d, %Y', _TAGS_DMY),
    ('%d %B %y', _TAGS_DMY),

    # 09 05 2018
    ('%d %m %Y', _TAGS_DMY),

    # 05/2018, 05-2018, 05 2018, 5/2018, 5-2018, 5 2018
    # (strptime's %m accepts both "5" and "05")
    ('%m %Y', _TAGS_MY),
    ('%m/%Y', _TAGS_MY),
    ('%m-%Y', _TAGS_MY),

    # 05/18, 05-18, 05 18, 5/18, 5-18, 5 18
    ('%m/%y', _TAGS_MY),
    ('%m-%y', _TAGS_MY),
    ('%m %y', _TAGS_MY),

    # 9th May 2018 etc
    ('%dth %B %Y', _TAGS_DMY),
    ('%dth %b %Y', _TAGS_DMY),
    ('%dst %B %Y', _TAGS_DMY),
    ('%dst %b %Y', _TAGS_DMY),
    ('%dnd %B %Y', _TAGS_DMY),
    ('%dnd %b %Y', _TAGS_DMY),
    ('%drd %B %Y', _TAGS_DMY),
    ('%drd %b %Y', _TAGS_DMY),

    # August 9 2018, August 9 18, Jan 1 23, etc.
    ('%B %d %Y', _TAGS_DMY),
    ('%B %d %y', _TAGS_DMY),
    ('%b %d %y', _TAGS_DMY),
    ('%b %d %Y', _TAGS_DMY),
)


def _tag_date(date_str):
    """Return the tagged form of ``date_str``, or an ``INVALID FORMAT`` marker."""
    for fmt, template in _DATE_FORMATS:
        try:
            dt = datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        return template.format(dt=dt)
    return f'INVALID FORMAT: {date_str}'


def convert_dates(date_list):
    """
    Parse the identified date strings and tag their components.

    Each string is matched against a fixed, ordered list of ``strptime``
    formats and rewritten as a tagged string (``'day:9, month:5, year:2018'``);
    a string that matches no format becomes ``'INVALID FORMAT: <string>'``.
    The tagged strings are then handed to ``helper_fix_format_date_sf``, which
    reads only the first one.

    Args:
        date_list: Non-empty list of date strings.

    Returns:
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]`` for the first
        entry of ``date_list``. Parsed components are strings, components the
        matched format does not carry are the integer 0, and an unparseable
        date yields all zeros. A time of day, when present, is parsed but not
        part of the result.

    Raises:
        IndexError: If ``date_list`` is empty.
    """
    output_list = [_tag_date(date_str) for date_str in date_list]
    return helper_fix_format_date_sf(output_list)
170
+
171
+
172
def dates_binding(text):
    """
    Run the full date-identification pipeline on ``text``.

    Extracts the date references, discards references contained in another
    reference, and converts the result only when exactly one reference
    remains.

    Args:
        text: The string to scan for a date.

    Returns:
        On success, the list ``[formatted_dates, identified_dates]`` where
        ``formatted_dates`` is the ``convert_dates`` result and
        ``identified_dates`` is the one-element list with the raw reference.
        Otherwise a tuple ``(0, 'DATES', <code>)`` whose code is one of
        ``'no_date'``, ``'more_dates'``, ``'wrong_date_format'`` or
        ``'unknown_error'``, so the caller can prompt accordingly.
    """
    try:
        # capture the referred dates
        ident_dates = extract_dates(text)

        # Formats like '05 2018' and '09 05 2018' are both recognised, so a
        # single reference can be captured twice; keep only the references
        # that are not contained in a different one.
        identified_dates = [
            elem for elem in ident_dates
            if not any(elem in other for other in ident_dates if elem != other)
        ]

        # exactly one date reference is accepted
        if not identified_dates:
            return (0, 'DATES', 'no_date')
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        formatted_dates = convert_dates(identified_dates)
        first = formatted_dates[0]

        # An unparseable date comes back from convert_dates as a
        # {"date": {...}} dict whose day, month and year are all 0, so a
        # membership test for 'INVALID FORMAT' alone never fires on it (it
        # would only look at the dict's keys). That test is kept for results
        # that carry the marker itself.
        date_fields = first.get("date") if isinstance(first, dict) else None
        nothing_parsed = isinstance(date_fields, dict) and not any(date_fields.values())
        if 'INVALID FORMAT' in first or nothing_parsed:
            return (0, 'DATES', 'wrong_date_format')

        return [formatted_dates, identified_dates]

    # Any failure in the sub-steps maps to a generic error code; interpreter
    # exits and keyboard interrupts are deliberately not swallowed.
    except Exception:
        return (0, 'DATES', 'unknown_error')
212
+
213
+