File size: 9,439 Bytes
4d76748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import spacy
import re

from datetime import datetime

# Load the "en_core_web_lg" spaCy pipeline once, at import time, and share it module-wide.
# extract_dates() runs it over the input text and keeps the entities labelled DATE.
# NOTE(review): an earlier comment said this model ships GloVe embeddings - not verifiable
# from this file; confirm against the installed model version if it matters.
nlp = spacy.load("en_core_web_lg")


# Define a function to extract dates from text
def extract_dates(text):
    """
    Collect date references (numeric and free-text) found in ``text``.

    Candidates come from two sources, in this order: a fixed set of regex patterns,
    then the entities the spaCy pipeline labels as DATE. A candidate is kept only
    when it is not a substring of something that was already kept.

    Returns a list of unique date strings. The list goes through a ``set``, so its
    order is not guaranteed.
    """

    # The \b word boundaries stop a pattern from matching in the middle of a longer token.
    patterns = (
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # "01/01/22", "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # "01/01", "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # "January 1, 2022", "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # "1 January 2022", "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # "January 2022", "Feb 22"
        r'\d{1,2}[/-]\d{4}|\d{2}\s\d{4}'
        # one or two digits + "/" or "-" + four digits ("05/2018", "5-2018"),
        # or exactly two digits + whitespace + four digits ("05 2018")
    )

    collected = []

    def keep_if_new(candidate):
        # Skip anything already contained in a previously kept (possibly longer) reference.
        if not any(candidate in kept for kept in collected):
            collected.append(candidate)

    # Pass 1: regex candidates, pattern by pattern.
    for pattern in patterns:
        for candidate in re.findall(pattern, text):
            keep_if_new(candidate)

    # Pass 2: whatever spaCy's NER tags as a DATE.
    for entity in nlp(text).ents:
        if entity.label_ != 'DATE':
            continue
        keep_if_new(entity.text)

    # Drop exact duplicates before handing the references back.
    return list(set(collected))

def helper_fix_format_date_sf(input_list):
    """
    Turn the first tagged date string of ``input_list`` into a nested date dictionary.

    The string is expected to look like ``"day:1, month:2, year:2022"``: key/value
    pairs separated by ``", "``, with key and value separated by ``":"``. Only the
    ``day``, ``month`` and ``year`` keys are kept; any other key (for example a
    ``time`` tag) is parsed but not returned.

    Args:
        input_list: non-empty list of tagged strings; only element 0 is read.

    Returns:
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]``. Values found in the
        string are returned as strings; a missing key defaults to the integer ``0``.

    Raises:
        IndexError: if ``input_list`` is empty.
        ValueError: if a ``", "``-separated fragment contains no ``":"`` at all.
    """

    input_str = input_list[0]

    # Convert the key-value pairs into a dictionary
    pairs_dict = {}
    for pair in input_str.split(", "):
        # Split on the FIRST colon only: a value may itself contain colons
        # (e.g. "time:12:30:45"), which previously broke the two-name unpacking.
        key, value = pair.split(":", 1)
        pairs_dict[key] = value

    # Ensure all three date keys are present, defaulting the missing ones to 0
    date_parts = {part: pairs_dict.get(part, 0) for part in ("day", "month", "year")}

    return [{"date": date_parts}]


def convert_dates(date_list):
    """
    Parse the identified date strings and tag their components (day, month, year).

    Each string is tried against a table of ``strptime`` formats in table order and
    the first format that parses wins. For ambiguous numeric dates the table order
    therefore decides which number is read as the day and which as the month. A
    string that matches no format is tagged ``'INVALID FORMAT: <string>'``.

    Args:
        date_list: list of date strings (e.g. the output of the extraction step).

    Returns:
        ``[]`` for an empty ``date_list``. Otherwise the result of
        ``helper_fix_format_date_sf`` applied to the tagged strings; that helper reads
        only the first tagged string and returns
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]``.
    """

    # Nothing to convert; the helper below would fail indexing an empty list.
    if not date_list:
        return []

    # Tag templates, filled through str.format(dt=<parsed datetime>).
    day_month_year = 'day:{dt.day}, month:{dt.month}, year:{dt.year}'
    day_month = 'day:{dt.day}, month:{dt.month}'
    month_year = 'month:{dt.month}, year:{dt.year}'
    year_only = 'year:{dt.year}'

    # NOTE: order matters (first match wins) and is unchanged from the original table.
    # Formats using "%-m" / "%-d" were dropped: strptime rejects them with ValueError, so
    # they could never match, and plain "%m" / "%d" already accept non-padded numbers.
    DATE_FORMATS = {
        '%B %d, %Y': day_month_year,
        '%m-%d-%y': day_month_year,
        '%d/%m': day_month,
        '%B %d': day_month,
        '%b %d': day_month,
        '%B %Y': month_year,
        '%Y': year_only,
        '%d/%m/%y': day_month_year,
        '%B %d, %y': day_month_year,
        '%b %d, %y': day_month_year,
        '%d-%m-%Y': day_month_year,
        '%d/%m/%Y': day_month_year,
        '%d-%m-%y': day_month_year,
        '%m/%d/%y': day_month_year,
        '%m/%d/%Y': day_month_year,
        '%m-%d-%Y': day_month_year,

        # Date followed by a time of day. The time is parsed but not tagged: the previous
        # 'time:{dt.strftime("%H:%M:%S")}' template is not valid str.format syntax (it
        # raised AttributeError), and the returned structure only carries day/month/year.
        '%d/%m/%Y %H:%M:%S': day_month_year,
        '%d/%m/%y %H:%M:%S': day_month_year,
        '%m/%d/%Y %H:%M:%S': day_month_year,
        '%m/%d/%y %H:%M:%S': day_month_year,
        '%Y-%m-%d': day_month_year,
        '%y-%m-%d': day_month_year,
        '%m-%d-%Y %H:%M:%S': day_month_year,
        '%m-%d-%y %H:%M:%S': day_month_year,
        '%m-%d': day_month,
        '%d %b %y': day_month_year,
        '%d %B %Y': day_month_year,
        '%b %Y': month_year,
        '%b %d, %Y': day_month_year,
        '%d %B %y': day_month_year,

        # 09 05 2018
        '%d %m %Y': day_month_year,

        # 05/2018, 05-2018, 05 2018, 5/2018, 5-2018, 5 2018
        '%m %Y': month_year,
        '%m/%Y': month_year,
        '%m-%Y': month_year,

        # 05/18, 05-18, 05 18, 5/18, 5-18, 5 18
        '%m/%y': month_year,
        '%m-%y': month_year,
        '%m %y': month_year,

        # 9th May 2018 etc
        '%dth %B %Y': day_month_year,
        '%dth %b %Y': day_month_year,
        '%dst %B %Y': day_month_year,
        '%dst %b %Y': day_month_year,
        '%dnd %B %Y': day_month_year,
        '%dnd %b %Y': day_month_year,
        '%drd %B %Y': day_month_year,
        '%drd %b %Y': day_month_year,

        # August 9 2018, August 9 18, Jan 1 23, etc.
        '%B %d %Y': day_month_year,
        '%B %d %y': day_month_year,
        '%b %d %y': day_month_year,
        '%b %d %Y': day_month_year,
    }

    output_list = []
    for date_str in date_list:
        for fmt, out_fmt in DATE_FORMATS.items():
            try:
                dt = datetime.strptime(date_str, fmt)
            except ValueError:
                # This format does not fit the string; try the next one.
                continue
            output_list.append(out_fmt.format(dt=dt))
            break
        else:
            # No format matched. The old post-loop retry with '%m-%d-%y' / '%d/%m/%y' was
            # removed because both formats are already part of the table above.
            output_list.append(f'INVALID FORMAT: {date_str}')

    return helper_fix_format_date_sf(output_list)


def dates_binding(text):
    """
    Bind together the date sub-components, controlling for zero or multiple date references.

    Args:
        text: the input text to scan for a date reference.

    Returns:
        ``[formatted_dates, identified_dates]`` when exactly one date reference was found
        and parsed, where ``formatted_dates`` is the value returned by ``convert_dates``
        and ``identified_dates`` is the one-element list holding the raw date string.
        Otherwise a tuple ``(0, 'DATES', code)`` where ``code`` is one of:

        * ``'no_date'``            - no date reference was found
        * ``'more_dates'``         - more than one date reference was found
        * ``'wrong_date_format'``  - the reference could not be parsed as a date
        * ``'unknown_error'``      - any other failure
    """

    try:
        # capture the referred dates
        ident_dates = extract_dates(text)

        # formats like '05 2018' and '09 05 2018' are both captured as separate cases;
        # drop every reference that is contained in another, longer reference
        identified_dates = [
            elem for elem in ident_dates
            if not any(elem in other_elem for other_elem in ident_dates if elem != other_elem)
        ]

        # zero references: return the code that prompts for a date
        if not identified_dates:
            return (0, 'DATES', 'no_date')

        # we only accept one date reference
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        try:
            formatted_dates = convert_dates(identified_dates)
        except ValueError:
            # convert_dates handles strptime failures itself; a ValueError escaping it comes
            # from splitting an unparseable free-text reference into key:value pairs.
            return (0, 'DATES', 'wrong_date_format')

        first = formatted_dates[0]
        if isinstance(first, str):
            # Defensive: covers the plain tagged-string form of the result.
            is_invalid = 'INVALID FORMAT' in first
        else:
            # The result is {"date": {"day", "month", "year"}} with 0 for every missing part.
            # `'INVALID FORMAT' in first` would only test the dict KEYS and never match, so
            # treat a result with no parsed part at all as a wrong format instead.
            parsed = first.get("date", {})
            is_invalid = not any(parsed.get(part) for part in ("day", "month", "year"))

        # a wrong date format returns the code that prompts back the proper message
        if is_invalid:
            return (0, 'DATES', 'wrong_date_format')

        return [formatted_dates, identified_dates]

    except Exception:
        # unexpected error: return the appropriate code (to aid returning the correct prompt)
        return (0, 'DATES', 'unknown_error')