Spaces:
Runtime error
Runtime error
import spacy | |
import re | |
from datetime import datetime | |
# Load the large English spaCy pipeline once, at import time.
# extract_dates() runs it over the input text and keeps the entities labelled DATE.
nlp = spacy.load("en_core_web_lg")
# Define a function to extract dates from text | |
def extract_dates(text):
    """
    Identify dates both in numeric and free-text form, using date regex patterns and the NER tagger.

    The regex patterns run first, in the order listed below; a regex hit is kept only if it is
    not a substring of a hit that was already kept. Afterwards every entity that the
    module-level spaCy pipeline ``nlp`` labels ``DATE`` is added under the same substring rule.

    Args:
        text: the text to scan for date references.

    Returns:
        A list of unique date strings in discovery order (regex hits first, then spaCy entities).
    """
    # Every pattern is anchored with \b word boundaries so that a date is never carved out of
    # the middle of a longer run of digits or letters.
    date_patterns = [
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # "January 2022" or "Feb 22"
        # "05/2018", "05-2018", "5/2018", "5-2018" or "05 2018" (two-digit month when
        # separated by whitespace). The alternation is grouped so that both branches share
        # the word-boundary anchors; without them "18 2019" would match inside "2018 2019".
        r'\b(?:\d{1,2}[/-]\d{4}|\d{2}\s\d{4})\b',
    ]

    # Collect regex hits, skipping any hit that is already covered by a longer, earlier one.
    matches = []
    for pattern in date_patterns:
        for match in re.findall(pattern, text):
            if all(match not in kept for kept in matches):
                matches.append(match)

    # Let spaCy contribute the date references that the regexes did not cover.
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'DATE' and all(ent.text not in kept for kept in matches):
            matches.append(ent.text)

    # The substring test above already rejects exact duplicates (a string is a substring of
    # itself), so the list is unique as built; returning it directly keeps a stable order.
    return matches
def helper_fix_format_date_sf(input_list):
    """
    Turn the first tagged date string of ``input_list`` into a one-element list holding a date dictionary.

    The expected string looks like ``"day:1, month:2, year:2022"``: ``key:value`` pairs
    separated by a comma and a space. Only the ``day``, ``month`` and ``year`` tags are kept.
    A tag that is present keeps the string value found in the input; a tag that is missing
    is reported as the integer ``0``.

    Args:
        input_list: list whose first element is the tagged date string. Any further
            elements are ignored.

    Returns:
        ``[{"date": {"day": ..., "month": ..., "year": ...}}]`` for a tagged string.
        An empty list yields ``[]``. If the first element is an ``"INVALID FORMAT..."``
        marker, ``input_list`` is returned unchanged so that the marker stays visible
        to the caller instead of being turned into an all-zero date.

    Raises:
        ValueError: if a pair of a tagged string contains no ``:`` separator.
    """
    if not input_list:
        return []

    input_str = input_list[0]
    if input_str.startswith("INVALID FORMAT"):
        return input_list

    # Convert the key-value pairs into a dictionary
    pairs_dict = {}
    for pair in input_str.split(", "):
        # Split on the first colon only: a value such as "10:30:00" contains colons itself.
        key, value = pair.split(":", 1)
        pairs_dict[key] = value

    # Build the date dictionary, ensuring all three keys are present
    date_parts = {part: pairs_dict.get(part, 0) for part in ("day", "month", "year")}
    return [{"date": date_parts}]
def convert_dates(date_list):
    """
    Parse each identified date string and tag its parts (which is the day, the month, the year, ...).

    Every string is tried against the ``strptime`` formats of ``DATE_FORMATS`` in table
    order and the first format that parses wins, so ambiguous numeric dates (day-first
    versus month-first) are resolved purely by the order of the table. A successful parse
    is rendered as a tagged string such as ``"day:1, month:2, year:2022"``; a string that
    matches no format is rendered as ``"INVALID FORMAT: <date_str>"``.

    Args:
        date_list: list of date strings to convert.

    Returns:
        Whatever ``helper_fix_format_date_sf`` returns for the list of rendered strings
        (that helper reads only the first entry of the list).
    """
    # Output templates, filled in through str.format(dt=<parsed datetime>).
    full_date = 'day:{dt.day}, month:{dt.month}, year:{dt.year}'
    day_month = 'day:{dt.day}, month:{dt.month}'
    month_day = 'month:{dt.month}, day:{dt.day}'
    month_year = 'month:{dt.month}, year:{dt.year}'
    year_only = 'year:{dt.year}'
    # str.format cannot call methods inside a replacement field, so the time is rendered
    # through the datetime format spec ({dt:%H:%M:%S}) rather than a dt.strftime(...) call.
    full_date_time = full_date + ', time:{dt:%H:%M:%S}'

    # NOTE: strptime does not accept the strftime-only "%-m" / "%-d" directives (it raises
    # ValueError for them), so no such entries are listed; "%m" and "%d" already accept
    # values without a leading zero.
    DATE_FORMATS = {
        '%B %d, %Y': full_date,
        '%m-%d-%y': full_date,
        '%d/%m': day_month,
        '%B %d': day_month,
        '%b %d': day_month,
        '%B %Y': month_year,
        '%Y': year_only,
        '%d/%m/%y': full_date,
        '%B %d, %y': full_date,
        '%b %d, %y': full_date,
        '%d-%m-%Y': full_date,
        '%d/%m/%Y': full_date,
        '%d-%m-%y': full_date,
        '%m/%d/%y': full_date,
        '%m/%d/%Y': full_date,
        '%m-%d-%Y': full_date,
        '%d/%m/%Y %H:%M:%S': full_date_time,
        '%d/%m/%y %H:%M:%S': full_date_time,
        '%m/%d/%Y %H:%M:%S': full_date_time,
        '%m/%d/%y %H:%M:%S': full_date_time,
        '%Y-%m-%d': full_date,
        '%y-%m-%d': full_date,
        '%m-%d-%Y %H:%M:%S': full_date_time,
        '%m-%d-%y %H:%M:%S': full_date_time,
        '%m-%d': month_day,
        '%d %b %y': full_date,
        '%d %B %Y': full_date,
        '%b %Y': month_year,
        '%b %d, %Y': full_date,
        '%d %B %y': full_date,
        # 09 05 2018
        '%d %m %Y': full_date,
        # 05/2018, 05-2018, 05 2018, 5/2018, 5-2018, 5 2018
        '%m %Y': month_year,
        '%m/%Y': month_year,
        '%m-%Y': month_year,
        # 05/18, 05-18, 05 18, 5/18, 5-18, 5 18
        '%m/%y': month_year,
        '%m-%y': month_year,
        '%m %y': month_year,
        # 9th May 2018 etc
        '%dth %B %Y': full_date,
        '%dth %b %Y': full_date,
        '%dst %B %Y': full_date,
        '%dst %b %Y': full_date,
        '%dnd %B %Y': full_date,
        '%dnd %b %Y': full_date,
        '%drd %B %Y': full_date,
        '%drd %b %Y': full_date,
        # August 9 2018, August 9 18, Jan 1 23, etc.
        '%B %d %Y': full_date,
        '%B %d %y': full_date,
        '%b %d %y': full_date,
        '%b %d %Y': full_date,
    }

    output_list = []
    for date_str in date_list:
        for fmt, out_fmt in DATE_FORMATS.items():
            try:
                dt = datetime.strptime(date_str, fmt)
            except ValueError:
                # This format does not fit; try the next one.
                continue
            output_list.append(out_fmt.format(dt=dt))
            break
        else:
            # No format in the table could parse the string.
            output_list.append(f'INVALID FORMAT: {date_str}')

    return helper_fix_format_date_sf(output_list)
def dates_binding(text):
    '''
    Bind together the subcomponents of the dates identification, while also controlling
    for multiple, or zero, date references.

    Args:
        text: the text in which a single date reference is expected.

    Returns:
        ``[formatted_dates, identified_dates]`` when exactly one date reference is found
        and no invalid-format marker is detected. Otherwise a ``(0, 'DATES', <code>)``
        tuple, where ``<code>`` is ``'no_date'``, ``'more_dates'``, ``'wrong_date_format'``
        or ``'unknown_error'``, so that the caller can prompt back the proper message.
    '''
    try:
        # capture the referred dates
        ident_dates = extract_dates(text)

        # Formats like '05 2018' and '09 05 2018' are both captured, as two separate
        # references. Drop every reference that is contained in another, longer one.
        identified_dates = [
            candidate for candidate in ident_dates
            if not any(candidate in other for other in ident_dates if other != candidate)
        ]

        # zero references: return the appropriate code (to aid returning the correct prompt)
        if not identified_dates:
            return (0, 'DATES', 'no_date')

        # more than one reference: only a single date reference is accepted
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        formatted_dates = convert_dates(identified_dates)

        # a wrong date format is reported with its own code
        # NOTE(review): this membership test finds the marker only when formatted_dates[0]
        # is the marker string itself (or a dict keyed by it) - confirm that convert_dates
        # hands the marker back in that shape.
        if 'INVALID FORMAT' in formatted_dates[0]:
            return (0, 'DATES', 'wrong_date_format')

        return [formatted_dates, identified_dates]
    except Exception:
        # Any unexpected failure maps to the generic error code. Exception (rather than a
        # bare except) lets KeyboardInterrupt and SystemExit propagate.
        return (0, 'DATES', 'unknown_error')