# Source: NaturalLanguageModule_complete / magnitudeIdentification.py
# File-viewer residue kept as provenance: uploaded by ValadisCERTH,
# commit 68d674a ("Rename magnitudeIdentification to magnitudeIdentification.py"),
# file size 12.3 kB.
import spacy
import re
from word2number import w2n
# Load the large English spaCy pipeline once, at import time. capture_numbers()
# runs it over the sentence and keeps the tokens flagged as number-like
# (token.like_num).
# NOTE(review): the original comment said this model ships with GloVe
# embeddings - confirm against the installed spaCy/model version.
nlp = spacy.load("en_core_web_lg")
def capture_numbers(input_sentence):
    '''
    Collect every number mentioned in a sentence, written with digits or in words.

    Three passes are made, and whatever a pass captures is cut out of the
    sentence before the next pass so the same number is not reported twice:

    1. decimal phrases such as "six point five", "5 dot five", "six comma 5";
       each is returned as "<left> <separator> <right> $pattern" - the trailing
       "$pattern" marker lets later steps recognise this form;
    2. multi-word free-text numbers such as "thirty eight" or
       "one million and two" (returned exactly as matched);
    3. any remaining token spaCy flags as number-like (digits, floats,
       single number words such as "eight" or "hundred").

    :param input_sentence: the sentence to scan
    :return: list of captured strings (decimal phrases, then multi-word
             numbers, then spaCy tokens; duplicates removed, first-seen order),
             or 0 if anything raised while processing
    '''
    try:
        # "<left> <decimal|point|dot|comma> <right>", where each side is either
        # a digit run or one or more words
        pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

        # normalise each match to single spaces and tag it with $pattern;
        # dict.fromkeys drops duplicates while keeping first-seen order
        pattern_final_numbers = list(dict.fromkeys(
            "{} {} {} {}".format(left, separator, right, '$pattern')
            for left, separator, right in re.findall(pattern1, input_sentence)
        ))

        # Cut the matched spans themselves out of the sentence. If we capture
        # something like "seven point five", spaCy would otherwise also report
        # "seven" and "five". Removing by span (instead of searching for the
        # normalised string) also works when the phrase was typed with tabs or
        # repeated spaces.
        input_sentence = re.sub(pattern1, " ", input_sentence)

        # Multi-word free-text numbers: thirty eight, one million and two, etc.
        number_words = (
            "zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve"
            "|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen"
            "|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety"
            "|hundred|thousand|million|billion|trillion"
        )
        # same expression as before, assembled from a single copy of the word list
        pattern2 = (
            r"(?<!\w)(?:(?:" + number_words + r")"
            r"(?:\s(?:and\s)?(?:" + number_words + r"))+\s?)+(?!\w*pennies)"
        )

        # pattern2 has no capturing groups, so findall yields the full matches
        multinumber_final_numbers = list(dict.fromkeys(re.findall(pattern2, input_sentence)))

        # these references are removed from the sentence as well
        input_sentence = re.sub(pattern2, " ", input_sentence)

        # Parse what is left with spaCy; like_num covers ints, floats and
        # single number words such as eight, two, hundred
        doc = nlp(input_sentence)
        spacy_final_numbers = list(dict.fromkeys(
            token.text for token in doc if token.like_num
        ))

        # return the extracted numbers
        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

    except Exception:
        # Failure sentinel kept as 0 for backward compatibility: callers that
        # take len() of the result hit a TypeError and report an unknown error.
        return 0
# Number words that are resolved directly, without calling word2number.
_NUMBER_WORD_VALUES = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
}


def _resolve_number_part(raw, caster):
    '''
    Resolve one side of an "<integer part> point <decimal part>" phrase.

    Tried in order: the number-word table, a direct cast with ``caster``,
    word2number (handles text such as "bla bla seven"), and finally a single
    digit run embedded in the text (handles text such as "bla bla 7").

    :param raw: the captured text for this side of the phrase
    :param caster: float for the integer part, int for the decimal part
    :return: (value, numeral_text, error). ``numeral_text`` is the literal text
             the value was read from when it was written with digits, else
             None. ``error`` is None on success, otherwise a
             (0, 'MAGNITUDE', <code>) tuple and the other fields are None.
    '''
    if raw in _NUMBER_WORD_VALUES:
        return _NUMBER_WORD_VALUES[raw], None, None

    # a plain numeral converts directly
    try:
        return caster(raw), raw, None
    except (TypeError, ValueError):
        pass

    # free text containing number words
    try:
        return w2n.word_to_num(raw), None, None
    except Exception:
        pass

    # last resort: exactly one digit run somewhere in the text
    digit_runs = re.findall(r'\d+', raw)
    if len(digit_runs) == 1:
        return int(digit_runs[0]), digit_runs[0], None
    if len(digit_runs) > 1:
        return None, None, (0, 'MAGNITUDE', 'more_magnitude')
    return None, None, (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    '''
    Convert a decimal phrase such as "6 point five", "six point 5" or
    "six dot five" into a float.

    :param text: text containing "<integer part> <decimal|point|dot|comma> <decimal part>"
    :return: the float value on success;
             a (0, 'MAGNITUDE', <code>) tuple when one side of the phrase cannot
             be resolved to exactly one number or the final arithmetic fails;
             0 when the text does not contain such a phrase or an unexpected
             error occurs
    '''
    try:
        # each side is either a digit run or one or more words
        pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
        match = re.search(pattern, text)

        if not match:
            # the text does not match the expected pattern
            return 0

        whole, _, error = _resolve_number_part(match.group(1), float)
        if error is not None:
            return error

        fraction, fraction_text, error = _resolve_number_part(match.group(2), int)
        if error is not None:
            return error

        # How many decimal places the second part occupies. When it was written
        # with digits, count the digits as typed so leading zeros survive
        # ("6 point 05" is 6.05, not 6.5); otherwise fall back to the length of
        # the resolved number.
        if fraction_text is not None:
            decimal_places = len(fraction_text)
        else:
            decimal_places = len(str(fraction))

        try:
            return float(whole) + float(fraction) / (10 ** decimal_places)
        except Exception:
            return (0, 'MAGNITUDE', 'unknown_error')

    except Exception:
        return 0
def convert_into_numeric(num_list):
    '''
    Convert the single captured number reference into a numeric value.

    :param num_list: list of captured number strings; exactly one is expected
    :return: {'Number': <int or float>} on success, otherwise an error tuple:
             (0, 'MAGNITUDE', 'no_magnitude')   - empty input
             (0, 'MAGNITUDE', 'more_magnitude') - more than one reference
             (0, 'MAGNITUDE', 'format_error')   - comma used as decimal mark (e.g. 6,5)
             (0, 'MAGNITUDE', 'unknown_error')  - the reference could not be converted
             Error tuples produced while converting a decimal phrase are
             returned unchanged.
    '''
    if not num_list:
        return (0, 'MAGNITUDE', 'no_magnitude')

    # only one number should exist
    if len(num_list) > 1:
        return (0, 'MAGNITUDE', 'more_magnitude')

    target_num = num_list[0]

    # already numeric: nothing to convert
    if isinstance(target_num, (int, float)):
        return {'Number': target_num}

    # Plain numerals: return the converted value (int when the text is an
    # integer, float otherwise) rather than the original string.
    try:
        return {'Number': int(target_num)}
    except (TypeError, ValueError):
        pass
    try:
        return {'Number': float(target_num)}
    except (TypeError, ValueError):
        pass

    # Cases like 6,5: a value that only parses once the comma is read as a
    # decimal point is reported as a format error.
    if ',' in target_num:
        try:
            float(target_num.replace(",", "."))
        except (TypeError, ValueError):
            return (0, 'MAGNITUDE', 'unknown_error')
        return (0, 'MAGNITUDE', 'format_error')

    # Decimal phrases tagged with $pattern (6 point 5, 6 point five, six point 5, ...)
    if "$pattern" in target_num:
        num = target_num.partition("$")[0]
        num_conversion = numeric_number_dot_freetext(num)

        # the helper reports failures as error tuples: hand them back as they
        # are instead of wrapping them as if they were a number
        if isinstance(num_conversion, tuple):
            return num_conversion

        # a float is a real result even when it is 0.0; the helper's
        # "nothing found" sentinel is the integer 0
        if isinstance(num_conversion, float) or num_conversion:
            return {'Number': num_conversion}

        return (0, 'MAGNITUDE', 'unknown_error')

    # Free-text numbers without a decimal pattern (two, million, twenty three, ...)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        pass

    # "million and two" / "a million and two": drop any standalone 'a' and put
    # 'one' in front so that word2number can parse the phrase
    try:
        new_target_num = "one " + target_num.replace(" a ", " ")
        return {'Number': w2n.word_to_num(new_target_num)}
    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')
def magnitude_binding(input_text):
    '''
    Entry point of the magnitude identification: capture the number references
    in the text and convert the result, accepting exactly one reference.

    Zero references give (0, 'MAGNITUDE', 'no_magnitude'), more than one give
    (0, 'MAGNITUDE', 'more_magnitude'), and any unexpected failure gives
    (0, 'MAGNITUDE', 'unknown_error'); these codes help pick the correct prompt.
    With exactly one reference, the outcome of convert_into_numeric is returned.
    '''
    try:
        # collect the referred magnitudes and count them once
        references = capture_numbers(input_text)
        reference_count = len(references)

        # nothing was mentioned
        if reference_count == 0:
            return (0, 'MAGNITUDE', 'no_magnitude')

        # more than one magnitude was mentioned
        if reference_count > 1:
            return (0, 'MAGNITUDE', 'more_magnitude')

        # exactly one reference: convert it
        return convert_into_numeric(references)

    # deliberately catch-all, as before: anything unexpected maps to the
    # generic error code
    except:
        return (0, 'MAGNITUDE', 'unknown_error')