Spaces:

ValadisCERTH
/

NaturalLanguageModule_complete

Runtime error

File size: 12,271 Bytes

import spacy
import re
from word2number import w2n

# Load the spaCy "en_core_web_lg" pipeline once at import time. capture_numbers()
# runs sentences through it and keeps the tokens flagged as number-like.
# NOTE(review): the original comment says this model ships GloVe embeddings --
# confirm against the installed model version.
nlp = spacy.load("en_core_web_lg")


def _remove_fragments(sentence, fragments):
    """Blank out every occurrence of each fragment in *sentence*.

    Fragments are removed longest first, so that removing a short fragment
    cannot break up a longer one that contains it.
    """
    for fragment in sorted(dict.fromkeys(fragments), key=len, reverse=True):
        # an empty fragment would make str.replace insert spaces everywhere
        if fragment:
            sentence = sentence.replace(fragment, " ")
    return sentence


def capture_numbers(input_sentence):
    """
    Capture the numbers referred to in a sentence, in numeric or free-text form.

    Three passes run in order; the text matched by a pass is blanked out of the
    sentence before the next pass, so one number is not reported several times
    (e.g. "seven point five" must not also yield "seven" and "five"):

    1. "<whole> <separator> <fraction>" phrases, where the separator is one of
       decimal / point / dot / comma. Each hit is returned as
       "<whole> <separator> <fraction> $pattern"; the "$pattern" suffix is the
       marker convert_into_numeric looks for.
    2. Multi-word free-text numbers such as "thirty eight" or
       "one million and two" (returned stripped of surrounding whitespace).
    3. Any remaining token that spaCy flags as number-like (token.like_num).

    Returns a list of the captured strings with duplicates removed, in order of
    first appearance (pass 1 hits, then pass 2, then pass 3). Returns 0 if any
    step raises; magnitude_binding turns that into an 'unknown_error' code.
    """

    try:
        # Pass 1: cases like six point five, 5 point five, six point 5, 5 point 5
        pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

        pattern_final_numbers = []
        decimal_fragments = []
        for match in re.finditer(pattern1, input_sentence):
            normalized = "{} {} {}".format(match.group(1), match.group(2), match.group(3))

            # add the $pattern string to easily identify these in a subsequent step
            tagged = "{} {}".format(normalized, '$pattern')
            if tagged not in pattern_final_numbers:
                pattern_final_numbers.append(tagged)

            # The raw match may contain runs of whitespace that the normalized
            # form collapses; remove both spellings from the sentence.
            decimal_fragments.append(match.group(0))
            decimal_fragments.append(normalized)

        input_sentence = _remove_fragments(input_sentence, decimal_fragments)

        # Pass 2: cases like thirty eight or one million and two, etc.
        number_words = (
            "zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|"
            "thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|"
            "thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|"
            "million|billion|trillion"
        )
        pattern2 = (
            r"(?<!\w)(?:(?:" + number_words + r")(?:\s(?:and\s)?(?:" + number_words
            + r"))+\s?)+(?!\w*pennies)"
        )

        raw_multi_numbers = [m.group(0) for m in re.finditer(pattern2, input_sentence)]

        # The pattern can swallow one trailing whitespace character; strip it so
        # the same number is not kept twice just because of where it appeared.
        multinumber_final_numbers = list(
            dict.fromkeys(raw.strip() for raw in raw_multi_numbers)
        )

        input_sentence = _remove_fragments(
            input_sentence, raw_multi_numbers + multinumber_final_numbers
        )

        # Pass 3: whatever is left goes through spaCy, which captures numbers in
        # int and float form as well as single words like eight, two, hundred
        doc = nlp(input_sentence)
        spacy_final_numbers = list(
            dict.fromkeys(token.text for token in doc if token.like_num)
        )

        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

    except Exception:
        # Best-effort boundary: the caller expects the 0 sentinel on any failure.
        return 0


def _parse_number_part(part, word_values, cast):
    """Resolve one side of a '<whole> point <fraction>' phrase to a number.

    Tries, in order: a direct lookup in *word_values*, *cast* (float or int) on
    the raw text, the w2n library (handles "bla bla bla seven"), and finally a
    lone run of digits embedded in the text (handles "bla bla bla 7").

    Returns a (value, digit_text, error) triple. On success *error* is None and
    *digit_text* is the exact digit string the value came from (None when the
    value came from words). On failure *value* is None and *error* is the
    (0, 'MAGNITUDE', <reason>) tuple to hand back to the caller.
    """
    if part in word_values:
        return word_values[part], None, None

    try:
        value = cast(part)
    except (TypeError, ValueError):
        pass
    else:
        digit_text = part if re.fullmatch(r"\d+", part) else None
        return value, digit_text, None

    try:
        return w2n.word_to_num(part), None, None
    except Exception:
        # w2n could not make sense of the text; fall through to the digit scan
        pass

    digit_runs = re.findall(r"\d+", part)

    # exactly one numeric reference is acceptable, anything else is an error
    if len(digit_runs) == 1:
        return int(digit_runs[0]), digit_runs[0], None
    if digit_runs:
        return None, None, (0, 'MAGNITUDE', 'more_magnitude')
    return None, None, (0, 'MAGNITUDE', 'no_magnitude')


def numeric_number_dot_freetext(text):
    """
    Convert phrases like '6 point five', 'six point 5' or '6 point 5' to a float.

    The text must contain "<whole> <separator> <fraction>" where the separator
    is decimal / point / dot / comma. Each side may be digits or free text.

    Returns:
        float: whole + fraction / 10 ** (number of fractional digits). Leading
            zeros of a numeric fraction count as digits, so '6 point 05' is
            6.05 rather than 6.5.
        (0, 'MAGNITUDE', reason): when a side cannot be resolved to exactly one
            number; reason is 'more_magnitude', 'no_magnitude' or
            'unknown_error'.
        0: when the text does not match the expected pattern, or on any
            unexpected failure.
    """

    try:
        # Map of number words to their values
        num_dict = {
            'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
            'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
            'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
            'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
            'eighteen': 18, 'nineteen': 19,
            'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
            'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
            'hundred': 100,
            'thousand': 1000,
            'million': 1000000,
            'billion': 1000000000,
            'trillion': 1000000000000,
        }

        # Extract the part before and the part after the separator word
        pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
        match = re.search(pattern, text)

        if not match:
            # Input text doesn't match the expected pattern
            return 0

        whole, _, error = _parse_number_part(match.group(1), num_dict, float)
        if error is not None:
            return error

        fraction, fraction_digits, error = _parse_number_part(match.group(2), num_dict, int)
        if error is not None:
            return error

        # Scale by the digits as written when the fraction was numeric, so that
        # leading zeros are honoured; otherwise by the digits of its value.
        if fraction_digits is not None:
            width = len(fraction_digits)
        else:
            width = len(str(fraction))

        try:
            return float(whole) + float(fraction) / (10 ** width)
        except (TypeError, ValueError, OverflowError):
            return (0, 'MAGNITUDE', 'unknown_error')

    except Exception:
        # Best-effort boundary (e.g. non-string input): callers expect 0 here.
        return 0


def convert_into_numeric(num_list):
    """
    Convert the single captured number reference into a numeric result.

    Args:
        num_list: list of captured number strings; exactly one is expected.

    Returns:
        {'Number': value} on success. A string that float() already accepts is
        returned unchanged (still as a string); "$pattern"-tagged phrases and
        free-text numbers are returned as numbers.
        (0, 'MAGNITUDE', reason) on failure, where reason is:
            'no_magnitude'   - the list is empty
            'more_magnitude' - more than one entry was captured
            'format_error'   - a comma decimal such as '6,5'
            'unknown_error'  - the entry could not be converted
        An error tuple produced by numeric_number_dot_freetext is passed
        through as is.
    """

    if not num_list:
        return (0, 'MAGNITUDE', 'no_magnitude')

    # Only one number should exist
    if len(num_list) > 1:
        return (0, 'MAGNITUDE', 'more_magnitude')

    target_num = num_list[0]

    # Case it is already an integer or float
    try:
        float(target_num)
    except (TypeError, ValueError):
        pass
    else:
        return {'Number': target_num}

    # Everything below works on text only
    if not isinstance(target_num, str):
        return (0, 'MAGNITUDE', 'unknown_error')

    # Cases like 6,5: a valid number once the comma is a dot, which is reported
    # as a format error rather than silently accepted
    if ',' in target_num:
        try:
            float(target_num.replace(",", "."))
        except ValueError:
            return (0, 'MAGNITUDE', 'unknown_error')
        return (0, 'MAGNITUDE', 'format_error')

    # Freetext/numeric combinations around a separator word
    # (6 point 5, 6 point five, six point 5, six point five)
    if "$pattern" in target_num:
        phrase = target_num.partition("$pattern")[0]
        num_conversion = numeric_number_dot_freetext(phrase)

        # the converter reports its own failures as an error tuple
        if isinstance(num_conversion, tuple):
            return num_conversion

        # a float is always a real result (including 0.0); the int 0 is the
        # converter's "could not parse" sentinel
        if isinstance(num_conversion, float) or num_conversion:
            return {'Number': num_conversion}

        return (0, 'MAGNITUDE', 'unknown_error')

    # Freetext numbers without patterns (e.g. two, million, twenty three, etc)
    try:
        return {'Number': w2n.word_to_num(target_num)}
    except Exception:
        # fall through to the retry below; any failure of the library call
        # counts as "could not convert"
        pass

    # Cases like "million and two" or "a million and two": drop any 'a'
    # reference and put 'one' in front so the w2n library can handle them
    try:
        new_target_num = "one " + target_num.replace(" a ", " ")
        return {'Number': w2n.word_to_num(new_target_num)}
    except Exception:
        return (0, 'MAGNITUDE', 'unknown_error')


def magnitude_binding(input_text):
    """
    Bind together the subcomponents of the magnitude number identification.

    Captures the number references in *input_text* and converts them, accepting
    exactly one reference.

    Returns:
        The result of convert_into_numeric when exactly one reference was
        captured, otherwise a (0, 'MAGNITUDE', reason) code that lets the caller
        pick the right prompt:
            'no_magnitude'   - nothing was captured
            'more_magnitude' - more than one reference was captured
            'unknown_error'  - capture or conversion failed unexpectedly
    """

    try:
        # capture the referred magnitudes
        target_numbers = capture_numbers(input_text)

        # capture_numbers returns 0 rather than a list when it fails
        if not isinstance(target_numbers, (list, tuple)):
            return (0, 'MAGNITUDE', 'unknown_error')

        # in case of zero references
        if not target_numbers:
            return (0, 'MAGNITUDE', 'no_magnitude')

        # in case of more than one reference
        if len(target_numbers) > 1:
            return (0, 'MAGNITUDE', 'more_magnitude')

        # we only accept one magnitude reference
        numeric_target_numbers = convert_into_numeric(target_numbers)

        # never hand a bare None back to the caller
        if numeric_target_numbers is None:
            return (0, 'MAGNITUDE', 'unknown_error')

        return numeric_target_numbers

    except Exception:
        # top-level boundary of the magnitude pipeline: report, do not raise
        return (0, 'MAGNITUDE', 'unknown_error')