diff --git a/transformations/words_to_numbers/README.md b/transformations/words_to_numbers/README.md new file mode 100644 index 000000000..4e98fea5c --- /dev/null +++ b/transformations/words_to_numbers/README.md @@ -0,0 +1,45 @@ +# Words to Numbers +This transformation replaces word forms of numbers with their decimal representations, e.g. "two thousand nine hundred +and twelve" with "2912". In some sense, this is the reverse transformation of +https://github.com/GEM-benchmark/NL-Augmenter/pull/39 (though much harder to implement) and is related to +https://github.com/GEM-benchmark/NL-Augmenter/pull/71. + +Author name: Mo Tiwari +Author email: motiwari@stanford.edu +Author Affiliation: Stanford University + +## What type of transformation is this? + +This transformation functions as a perturbation to test robustness to different representations of numbers, either in +their decimal form or word form. + +## What tasks does it intend to benefit? +This perturbation would benefit all tasks which have a sentence/paragraph/document as input, such as text classification, +text generation, etc., and which may deal with numbers written out in word form. + +## Previous Work + +Several webpages exist to do this (as the code is fairly simple) but have various shortcomings: + +- https://www.browserling.com/tools/words-to-numbers cannot handle capital letters +- https://www.dcode.fr/writing-words-numbers does not provide source code + +Our code is very loosely adapted from +https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers, though our implementation +is more general and handles sentences where only part of the sentence refers to a number. + +This transformation is the "inverse" transformation of the +[number-to-word transformation](https://github.com/GEM-benchmark/NL-Augmenter/blob/main/transformations/number-to-word/transformation.py) +which converts numerical representations of numbers to their word form and is a much easier transformation to implement.
+ +## What are the limitations of this transformation? +- Very large numbers (>10^66) have special names that are not included here as they are likely used rarely in common +language +- The transformation does not work with mixed-representation numbers, e.g. "140 million" +- The transformation does not work with unconventionally-formatted numbers, e.g. "one thousand million" in place of +"one billion", and assumes a standard formatting like "one million, three hundred thousand, seven hundred forty-two" +- The transformation may fail in settings where the actual references are ambiguous, e.g. "The numbers five hundred, forty two, and six are even" +- As an easy extension we could output styled numbers, e.g. "1000000" as "1,000,000" + +## Robustness Evaluation + diff --git a/transformations/words_to_numbers/__init__.py b/transformations/words_to_numbers/__init__.py new file mode 100644 index 000000000..0a79241bb --- /dev/null +++ b/transformations/words_to_numbers/__init__.py @@ -0,0 +1 @@ +from .transformation import * \ No newline at end of file diff --git a/transformations/words_to_numbers/requirements.txt b/transformations/words_to_numbers/requirements.txt new file mode 100644 index 000000000..f3e1b6a75 --- /dev/null +++ b/transformations/words_to_numbers/requirements.txt @@ -0,0 +1 @@ +torchtext>=0.9.1 \ No newline at end of file diff --git a/transformations/words_to_numbers/test.json b/transformations/words_to_numbers/test.json new file mode 100644 index 000000000..6e0634253 --- /dev/null +++ b/transformations/words_to_numbers/test.json @@ -0,0 +1,60 @@ +{ + "type": "words_to_numbers", + "test_cases": [ + { + "class": "WordsToNumbers", + "inputs": { + "sentence": "I have ten cats." + }, + "outputs": [ + { + "sentence": "I have 10 cats." + } + ] + }, + { + "class": "WordsToNumbers", + "inputs": { + "sentence": "Mo has twelve dogs who eat two hundred pieces of food every day." 
+            },
+            "outputs": [
+                {
+                    "sentence": "Mo has 12 dogs who eat 200 pieces of food every day."
+                }
+            ]
+        },
+        {
+            "class": "WordsToNumbers",
+            "inputs": {
+                "sentence": "There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States."
+            },
+            "outputs": [
+                {
+                    "sentence": "There are 312534672 people in the United States."
+                }
+            ]
+        },
+        {
+            "class": "WordsToNumbers",
+            "inputs": {
+                "sentence": "One vigintillion is a one followed by sixty three zeros."
+            },
+            "outputs": [
+                {
+                    "sentence": "1000000000000000000000000000000000000000000000000000000000000000 is 1 followed by 63 zeros."
+                }
+            ]
+        },
+        {
+            "class": "WordsToNumbers",
+            "inputs": {
+                "sentence": "Roughly one hundred forty million people are born each year."
+            },
+            "outputs": [
+                {
+                    "sentence": "Roughly 140000000 people are born each year."
+                }
+            ]
+        }
+    ]
+}
diff --git a/transformations/words_to_numbers/text2nums.py b/transformations/words_to_numbers/text2nums.py
new file mode 100644
index 000000000..b45fc4357
--- /dev/null
+++ b/transformations/words_to_numbers/text2nums.py
@@ -0,0 +1,194 @@
+'''
+Very loosely adapted from https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers
+'''
+
+import re
+
+from .words_to_numbers_constants import units, tens, teens, scales
+
+
+def period_rep(tokens, period_start_loc, period_end_loc):
+    """
+    Parse a "period" of the number corresponding to 3 digits, given a sequence of tokens and the location of the period
+    in that sequence.
+
+    :param tokens: tokens of the "word number"; casing is ignored
+    :param period_start_loc: index of the first token to consider
+    :param period_end_loc: index one past the last token to consider
+    :return: three digits as a string, e.g. '342' for "three hundred forty two"; '000' if the slice is empty
+    :raises KeyError: if the token before "hundred" is not a units word (malformed input)
+    """
+    str_ = ''
+
+    first_digit = '0'
+    tmp_tokens = [tok.lower() for tok in tokens[period_start_loc:period_end_loc]]
+    if 'hundred' in tmp_tokens:
+        # The token before "hundred" must be the "number of hundreds"
+        hundred_idx = tmp_tokens.index('hundred')
+        first_digit = units[tmp_tokens[hundred_idx - 1]]  # Will raise KeyError if malformed input
+        period_start_loc += 2  # Now, only consider tokens after the "X hundred" in the sequence
+        tmp_tokens = [tok.lower() for tok in tokens[period_start_loc:period_end_loc]]  # start loc has changed
+    str_ += first_digit
+
+    second_digit = '0'
+    for t in tens:
+        if t in tmp_tokens:
+            second_digit = tens[t]
+            period_start_loc += 1  # Now, only consider tokens after the tens quantifier in the sequence
+            tmp_tokens = [tok.lower() for tok in tokens[period_start_loc:period_end_loc]]  # start loc has changed
+    str_ += second_digit
+
+    third_digit = '0'
+    for u in units:
+        if u in tmp_tokens:
+            third_digit = units[u]
+            period_start_loc += 1  # Though this is not used currently, leave it here for extensions like "one point six"
+            tmp_tokens = [tok.lower() for tok in tokens[period_start_loc:period_end_loc]]  # start loc has changed
+    str_ += third_digit
+
+    # Handle the case of 10 - 19: the teen word supplies both of the last two digits
+    for te in teens:
+        if te in tmp_tokens:
+            str_ = str_[0] + teens[te]  # Strings are immutable, so rebuild rather than assign in place
+
+    return str_
+
+
+def is_token_numeric(token):
+    """
+    Decide if a given (lowercased) token is part of the number. Scale words may carry a trailing comma, e.g. "million,"
+    """
+    return token != ',' and (token in units or token in tens or token in teens or token.rstrip(',') in scales or token == 'hundred')
+
+
+def find_continugous_number_words(old_tokens):
+    """
+    Given all the tokens of a sentence, find all the phrases that correspond to "word numbers".
+    This is necessary because several "word numbers" may be present in a sentence, e.g.
+    'three hundred people went to twenty two events'
+
+    Returns a list of "word numbers" (lowercased token lists) and a list of their corresponding (start, end) indices
+    in the original token sequence; end is exclusive.
+    """
+    number_words = []
+    start_idcs = []
+    end_idcs = []
+
+    t_idx = 0
+    new_word = True
+    tokens = [tok.lower() for tok in old_tokens]
+    while t_idx < len(tokens):
+        if is_token_numeric(tokens[t_idx]):
+            if new_word:  # We've found a new "word number"
+                start_idx = t_idx
+                start_idcs.append(t_idx)
+                new_word = False
+        else:
+            if not new_word:  # We just completed the "word number"
+                number_words.append(tokens[start_idx:t_idx])
+                end_idcs.append(t_idx)
+                new_word = True
+        t_idx += 1
+
+    # A "word number" that runs to the end of the sentence is never followed by a non-numeric token, so close it here
+    if not new_word:
+        number_words.append(tokens[start_idx:])
+        end_idcs.append(len(tokens))
+
+    return number_words, list(zip(start_idcs, end_idcs))
+
+
+def parse_number_word(number_tokens):
+    """
+    Given a sequence of tokens corresponding to a "word number", converts it to a decimal representation, e.g.
+    'Three thousand five hundred twelve' -> '3512'
+
+    Returns the digits as a string without leading zeros; a value of zero is returned as '0'.
+    """
+    word_rep = ' '.join(number_tokens).lower()  # The scale lookups below are case-sensitive
+    word_rep = word_rep.replace('-', ' ')
+    word_rep = word_rep.replace(' and ', ' ')
+    tokens = re.split('( |,)', word_rep)
+    tokens = list(filter(lambda x: x != ' ' and x != '', tokens))  # Remove extraneous empty strings and spaces
+
+    num_string = ''
+    last_found_period = None
+    scale_names = list(scales)  # Smallest scale ("thousand") first
+
+    # Search the possible period identifiers backwards to look for biggest scale first.
+    # As of Python 3.6, for the CPython implementation of Python, dictionaries maintain insertion order by default.
+    for period in scale_names[::-1]:
+        if period in tokens:
+            # We found a new period identifier and had an old one that wasn't the one immediately larger than it,
+            # so we need to pad the middle with zeros, e.g. in the number "one billion, one thousand one"
+            if last_found_period is not None:
+                num_string += '0' * 3 * (scale_names.index(last_found_period) - scale_names.index(period) - 1)
+
+            last_found_period = period
+            period_end_loc = tokens.index(period)
+
+            # We need to find the tokens that correspond to the period under identification.
+            # Walk backwards to find comma or non-number word
+            period_start_loc = period_end_loc - 1
+            token = tokens[period_start_loc]
+            # Walk back towards the last seen period identifier like million, billion, etc. and
+            # don't wrap back around the string
+            while is_token_numeric(token) and token not in scales and period_start_loc != 0:
+                period_start_loc -= 1
+                token = tokens[period_start_loc]
+
+            num_string += period_rep(tokens, period_start_loc, period_end_loc)
+
+    # Handle the corner cases like "one million and twelve"
+    if last_found_period is not None and last_found_period != 'thousand':
+        num_string += '0' * 3 * scale_names.index(last_found_period)
+
+    # The tokens after the smallest period identifier (or all tokens, if there is none) form the last three digits.
+    # period_rep() returns '000' for an empty slice, which covers numbers like "one million".
+    if last_found_period is None:
+        num_string += period_rep(tokens, 0, len(tokens))
+    else:
+        num_string += period_rep(tokens, tokens.index(last_found_period) + 1, len(tokens))
+
+    # Trim leading 0s, but keep one digit so that "zero" becomes '0' instead of an empty string
+    num_string = num_string.lstrip('0') or '0'
+
+    return num_string
+
+
+def text2int(sentence):
+    """
+    Given a sentence, find the contiguous subsequences of tokens that correspond to a number.
+    Convert those to their decimal representations, and interlace them with the original sentence.
+
+    A phrase that looks numeric but cannot be parsed (e.g. a bare "hundred") is left unchanged.
+    """
+    output_tokens = []
+    original_tokens = sentence.split(" ")
+    number_tokens, idcs = find_continugous_number_words(original_tokens)
+
+    number_tokens_counter = 0
+    idx = 0
+    while idx < len(original_tokens):
+        if number_tokens_counter < len(number_tokens) and idx == idcs[number_tokens_counter][0]:  # Number to convert
+            start, end = idcs[number_tokens_counter]
+            try:
+                output_tokens.append(parse_number_word(number_tokens[number_tokens_counter]))
+            except KeyError:  # Malformed "word number": keep the original tokens instead of crashing
+                output_tokens.extend(original_tokens[start:end])
+            idx = end  # Skip ahead to the end of the word number
+            number_tokens_counter += 1
+        else:  # Keep original tokens
+            output_tokens.append(original_tokens[idx])
+            idx += 1
+    return ' '.join(output_tokens)
+
+
+if __name__ == '__main__':
+    print(text2int("I have ten cats."))
+    print(text2int("Mo has twelve dogs who eat two hundred pieces of food every day."))
+    print(text2int("There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States."))
+    print(text2int("One vigintillion is a one followed by sixty three zeros."))
+    print(text2int("Roughly one hundred forty million people are born each year."))
+    print(text2int("One thousand three hundred people went to three million twelve stores and two billion one thousand stores"))
+    print(text2int("There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States and one in every two is female."))
diff --git a/transformations/words_to_numbers/transformation.py b/transformations/words_to_numbers/transformation.py
new file mode 100644
index 000000000..093412f70
--- /dev/null
+++ b/transformations/words_to_numbers/transformation.py
@@ -0,0 +1,24 @@
+from typing import List
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+from .text2nums import *
+
+
+class WordsToNumbers(SentenceOperation):
+    '''
+    Transforms a given sentence that has "word numbers" to their numerical representations, e.g.
+    "I have ten cats" -> "I have 10 cats."
+
+    Inherits from SentenceOperation.
+    '''
+    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION, TaskType.PARAPHRASE_DETECTION, TaskType.TEXTUAL_ENTAILMENT]
+    languages = ["en"]
+    keywords = ["lexical", "rule-based", "written", "highly-meaning-preserving", "high-precision", "low-generations"]
+
+    def __init__(self, seed: int = 0, max_outputs=1) -> None:
+        super().__init__(seed=seed, max_outputs=max_outputs)
+
+    def generate(self, sentence: str) -> List[str]:
+        return [text2int(sentence)]
diff --git a/transformations/words_to_numbers/words_to_numbers_constants.py b/transformations/words_to_numbers/words_to_numbers_constants.py
new file mode 100644
index 000000000..8047e8489
--- /dev/null
+++ b/transformations/words_to_numbers/words_to_numbers_constants.py
@@ -0,0 +1,61 @@
+units = {
+    "zero": "0",
+    "a": "1",  # e.g. "a million"
+    "one": "1",
+    "two": "2",
+    "three": "3",
+    "four": "4",
+    "five": "5",
+    "six": "6",
+    "seven": "7",
+    "eight": "8",
+    "nine": "9",
+}
+
+teens = {
+    "ten": "10",
+    "eleven": "11",
+    "twelve": "12",
+    "thirteen": "13",
+    "fourteen": "14",
+    "fifteen": "15",
+    "sixteen": "16",
+    "seventeen": "17",
+    "eighteen": "18",
+    "nineteen": "19",
+}
+
+tens = {
+    "twenty": '2',
+    "thirty": '3',
+    "forty": '4',
+    "fifty": '5',
+    "sixty": '6',
+    "seventy": '7',
+    "eighty": '8',
+    "ninety": '9',
+}
+
+scales = {
+    "thousand": 3,  # 10^3
+    "million": 6,  # 10^6
+    "billion": 9,  # 10^9
+    "trillion": 12,  # 10^12
+    "quadrillion": 15,  # 10^15
+    "quintillion": 18,  # 10^18
+    "sextillion": 21,  # 10^21
+    "septillion": 24,  # 10^24
+    "octillion": 27,  # 10^27
+    "nonillion": 30,  # 10^30
+    "decillion": 33,  # 10^33
+    "undecillion": 36,  # 10^36
+    "dodecillion": 39,  # 10^39
+    "tredecillion": 42,  # 10^42
+    "quattuordecillion": 45,  # 10^45
+    "quindecillion": 48,  # 10^48
+    "sexdecillion": 51,  # 10^51
+    "septendecillion": 54,  # 10^54
+    "octodecillion": 57,  # 10^57
+    "novemdecillion": 60,  # 10^60
+    "vigintillion": 63,  # 10^63
+}
\ No newline at end of file