GEM-benchmark · motiwari · Sep 1, 2021 · Sep 18, 2021 · Sep 18, 2021 · Sep 18, 2021
diff --git a/transformations/words_to_numbers/README.md b/transformations/words_to_numbers/README.md
@@ -0,0 +1,45 @@
+# Words to Numbers
+This transformation replaces word forms of numbers with their decimal representations, e.g. "two thousand nine hundred
+and twelve" with "2912". In some sense, this is the reverse transformation of
+https://github.com/GEM-benchmark/NL-Augmenter/pull/39 (though much harder to implement) and is related to
+https://github.com/GEM-benchmark/NL-Augmenter/pull/71.
+
+Author name: Mo Tiwari
+Author email: motiwari@stanford.edu
+Author Affiliation: Stanford University
+
+## What type of transformation is this?
+
+This transformation functions as a perturbation to test robustness to different representations of numbers, either in
+their decimal form or word form.
+
+## What tasks does it intend to benefit?
+This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, 
+text generation, etc. and may deal with numbers written out in word form. 
+
+## Previous Work
+
+Several webpages exist to do this (as the code is fairly simple) but have various errors:
+
+- https://www.browserling.com/tools/words-to-numbers cannot handle capital letters
+- https://www.dcode.fr/writing-words-numbers does not provide source code
+
+Our code is very loosely adapted from
+https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers, though our implementation
+is more general and handles sentences where only part of the sentence refers to a number.
+
+This transformation is the "inverse" transformation of the 
+[number-to-word transformation](https://github.com/GEM-benchmark/NL-Augmenter/blob/main/transformations/number-to-word/transformation.py)
+which converts numerical representations of numbers to their word form and is a much easier transformation to implement.
+
+## What are the limitations of this transformation?
+- Very large numbers (10^66 and above) have special names that are not included here, as they are rarely used in
+everyday language
+- The transformation does not work with mixed-representation numbers, e.g. "140 million"
+- The transformation does not work with unconventionally-formatted numbers, e.g. "one thousand million" in place of 
+"one billion", and assumes a standard formatting like "one million, three hundred thousand, seven hundred forty-two"
+- The transformation may fail in settings where the actual references are ambiguous, e.g. "The numbers five hundred, forty two, and six are even"
+- As an easy extension we could output styled numbers, e.g. "1000000" as "1,000,000"
+
+## Robustness Evaluation
+
diff --git a/transformations/words_to_numbers/__init__.py b/transformations/words_to_numbers/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/words_to_numbers/requirements.txt b/transformations/words_to_numbers/requirements.txt
@@ -0,0 +1 @@
+torchtext>=0.9.1
diff --git a/transformations/words_to_numbers/test.json b/transformations/words_to_numbers/test.json
@@ -0,0 +1,60 @@
+{
+  "type": "words_to_numbers",
+  "test_cases": [
+    {
+      "class": "WordsToNumbers",
+      "inputs": {
+        "sentence": "I have ten cats."
+      },
+      "outputs": [
+        {
+        "sentence": "I have 10 cats."
+        }
+      ]
+    },
+    {
+      "class": "WordsToNumbers",
+      "inputs": {
+        "sentence": "Mo has twelve dogs who eat two hundred pieces of food every day."
+      },
+      "outputs": [
+        {
+        "sentence": "Mo has 12 dogs who eat 200 pieces of food every day."
+        }
+      ]
+    },
+    {
+      "class": "WordsToNumbers",
+      "inputs": {
+        "sentence": "There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States."
+      },
+      "outputs": [
+        {
+        "sentence": "There are 312534672 people in the United States."
+        }
+      ]
+    },
+    {
+      "class": "WordsToNumbers",
+      "inputs": {
+        "sentence": "One vigintillion is a one followed by sixty three zeros."
+      },
+      "outputs": [
+        {
+        "sentence": "1000000000000000000000000000000000000000000000000000000000000000 is 1 followed by 63 zeros."
+        }
+      ]
+    },
+    {
+      "class": "WordsToNumbers",
+      "inputs": {
+        "sentence": "Roughly one hundred forty million people are born each year."
+      },
+      "outputs": [
+        {
+        "sentence": "Roughly 140000000 people are born each year."
+        }
+      ]
+    }
+  ]
+}
diff --git a/transformations/words_to_numbers/text2nums.py b/transformations/words_to_numbers/text2nums.py
@@ -0,0 +1,176 @@
+'''
+Very loosely adapted from https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers
+'''
+
+import re
+
+from .words_to_numbers_constants import units, tens, teens, scales
+
+
+def period_rep(tokens, period_start_loc, period_end_loc):
+    """
+    Parse a "period" of the number corresponding to 3 digits, given a sequence of tokens and the location of the period
+    in that sentence.
+
+    Only tokens[period_start_loc:period_end_loc] is inspected, and it is lower-cased before any lookup.
+
+    Returns a string made of the hundreds digit, the tens digit and the units digit, each taken from the lookup
+    tables and defaulting to '0' when no matching word is present (an empty slice therefore yields '000').
+    If a word from `teens` is present, the tens and units positions are replaced by its two-digit value.
+
+    Raises KeyError if the token in front of "hundred" is not a key of `units`.
+    """
+    str_ = ''
+
+    # --- Hundreds position ---
+    first_digit = '0'
+    tmp_tokens = list(map(lambda x: x.lower(), tokens[period_start_loc:period_end_loc]))
+    if 'hundred' in tmp_tokens:
+        # The token before "hundred" must be the "number of hundreds"
+        # NOTE(review): if "hundred" is the first token of the slice, index -1 wraps around to the slice's last token.
+        hundred_idx = tmp_tokens.index('hundred')
+        first_digit = units[tmp_tokens[hundred_idx - 1]]  # Will raise KeyError if malformed input
+        # NOTE(review): skipping exactly two tokens presumes "X hundred" opens the slice; the checks below are
+        # membership tests on the remaining slice, so they do not depend on exact positions.
+        period_start_loc += 2 # Now, only consider tokens after the "X hundred" in the sequence
+        tmp_tokens = list(map(lambda x: x.lower(), tokens[period_start_loc:period_end_loc])) # start and end locs have changed
+    str_ += first_digit
+
+    # --- Tens position (twenty .. ninety) ---
+    second_digit = '0'
+    for t in tens:
+        if t in tmp_tokens:
+            second_digit = tens[t]
+            period_start_loc += 1 # Now, only consider tokens after the tens quantifier in the sequence
+            tmp_tokens = list(map(lambda x: x.lower(), tokens[period_start_loc:period_end_loc])) # start and end locs have changed
+    str_ += second_digit
+
+    # --- Units position; if several unit words remain, the last match in `units` iteration order wins ---
+    third_digit = '0'
+    for u in units:
+        if u in tmp_tokens:
+            third_digit = units[u]
+            period_start_loc += 1 # Though this is not used currently, leave it here for extensions like "one point six"
+            tmp_tokens = list(map(lambda x: x.lower(), tokens[period_start_loc:period_end_loc])) # start and end locs have changed
+    str_ += third_digit
+
+    # Handle the case of 10 - 19: keep the hundreds digit and overwrite the last two positions
+    for te in teens:
+        if te in tmp_tokens:
+            str_ = str_[0] + teens[te] # Can't do in-place because of 'str' object does not support item assignment
+
+    return str_
+
+
+def is_token_numeric(token):
+    """
+    Decide if a given token is part of the number
+    """
+    return token != ',' and (token in units or token in tens or token in teens or token.rstrip(',') in scales or token == 'hundred')
+
+def find_continugous_number_words(old_tokens):
+    """
+    Given all the tokens of a sentence, find all the phrases that correspond to words.
+    This is necessary because several "word numbers" may be present in a sentence, e.g.
+    'three hundred people went to twenty two events'
+
+    Returns a set of "word numbers" and their corresponding start and end indices in the original token sequence
+    """
+    number_words = []
+    start_idcs = []
+    end_idcs = []
+
+    t_idx = 0
+    new_word = True
+    tokens = list(map(lambda x: x.lower(), old_tokens))
+    while t_idx < len(tokens):
+        if is_token_numeric(tokens[t_idx]):
+            if new_word: # We've found a new "word number"
+                start_idx = t_idx
+                start_idcs.append(t_idx)
+                new_word = False
+        else:
+            if not new_word: # We just completed the "word number"
+                number_words.append(tokens[start_idx:t_idx])
+                end_idcs.append(t_idx)
+            new_word = True
+        t_idx += 1
+
+    return number_words, list(zip(start_idcs, end_idcs))
+
+
+def parse_number_word(number_tokens):
+    """
+    Given a sequence of tokens corresponding to a "word number", converts it to a decimal representation, e.g.
+    'Three thousand five hundred twelve' -> '3512'
+    """
+    word_rep = ' '.join(number_tokens)
+    word_rep = word_rep.replace('-', ' ')
+    word_rep = word_rep.replace(' and ', ' ')
+    tokens = re.split('( |,)', word_rep)
+    tokens = list(filter(lambda x: x != ' ' and x != '', tokens)) # Remove extraneous empty strings and spaces
+
+    num_string = ''
+    last_found_period = None
+
+    # Search the possible period identifiers backwards to look for biggest scale first.
+    # As of Python 3.6, for the CPython implementation of Python, dictionaries maintain insertion order by default.
+    for period in list(scales)[::-1]:
+        if period in tokens:
+            # We found a new period identifier and had an old one that wasn't the one immediately larger than it,
+            # so we need to pad the middle with zeros, e.g. in the number "one billion, one thousand one"
+            if last_found_period is not None:
+                num_string += '0' * 3 * (list(scales).index(last_found_period) - list(scales).index(period) - 1)
+
+            last_found_period = period
+            period_end_loc = tokens.index(period)
+
+            # We need to find the tokens that correspond to the period under identification.
+            # Walk backwards to find comma or non-number word
+            period_start_loc = period_end_loc - 1
+            token = tokens[period_start_loc]
+            # Walk back towards the last seen period identifier like million, billion, etc. and
+            # don't wrap back around around the string
+            while is_token_numeric(token) and token not in scales and period_start_loc != 0:
+                period_start_loc -= 1
+                token = tokens[period_start_loc]
+
+            num_string += period_rep(tokens, period_start_loc, period_end_loc)
+
+    # Handle the corner cases like "one million and twelve"
+    if last_found_period is not None and last_found_period != 'thousand':
+        num_string += '0' * 3 * (list(scales).index(last_found_period))
+
+    # If the last token is not a period identifier, then we have a number less than one thousand
+    if last_found_period is None:
+        num_string += period_rep(tokens, 0, len(tokens))
+    elif tokens.index(last_found_period) != len(scales) - 1:
+        num_string += period_rep(tokens, tokens.index(last_found_period)+1, len(tokens))
+    else:
+        num_string += '0' * scales[last_found_period] # Add right-zeros in the case we had a number like "one million"
+
+    # Trim leading 0s
+    num_string = num_string.lstrip('0')
+
+    return num_string
+
+
+def text2int(sentence):
+    """
+    Given a sentence, find the contiguous subsequences of tokens that correspond to a number.
+    Convert those to their decimal representations, and interlace them with the original sentence.
+    """
+    output_tokens = []
+    original_tokens = sentence.split(" ")
+    number_tokens, idcs = find_continugous_number_words(original_tokens)
+
+    if len(number_tokens) != 0: # We have some numbers to convert
+        number_tokens_counter = 0
+        idx = 0
+        while idx < len(original_tokens):
+            if number_tokens_counter < len(number_tokens) and idx == idcs[number_tokens_counter][0]: # Number to convert
+                output_tokens.append(parse_number_word(number_tokens[number_tokens_counter]))
+                idx = idcs[number_tokens_counter][1] # Skip ahead to the end of the word number
+                number_tokens_counter += 1
+            else: # Keep original tokens
+                output_tokens.append(original_tokens[idx])
+                idx += 1
+    else:
+        output_tokens = original_tokens
+    return ' '.join(output_tokens)
+
+if __name__ == '__main__':
+    print(text2int("I have ten cats."))
+    print(text2int("Mo has twelve dogs who eat two hundred pieces of food every day."))
+    print(text2int("There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States."))
+    print(text2int("One vigintillion is a one followed by sixty three zeros."))
+    print(text2int("Roughly one hundred forty million people are born each year."))
+    print(text2int("One thousand three hundred people went to three million twelve stores and two billion one thousand stores"))
+    print(text2int("There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States and one in every two is female."))
+
diff --git a/transformations/words_to_numbers/transformation.py b/transformations/words_to_numbers/transformation.py
@@ -0,0 +1,24 @@
+from typing import List
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+from .text2nums import *
+
+
+class WordsToNumbers(SentenceOperation):
+    '''
+    Transforms a given sentence that has "word numbers" to their numerical representations, e.g.
+    "I have ten cats" -> "I have 10 cats."
+
+    Inherits from SentenceOperation.
+    '''
+    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION, TaskType.PARAPHRASE_DETECTION, TaskType.TEXTUAL_ENTAILMENT]
+    languages = ["en"]
+    keywords = ["lexical", "rule-based", "written", "highly-meaning-preserving", "high-precision", "low-generations"]
+
+    def __init__(self, seed: int = 0, max_outputs=1) -> None:
+        super().__init__(seed=seed, max_outputs=max_outputs)
+
+    def generate(self, sentence: str) -> List[str]:
+        return [text2int(sentence)]
diff --git a/transformations/words_to_numbers/words_to_numbers_constants.py b/transformations/words_to_numbers/words_to_numbers_constants.py
@@ -0,0 +1,61 @@
+units = {
+        "zero": "0'",
+        "a": "1", # e.g. "a million"
+        "one": "1",
+        "two": "2",
+        "three": "3",
+        "four": "4",
+        "five": "5",
+        "six": "6",
+        "seven": "7",
+        "eight": "8",
+        "nine": "9",
+}
+
+teens = {
+        "ten": "10",
+        "eleven": "11",
+        "twelve": "12",
+        "thirteen": "13",
+        "fourteen": "14",
+        "fifteen": "15",
+        "sixteen": "16",
+        "seventeen": "17",
+        "eighteen": "18",
+        "nineteen": "19",
+    }
+
+tens = {
+    "twenty": '2',
+    "thirty": '3',
+    "forty": '4',
+    "fifty": '5',
+    "sixty": '6',
+    "seventy": '7',
+    "eighty": '8',
+    "ninety": '9',
+}
+
+scales = {
+    "thousand": 3, # 10^3
+    "million": 6, # 10^6
+    "billion": 9, # 10^9
+    "trillion": 12, # 10^12
+    "quadrillion": 15, # 10^15
+    "quintillion": 18, # 10^18
+    "sextillion": 21, # 10^21
+    "septillion": 24, # 10^24
+    "octillion": 27, # 10^27
+    "nonillion": 30, # 10^30
+    "decillion": 33, # 10^33
+    "undecillion": 36, # 10^36
+    "dodecillion": 39, # 10^39
+    "tredecillion": 42, # 10^42
+    "quattuordecillion": 45, # 10^45
+    "quindecillion": 48, # 10^48
+    "sexdecillion": 51, # 10^51
+    "septendecillion": 54, # 10^54
+    "octodecillion": 57, # 10^57
+    "novemdecillion": 60, # 10^60
+    "vigintillion": 63, # 10^63
+}