Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions transformations/words_to_numbers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Words to Numbers
This transformation replaces word forms of numbers with their decimal representations, e.g. "two thousand nine hundred
and twelve" with "2912". It is the reverse of the transformation in
https://github.com/GEM-benchmark/NL-Augmenter/pull/39 (and is considerably harder to implement), and it is related to
https://github.com/GEM-benchmark/NL-Augmenter/pull/71.

Author name: Mo Tiwari
Author email: motiwari@stanford.edu
Author Affiliation: Stanford University

## What type of transformation is this?

This transformation functions as a perturbation to test robustness to different representations of numbers, either in
their decimal form or word form.

## What tasks does it intend to benefit?
This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification,
text generation, etc. and may deal with numbers written out in word form.

## Previous Work

Several webpages exist to do this (as the code is fairly simple) but have various errors:

- https://www.browserling.com/tools/words-to-numbers cannot handle capital letters
- https://www.dcode.fr/writing-words-numbers does not provide source code

Our code is very loosely adapted from
https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers, though our implementation
is more general and handles sentences where only part of the sentence refers to a number.

This transformation is the "inverse" transformation of the
[number-to-word transformation](https://github.com/GEM-benchmark/NL-Augmenter/blob/main/transformations/number-to-word/transformation.py)
which converts numerical representations of numbers to their word form and is a much easier transformation to implement.

## What are the limitations of this transformation?
- Very large numbers (10^66 and above, i.e. beyond the "vigintillion" period) have special names that are not included
here, as they are rarely used in common language
- The transformation does not work with mixed-representation numbers, e.g. "140 million"
- The transformation does not work with unconventionally-formatted numbers, e.g. "one thousand million" in place of
"one billion", and assumes a standard formatting like "one million, three hundred thousand, seven hundred forty-two"
- The transformation may fail in settings where the actual references are ambiguous, e.g. "The numbers five hundred, forty two, and six are even"
- As an easy extension we could output styled numbers, e.g. "1000000" as "1,000,000"

## Robustness Evaluation

1 change: 1 addition & 0 deletions transformations/words_to_numbers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .transformation import *
1 change: 1 addition & 0 deletions transformations/words_to_numbers/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
torchtext>=0.9.1
60 changes: 60 additions & 0 deletions transformations/words_to_numbers/test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
{
"type": "words_to_numbers",
"test_cases": [
{
"class": "WordsToNumbers",
"inputs": {
"sentence": "I have ten cats."
},
"outputs": [
{
"sentence": "I have 10 cats."
}
]
},
{
"class": "WordsToNumbers",
"inputs": {
"sentence": "Mo has twelve dogs who eat two hundred pieces of food every day."
},
"outputs": [
{
"sentence": "Mo has 12 dogs who eat 200 pieces of food every day."
}
]
},
{
"class": "WordsToNumbers",
"inputs": {
"sentence": "There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States."
},
"outputs": [
{
"sentence": "There are 312534672 people in the United States."
}
]
},
{
"class": "WordsToNumbers",
"inputs": {
"sentence": "One vigintillion is a one followed by sixty three zeros."
},
"outputs": [
{
"sentence": "1000000000000000000000000000000000000000000000000000000000000000 is 1 followed by 63 zeros."
}
]
},
{
"class": "WordsToNumbers",
"inputs": {
"sentence": "Roughly one hundred forty million people are born each year."
},
"outputs": [
{
"sentence": "Roughly 140000000 people are born each year."
}
]
}
]
}
176 changes: 176 additions & 0 deletions transformations/words_to_numbers/text2nums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
'''
Very loosely adapted from https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers
'''

import re

from .words_to_numbers_constants import units, tens, teens, scales


def period_rep(tokens, period_start_loc, period_end_loc):
    """
    Build the digit string for one "period" (a hundreds/tens/units group) of a number.

    Only tokens[period_start_loc:period_end_loc] is examined, compared in lower case.
    The window may contain tokens that are not digit words (e.g. a leading comma or
    scale word); those are simply never matched.

    Returns the hundreds digit, tens digit and units digit concatenated, each defaulting
    to '0' when no matching word is present. If a word from `teens` is present, everything
    after the first character is replaced by that teen's value.

    Raises KeyError when the token in front of 'hundred' is not a key of `units`.
    Note that if 'hundred' is the first token of the window, index -1 is used, i.e. the
    last token of the window is looked up instead.
    """
    def window_from(start):
        # Lower-cased view of the tokens still under consideration
        return [tok.lower() for tok in tokens[start:period_end_loc]]

    cursor = period_start_loc
    window = window_from(cursor)

    # Hundreds: the word directly before "hundred" says how many hundreds there are
    hundreds_digit = '0'
    if 'hundred' in window:
        quantifier = window[window.index('hundred') - 1]
        hundreds_digit = units[quantifier]
        cursor += 2  # Advance the window start by two tokens for the "X hundred" pair
        window = window_from(cursor)

    # Tens: every match advances the window start by one token; the last match wins
    tens_digit = '0'
    for word, digit in tens.items():
        if word in window:
            tens_digit = digit
            cursor += 1
            window = window_from(cursor)

    # Units: same scheme as the tens
    units_digit = '0'
    for word, digit in units.items():
        if word in window:
            units_digit = digit
            cursor += 1
            window = window_from(cursor)

    digits = hundreds_digit + tens_digit + units_digit

    # 10 - 19 are single words that supply both the tens and the units digit
    for word, value in teens.items():
        if word in window:
            digits = digits[0] + value

    return digits


def is_token_numeric(token):
    """
    Tell whether a single token can belong to a "word number".

    A token qualifies when it is a key of `units`, `tens` or `teens`, is the word
    'hundred', or is a key of `scales` optionally followed by commas (e.g. 'million,').
    A bare ',' never qualifies. The comparison is case-sensitive.
    """
    if token == ',':
        return False
    if token == 'hundred':
        return True
    if token in units or token in tens or token in teens:
        return True
    # Only scale words are allowed to carry trailing commas
    return token.rstrip(',') in scales

def find_continugous_number_words(old_tokens):
    """
    Given all the tokens of a sentence, find all the contiguous runs of tokens that spell out numbers.
    This is necessary because several "word numbers" may be present in a sentence, e.g.
    'three hundred people went to twenty two events'

    Tokens are lower-cased before being tested with is_token_numeric.

    Returns a pair (number_words, spans):
      - number_words: list of lower-cased token lists, one per "word number"
      - spans: list of (start, end) index pairs into old_tokens, with `end` exclusive

    A run that extends to the very last token of the sentence is included as well
    (its end index is len(old_tokens)).
    """
    number_words = []
    spans = []

    tokens = [token.lower() for token in old_tokens]
    start_idx = None  # Start of the run currently being read, or None when outside a run

    for t_idx, token in enumerate(tokens):
        if is_token_numeric(token):
            if start_idx is None:  # We've found a new "word number"
                start_idx = t_idx
        elif start_idx is not None:  # We just completed the "word number"
            number_words.append(tokens[start_idx:t_idx])
            spans.append((start_idx, t_idx))
            start_idx = None

    # Flush a run that is still open at the end of the sentence; without this a
    # trailing number such as "I have ten" would never be reported.
    if start_idx is not None:
        number_words.append(tokens[start_idx:])
        spans.append((start_idx, len(tokens)))

    return number_words, spans


def parse_number_word(number_tokens):
    """
    Given a sequence of tokens corresponding to a "word number", converts it to a decimal representation, e.g.
    ['three', 'thousand', 'five', 'hundred', 'twelve'] -> '3512'

    The tokens are joined, hyphens and the word ' and ' are replaced by spaces, and the
    result is re-split on spaces and commas (commas are kept as separate tokens, since
    they delimit periods). Scale words are matched case-sensitively against `scales`.

    Returns the number as a string of digits without leading zeros; a phrase whose digits
    are all zero yields '0'.

    Raises KeyError (from period_rep) if the token before 'hundred' is not a units word.
    """
    word_rep = ' '.join(number_tokens)
    word_rep = word_rep.replace('-', ' ')
    word_rep = word_rep.replace(' and ', ' ')
    # The capturing group keeps the separators in the result; drop the spaces and empty strings, keep the commas
    tokens = [tok for tok in re.split('( |,)', word_rep) if tok not in (' ', '')]

    # `scales` is ordered from smallest to largest, so a scale's position is its period rank
    scale_names = list(scales)

    num_string = ''
    last_found_period = None

    # Search the possible period identifiers backwards to look for the biggest scale first.
    for period in reversed(scale_names):
        if period not in tokens:
            continue

        # We found a new period identifier and had an old one that wasn't the one immediately larger than it,
        # so we need to pad the middle with zeros, e.g. in the number "one billion, one thousand one"
        if last_found_period is not None:
            skipped_periods = scale_names.index(last_found_period) - scale_names.index(period) - 1
            num_string += '0' * 3 * skipped_periods

        last_found_period = period
        period_end_loc = tokens.index(period)

        # Walk backwards from the scale word to the start of its period: stop at a comma,
        # a non-number word, the previous scale word, or the beginning of the tokens.
        # Clamp at 0 so a scale word in first position does not index from the end of the list.
        period_start_loc = max(period_end_loc - 1, 0)
        token = tokens[period_start_loc]
        while is_token_numeric(token) and token not in scales and period_start_loc != 0:
            period_start_loc -= 1
            token = tokens[period_start_loc]

        num_string += period_rep(tokens, period_start_loc, period_end_loc)

    if last_found_period is None:
        # No scale word at all, so we have a number less than one thousand
        num_string += period_rep(tokens, 0, len(tokens))
    else:
        # Zero-fill every period between the smallest scale found and the final
        # hundreds/tens/units period, e.g. the thousands in "one million and twelve"
        if last_found_period != 'thousand':
            num_string += '0' * 3 * scale_names.index(last_found_period)
        # Whatever follows the smallest scale word is the final period; period_rep
        # yields '000' when nothing follows (e.g. "one million")
        num_string += period_rep(tokens, tokens.index(last_found_period) + 1, len(tokens))

    # Trim leading 0s, but never return an empty string (e.g. for "zero")
    return num_string.lstrip('0') or '0'


def text2int(sentence):
    """
    Replace every "word number" in a sentence by its decimal representation.

    The sentence is split on single spaces, the contiguous runs of number tokens are
    located, each run is converted with parse_number_word, and the results are stitched
    back between the untouched tokens. The pieces are re-joined with single spaces.
    """
    original_tokens = sentence.split(" ")
    number_tokens, idcs = find_continugous_number_words(original_tokens)

    # Nothing to convert: hand the sentence back as it was tokenized
    if not number_tokens:
        return ' '.join(original_tokens)

    output_tokens = []
    pending = iter(zip(number_tokens, idcs))
    upcoming = next(pending, None)  # Next (words, (start, end)) run still to be emitted

    idx = 0
    while idx < len(original_tokens):
        if upcoming is not None and idx == upcoming[1][0]:
            words, (_, run_end) = upcoming
            output_tokens.append(parse_number_word(words))
            idx = run_end  # Skip ahead to the end of the word number
            upcoming = next(pending, None)
        else:
            # Not part of a number: keep the original token
            output_tokens.append(original_tokens[idx])
            idx += 1

    return ' '.join(output_tokens)

if __name__ == '__main__':
    # Manual smoke test: print the conversion of a few representative sentences
    for demo_sentence in (
        "I have ten cats.",
        "Mo has twelve dogs who eat two hundred pieces of food every day.",
        "There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States.",
        "One vigintillion is a one followed by sixty three zeros.",
        "Roughly one hundred forty million people are born each year.",
        "One thousand three hundred people went to three million twelve stores and two billion one thousand stores",
        "There are three hundred twelve million, five hundred thirty four thousand, six hundred seventy two people in the United States and one in every two is female.",
    ):
        print(text2int(demo_sentence))

24 changes: 24 additions & 0 deletions transformations/words_to_numbers/transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import List

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType

from .text2nums import *


class WordsToNumbers(SentenceOperation):
    """
    Sentence perturbation that rewrites spelled-out numbers as digits, e.g.
    "I have ten cats" -> "I have 10 cats."

    Subclass of SentenceOperation; the conversion itself is delegated to text2int.
    """

    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
        TaskType.PARAPHRASE_DETECTION,
        TaskType.TEXTUAL_ENTAILMENT,
    ]
    languages = ["en"]
    keywords = [
        "lexical",
        "rule-based",
        "written",
        "highly-meaning-preserving",
        "high-precision",
        "low-generations",
    ]

    def __init__(self, seed: int = 0, max_outputs=1) -> None:
        super().__init__(seed=seed, max_outputs=max_outputs)

    def generate(self, sentence: str) -> List[str]:
        # Always exactly one output: the sentence with its word numbers converted
        converted = text2int(sentence)
        return [converted]
61 changes: 61 additions & 0 deletions transformations/words_to_numbers/words_to_numbers_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Words for single digits, mapped to the digit character they contribute.
# Every value must be exactly one character: the values are concatenated
# position by position to build the final number string.
units = {
    "zero": "0",
    "a": "1",  # e.g. "a million"
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
}

# Words for 10 through 19, mapped to their two-character decimal strings.
# The words are listed in ascending order so each value is its position + 10.
teens = {
    word: str(value)
    for value, word in enumerate(
        (
            "ten",
            "eleven",
            "twelve",
            "thirteen",
            "fourteen",
            "fifteen",
            "sixteen",
            "seventeen",
            "eighteen",
            "nineteen",
        ),
        start=10,
    )
}

# Words for the multiples of ten from 20 to 90, mapped to their tens digit.
# The words are listed in ascending order so each digit is its position + 2.
tens = {
    word: str(digit)
    for digit, word in enumerate(
        (
            "twenty",
            "thirty",
            "forty",
            "fifty",
            "sixty",
            "seventy",
            "eighty",
            "ninety",
        ),
        start=2,
    )
}

# Period names mapped to their power of ten: "thousand" -> 3 (10^3), "million" -> 6 (10^6),
# ... up to "vigintillion" -> 63 (10^63). The names are listed from smallest to largest,
# so the dict's insertion order is ascending by scale.
scales = {
    name: 3 * rank
    for rank, name in enumerate(
        (
            "thousand",
            "million",
            "billion",
            "trillion",
            "quadrillion",
            "quintillion",
            "sextillion",
            "septillion",
            "octillion",
            "nonillion",
            "decillion",
            "undecillion",
            "dodecillion",
            "tredecillion",
            "quattuordecillion",
            "quindecillion",
            "sexdecillion",
            "septendecillion",
            "octodecillion",
            "novemdecillion",
            "vigintillion",
        ),
        start=1,
    )
}