From 8758186047f77d56e095059469bc1e83f5540ede Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Tue, 31 Aug 2021 20:51:58 -0400 Subject: [PATCH 1/9] font change transformation added post-opt --- transformations/font_change/README.md | 36 + transformations/font_change/__init__.py | 1 + transformations/font_change/fonts.json | 2427 +++++++++++++++++ transformations/font_change/requirements.txt | 1 + transformations/font_change/test.json | 71 + transformations/font_change/transformation.py | 122 + 6 files changed, 2658 insertions(+) create mode 100644 transformations/font_change/README.md create mode 100644 transformations/font_change/__init__.py create mode 100644 transformations/font_change/fonts.json create mode 100644 transformations/font_change/requirements.txt create mode 100644 transformations/font_change/test.json create mode 100644 transformations/font_change/transformation.py diff --git a/transformations/font_change/README.md b/transformations/font_change/README.md new file mode 100644 index 000000000..927997b2e --- /dev/null +++ b/transformations/font_change/README.md @@ -0,0 +1,36 @@ +# Hashtagify + +This transformation adds noise to an input sentence by taking named entities and other common words and turning them into hashtags, as often used in social media. + +Authors: [Shahab Raji](mailto:shahab.raji@rutgers.edu) (Rutgers University) and [Gerard de Melo](http://gerard.demelo.org/) (Hasso Plattner Institute / University of Potsdam) + + +## How does the transformation work? + +Hashtagify uses named entity recognition and part-of-speech tagging to turn certain words or phrases in the sentence into hashtags. This transformation converts the sentence into social-media-style text to support the generalizability of NLP models. + +In more detail, Hashtagify identifies named entities, nouns, and verbs and adds the hash character "#" prefix to turn them into hashtags. The hashtags are added to each candidate word according to a fixed probability. 
Stopwords are not hashtagged. Multi-word named entities are handled by removing the spaces and capitalizing the first letter of each word. The syntactic and semantic structure of the sentence is preserved during the transformation. + +Examples: + +``` +New Delhi is among the many famous places in India. +``` + +to + +``` +#NewDelhi is among the many famous places in India. +``` + + +## Target Tasks + +This transformation can be used for augmenting the text in classification and generation tasks. + + +## Limitations + +- Non-neural NER models sometimes fail to identify the named entities correctly. A fine-tuned model based on the input data can be used to improve the performance of the NER model. +- Hashtags are sometimes added to unusual words or based on some trends. + diff --git a/transformations/font_change/__init__.py b/transformations/font_change/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/font_change/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/font_change/fonts.json b/transformations/font_change/fonts.json new file mode 100644 index 000000000..268c4de71 --- /dev/null +++ b/transformations/font_change/fonts.json @@ -0,0 +1,2427 @@ +{ + "normal": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "A", + "B": "B", + "C": "C", + "D": "D", + "E": "E", + "F": "F", + "G": "G", + "H": "H", + "I": "I", + "J": "J", + "K": "K", + "L": "L", + "M": "M", + "N": "N", + "O": "O", + "P": "P", + "Q": "Q", + "R": "R", + "S": "S", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "X", + "Y": "Y", + "Z": "Z", + "[": 
"[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "a", + "b": "b", + "c": "c", + "d": "d", + "e": "e", + "f": "f", + "g": "g", + "h": "h", + "i": "i", + "j": "j", + "k": "k", + "l": "l", + "m": "m", + "n": "n", + "o": "o", + "p": "p", + "q": "q", + "r": "r", + "s": "s", + "t": "t", + "u": "u", + "v": "v", + "w": "w", + "x": "x", + "y": "y", + "z": "z", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "sans": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "\ud835\udfe2", + "1": "\ud835\udfe3", + "2": "\ud835\udfe4", + "3": "\ud835\udfe5", + "4": "\ud835\udfe6", + "5": "\ud835\udfe7", + "6": "\ud835\udfe8", + "7": "\ud835\udfe9", + "8": "\ud835\udfea", + "9": "\ud835\udfeb", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udda0", + "B": "\ud835\udda1", + "C": "\ud835\udda2", + "D": "\ud835\udda3", + "E": "\ud835\udda4", + "F": "\ud835\udda5", + "G": "\ud835\udda6", + "H": "\ud835\udda7", + "I": "\ud835\udda8", + "J": "\ud835\udda9", + "K": "\ud835\uddaa", + "L": "\ud835\uddab", + "M": "\ud835\uddac", + "N": "\ud835\uddad", + "O": "\ud835\uddae", + "P": "\ud835\uddaf", + "Q": "\ud835\uddb0", + "R": "\ud835\uddb1", + "S": "\ud835\uddb2", + "T": "\ud835\uddb3", + "U": "\ud835\uddb4", + "V": "\ud835\uddb5", + "W": "\ud835\uddb6", + "X": "\ud835\uddb7", + "Y": "\ud835\uddb8", + "Z": "\ud835\uddb9", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\uddba", + "b": "\ud835\uddbb", + "c": "\ud835\uddbc", + "d": "\ud835\uddbd", + "e": "\ud835\uddbe", + "f": "\ud835\uddbf", + "g": "\ud835\uddc0", + "h": "\ud835\uddc1", + "i": "\ud835\uddc2", + "j": "\ud835\uddc3", + "k": "\ud835\uddc4", + "l": "\ud835\uddc5", + "m": "\ud835\uddc6", + "n": "\ud835\uddc7", + "o": "\ud835\uddc8", + "p": "\ud835\uddc9", + "q": "\ud835\uddca", 
+ "r": "\ud835\uddcb", + "s": "\ud835\uddcc", + "t": "\ud835\uddcd", + "u": "\ud835\uddce", + "v": "\ud835\uddcf", + "w": "\ud835\uddd0", + "x": "\ud835\uddd1", + "y": "\ud835\uddd2", + "z": "\ud835\uddd3", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "sansBold": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "\ud835\udfec", + "1": "\ud835\udfed", + "2": "\ud835\udfee", + "3": "\ud835\udfef", + "4": "\ud835\udff0", + "5": "\ud835\udff1", + "6": "\ud835\udff2", + "7": "\ud835\udff3", + "8": "\ud835\udff4", + "9": "\ud835\udff5", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\uddd4", + "B": "\ud835\uddd5", + "C": "\ud835\uddd6", + "D": "\ud835\uddd7", + "E": "\ud835\uddd8", + "F": "\ud835\uddd9", + "G": "\ud835\uddda", + "H": "\ud835\udddb", + "I": "\ud835\udddc", + "J": "\ud835\udddd", + "K": "\ud835\uddde", + "L": "\ud835\udddf", + "M": "\ud835\udde0", + "N": "\ud835\udde1", + "O": "\ud835\udde2", + "P": "\ud835\udde3", + "Q": "\ud835\udde4", + "R": "\ud835\udde5", + "S": "\ud835\udde6", + "T": "\ud835\udde7", + "U": "\ud835\udde8", + "V": "\ud835\udde9", + "W": "\ud835\uddea", + "X": "\ud835\uddeb", + "Y": "\ud835\uddec", + "Z": "\ud835\udded", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\uddee", + "b": "\ud835\uddef", + "c": "\ud835\uddf0", + "d": "\ud835\uddf1", + "e": "\ud835\uddf2", + "f": "\ud835\uddf3", + "g": "\ud835\uddf4", + "h": "\ud835\uddf5", + "i": "\ud835\uddf6", + "j": "\ud835\uddf7", + "k": "\ud835\uddf8", + "l": "\ud835\uddf9", + "m": "\ud835\uddfa", + "n": "\ud835\uddfb", + "o": "\ud835\uddfc", + "p": "\ud835\uddfd", + "q": "\ud835\uddfe", + "r": "\ud835\uddff", + "s": "\ud835\ude00", + "t": "\ud835\ude01", + "u": "\ud835\ude02", + "v": "\ud835\ude03", + "w": "\ud835\ude04", + "x": 
"\ud835\ude05", + "y": "\ud835\ude06", + "z": "\ud835\ude07", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "sansItalic": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\ude08", + "B": "\ud835\ude09", + "C": "\ud835\ude0a", + "D": "\ud835\ude0b", + "E": "\ud835\ude0c", + "F": "\ud835\ude0d", + "G": "\ud835\ude0e", + "H": "\ud835\ude0f", + "I": "\ud835\ude10", + "J": "\ud835\ude11", + "K": "\ud835\ude12", + "L": "\ud835\ude13", + "M": "\ud835\ude14", + "N": "\ud835\ude15", + "O": "\ud835\ude16", + "P": "\ud835\ude17", + "Q": "\ud835\ude18", + "R": "\ud835\ude19", + "S": "\ud835\ude1a", + "T": "\ud835\ude1b", + "U": "\ud835\ude1c", + "V": "\ud835\ude1d", + "W": "\ud835\ude1e", + "X": "\ud835\ude1f", + "Y": "\ud835\ude20", + "Z": "\ud835\ude21", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\ude22", + "b": "\ud835\ude23", + "c": "\ud835\ude24", + "d": "\ud835\ude25", + "e": "\ud835\ude26", + "f": "\ud835\ude27", + "g": "\ud835\ude28", + "h": "\ud835\ude29", + "i": "\ud835\ude2a", + "j": "\ud835\ude2b", + "k": "\ud835\ude2c", + "l": "\ud835\ude2d", + "m": "\ud835\ude2e", + "n": "\ud835\ude2f", + "o": "\ud835\ude30", + "p": "\ud835\ude31", + "q": "\ud835\ude32", + "r": "\ud835\ude33", + "s": "\ud835\ude34", + "t": "\ud835\ude35", + "u": "\ud835\ude36", + "v": "\ud835\ude37", + "w": "\ud835\ude38", + "x": "\ud835\ude39", + "y": "\ud835\ude3a", + "z": "\ud835\ude3b", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "sansBoldItalic": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + 
")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\ude3c", + "B": "\ud835\ude3d", + "C": "\ud835\ude3e", + "D": "\ud835\ude3f", + "E": "\ud835\ude40", + "F": "\ud835\ude41", + "G": "\ud835\ude42", + "H": "\ud835\ude43", + "I": "\ud835\ude44", + "J": "\ud835\ude45", + "K": "\ud835\ude46", + "L": "\ud835\ude47", + "M": "\ud835\ude48", + "N": "\ud835\ude49", + "O": "\ud835\ude4a", + "P": "\ud835\ude4b", + "Q": "\ud835\ude4c", + "R": "\ud835\ude4d", + "S": "\ud835\ude4e", + "T": "\ud835\ude4f", + "U": "\ud835\ude50", + "V": "\ud835\ude51", + "W": "\ud835\ude52", + "X": "\ud835\ude53", + "Y": "\ud835\ude54", + "Z": "\ud835\ude55", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\ude56", + "b": "\ud835\ude57", + "c": "\ud835\ude58", + "d": "\ud835\ude59", + "e": "\ud835\ude5a", + "f": "\ud835\ude5b", + "g": "\ud835\ude5c", + "h": "\ud835\ude5d", + "i": "\ud835\ude5e", + "j": "\ud835\ude5f", + "k": "\ud835\ude60", + "l": "\ud835\ude61", + "m": "\ud835\ude62", + "n": "\ud835\ude63", + "o": "\ud835\ude64", + "p": "\ud835\ude65", + "q": "\ud835\ude66", + "r": "\ud835\ude67", + "s": "\ud835\ude68", + "t": "\ud835\ude69", + "u": "\ud835\ude6a", + "v": "\ud835\ude6b", + "w": "\ud835\ude6c", + "x": "\ud835\ude6d", + "y": "\ud835\ude6e", + "z": "\ud835\ude6f", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "monospace": { + "\"": "\"", + "\\": "\\", + " ": "\u2002", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "\ud835\udff6", + "1": "\ud835\udff7", + "2": "\ud835\udff8", + "3": "\ud835\udff9", + "4": "\ud835\udffa", + "5": "\ud835\udffb", + "6": "\ud835\udffc", + "7": 
"\ud835\udffd", + "8": "\ud835\udffe", + "9": "\ud835\udfff", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\ude70", + "B": "\ud835\ude71", + "C": "\ud835\ude72", + "D": "\ud835\ude73", + "E": "\ud835\ude74", + "F": "\ud835\ude75", + "G": "\ud835\ude76", + "H": "\ud835\ude77", + "I": "\ud835\ude78", + "J": "\ud835\ude79", + "K": "\ud835\ude7a", + "L": "\ud835\ude7b", + "M": "\ud835\ude7c", + "N": "\ud835\ude7d", + "O": "\ud835\ude7e", + "P": "\ud835\ude7f", + "Q": "\ud835\ude80", + "R": "\ud835\ude81", + "S": "\ud835\ude82", + "T": "\ud835\ude83", + "U": "\ud835\ude84", + "V": "\ud835\ude85", + "W": "\ud835\ude86", + "X": "\ud835\ude87", + "Y": "\ud835\ude88", + "Z": "\ud835\ude89", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\ude8a", + "b": "\ud835\ude8b", + "c": "\ud835\ude8c", + "d": "\ud835\ude8d", + "e": "\ud835\ude8e", + "f": "\ud835\ude8f", + "g": "\ud835\ude90", + "h": "\ud835\ude91", + "i": "\ud835\ude92", + "j": "\ud835\ude93", + "k": "\ud835\ude94", + "l": "\ud835\ude95", + "m": "\ud835\ude96", + "n": "\ud835\ude97", + "o": "\ud835\ude98", + "p": "\ud835\ude99", + "q": "\ud835\ude9a", + "r": "\ud835\ude9b", + "s": "\ud835\ude9c", + "t": "\ud835\ude9d", + "u": "\ud835\ude9e", + "v": "\ud835\ude9f", + "w": "\ud835\udea0", + "x": "\ud835\udea1", + "y": "\ud835\udea2", + "z": "\ud835\udea3", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "fullwidth": { + "\"": "\"", + "\\": "\uff3c", + " ": "\u3000", + "!": "\uff01", + "#": "\uff03", + "$": "\uff04", + "%": "\uff05", + "&": "\uff06", + "'": "\uff07", + "(": "\uff08", + ")": "\uff09", + "*": "\uff0a", + "+": "\uff0b", + ",": "\uff0c", + "-": "\uff0d", + ".": "\uff0e", + "/": "\uff0f", + "0": "\uff10", + "1": "\uff11", + "2": "\uff12", + "3": "\uff13", + "4": "\uff14", + "5": "\uff15", + "6": "\uff16", + "7": "\uff17", + "8": "\uff18", + "9": "\uff19", + ":": "\uff1a", + ";": "\uff1b", + "<": "<", + "=": "\uff1d", + ">": 
">", + "?": "\uff1f", + "@": "\uff20", + "A": "\uff21", + "B": "\uff22", + "C": "\uff23", + "D": "\uff24", + "E": "\uff25", + "F": "\uff26", + "G": "\uff27", + "H": "\uff28", + "I": "\uff29", + "J": "\uff2a", + "K": "\uff2b", + "L": "\uff2c", + "M": "\uff2d", + "N": "\uff2e", + "O": "\uff2f", + "P": "\uff30", + "Q": "\uff31", + "R": "\uff32", + "S": "\uff33", + "T": "\uff34", + "U": "\uff35", + "V": "\uff36", + "W": "\uff37", + "X": "\uff38", + "Y": "\uff39", + "Z": "\uff3a", + "[": "\uff3b", + "]": "\uff3d", + "^": "\uff3e", + "_": "\uff3f", + "`": "\uff40", + "a": "\uff41", + "b": "\uff42", + "c": "\uff43", + "d": "\uff44", + "e": "\uff45", + "f": "\uff46", + "g": "\uff47", + "h": "\uff48", + "i": "\uff49", + "j": "\uff4a", + "k": "\uff4b", + "l": "\uff4c", + "m": "\uff4d", + "n": "\uff4e", + "o": "\uff4f", + "p": "\uff50", + "q": "\uff51", + "r": "\uff52", + "s": "\uff53", + "t": "\uff54", + "u": "\uff55", + "v": "\uff56", + "w": "\uff57", + "x": "\uff58", + "y": "\uff59", + "z": "\uff5a", + "{": "\uff5b", + "|": "\uff5c", + "}": "\uff5d", + "~": "\uff5e" + }, + "fraktur": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udd04", + "B": "\ud835\udd05", + "C": "\u212d", + "D": "\ud835\udd07", + "E": "\ud835\udd08", + "F": "\ud835\udd09", + "G": "\ud835\udd0a", + "H": "\u210c", + "I": "\u2111", + "J": "\ud835\udd0d", + "K": "\ud835\udd0e", + "L": "\ud835\udd0f", + "M": "\ud835\udd10", + "N": "\ud835\udd11", + "O": "\ud835\udd12", + "P": "\ud835\udd13", + "Q": "\ud835\udd14", + "R": "\u211c", + "S": "\ud835\udd16", + "T": "\ud835\udd17", + "U": "\ud835\udd18", + "V": "\ud835\udd19", + "W": 
"\ud835\udd1a", + "X": "\ud835\udd1b", + "Y": "\ud835\udd1c", + "Z": "\u2128", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udd1e", + "b": "\ud835\udd1f", + "c": "\ud835\udd20", + "d": "\ud835\udd21", + "e": "\ud835\udd22", + "f": "\ud835\udd23", + "g": "\ud835\udd24", + "h": "\ud835\udd25", + "i": "\ud835\udd26", + "j": "\ud835\udd27", + "k": "\ud835\udd28", + "l": "\ud835\udd29", + "m": "\ud835\udd2a", + "n": "\ud835\udd2b", + "o": "\ud835\udd2c", + "p": "\ud835\udd2d", + "q": "\ud835\udd2e", + "r": "\ud835\udd2f", + "s": "\ud835\udd30", + "t": "\ud835\udd31", + "u": "\ud835\udd32", + "v": "\ud835\udd33", + "w": "\ud835\udd34", + "x": "\ud835\udd35", + "y": "\ud835\udd36", + "z": "\ud835\udd37", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "boldFraktur": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udd6c", + "B": "\ud835\udd6d", + "C": "\ud835\udd6e", + "D": "\ud835\udd6f", + "E": "\ud835\udd70", + "F": "\ud835\udd71", + "G": "\ud835\udd72", + "H": "\ud835\udd73", + "I": "\ud835\udd74", + "J": "\ud835\udd75", + "K": "\ud835\udd76", + "L": "\ud835\udd77", + "M": "\ud835\udd78", + "N": "\ud835\udd79", + "O": "\ud835\udd7a", + "P": "\ud835\udd7b", + "Q": "\ud835\udd7c", + "R": "\ud835\udd7d", + "S": "\ud835\udd7e", + "T": "\ud835\udd7f", + "U": "\ud835\udd80", + "V": "\ud835\udd81", + "W": "\ud835\udd82", + "X": "\ud835\udd83", + "Y": "\ud835\udd84", + "Z": "\ud835\udd85", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udd86", + "b": "\ud835\udd87", + "c": "\ud835\udd88", + "d": "\ud835\udd89", + "e": "\ud835\udd8a", + "f": 
"\ud835\udd8b", + "g": "\ud835\udd8c", + "h": "\ud835\udd8d", + "i": "\ud835\udd8e", + "j": "\ud835\udd8f", + "k": "\ud835\udd90", + "l": "\ud835\udd91", + "m": "\ud835\udd92", + "n": "\ud835\udd93", + "o": "\ud835\udd94", + "p": "\ud835\udd95", + "q": "\ud835\udd96", + "r": "\ud835\udd97", + "s": "\ud835\udd98", + "t": "\ud835\udd99", + "u": "\ud835\udd9a", + "v": "\ud835\udd9b", + "w": "\ud835\udd9c", + "x": "\ud835\udd9d", + "y": "\ud835\udd9e", + "z": "\ud835\udd9f", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "serifBold": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "\ud835\udfce", + "1": "\ud835\udfcf", + "2": "\ud835\udfd0", + "3": "\ud835\udfd1", + "4": "\ud835\udfd2", + "5": "\ud835\udfd3", + "6": "\ud835\udfd4", + "7": "\ud835\udfd5", + "8": "\ud835\udfd6", + "9": "\ud835\udfd7", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udc00", + "B": "\ud835\udc01", + "C": "\ud835\udc02", + "D": "\ud835\udc03", + "E": "\ud835\udc04", + "F": "\ud835\udc05", + "G": "\ud835\udc06", + "H": "\ud835\udc07", + "I": "\ud835\udc08", + "J": "\ud835\udc09", + "K": "\ud835\udc0a", + "L": "\ud835\udc0b", + "M": "\ud835\udc0c", + "N": "\ud835\udc0d", + "O": "\ud835\udc0e", + "P": "\ud835\udc0f", + "Q": "\ud835\udc10", + "R": "\ud835\udc11", + "S": "\ud835\udc12", + "T": "\ud835\udc13", + "U": "\ud835\udc14", + "V": "\ud835\udc15", + "W": "\ud835\udc16", + "X": "\ud835\udc17", + "Y": "\ud835\udc18", + "Z": "\ud835\udc19", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udc1a", + "b": "\ud835\udc1b", + "c": "\ud835\udc1c", + "d": "\ud835\udc1d", + "e": "\ud835\udc1e", + "f": "\ud835\udc1f", + "g": "\ud835\udc20", + "h": "\ud835\udc21", + "i": "\ud835\udc22", + "j": "\ud835\udc23", + "k": "\ud835\udc24", + "l": 
"\ud835\udc25", + "m": "\ud835\udc26", + "n": "\ud835\udc27", + "o": "\ud835\udc28", + "p": "\ud835\udc29", + "q": "\ud835\udc2a", + "r": "\ud835\udc2b", + "s": "\ud835\udc2c", + "t": "\ud835\udc2d", + "u": "\ud835\udc2e", + "v": "\ud835\udc2f", + "w": "\ud835\udc30", + "x": "\ud835\udc31", + "y": "\ud835\udc32", + "z": "\ud835\udc33", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "serifItalic": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udc34", + "B": "\ud835\udc35", + "C": "\ud835\udc36", + "D": "\ud835\udc37", + "E": "\ud835\udc38", + "F": "\ud835\udc39", + "G": "\ud835\udc3a", + "H": "\ud835\udc3b", + "I": "\ud835\udc3c", + "J": "\ud835\udc3d", + "K": "\ud835\udc3e", + "L": "\ud835\udc3f", + "M": "\ud835\udc40", + "N": "\ud835\udc41", + "O": "\ud835\udc42", + "P": "\ud835\udc43", + "Q": "\ud835\udc44", + "R": "\ud835\udc45", + "S": "\ud835\udc46", + "T": "\ud835\udc47", + "U": "\ud835\udc48", + "V": "\ud835\udc49", + "W": "\ud835\udc4a", + "X": "\ud835\udc4b", + "Y": "\ud835\udc4c", + "Z": "\ud835\udc4d", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udc4e", + "b": "\ud835\udc4f", + "c": "\ud835\udc50", + "d": "\ud835\udc51", + "e": "\ud835\udc52", + "f": "\ud835\udc53", + "g": "\ud835\udc54", + "h": "\u210e", + "i": "\ud835\udc56", + "j": "\ud835\udc57", + "k": "\ud835\udc58", + "l": "\ud835\udc59", + "m": "\ud835\udc5a", + "n": "\ud835\udc5b", + "o": "\ud835\udc5c", + "p": "\ud835\udc5d", + "q": "\ud835\udc5e", + "r": "\ud835\udc5f", + "s": "\ud835\udc60", + "t": "\ud835\udc61", + "u": "\ud835\udc62", + "v": "\ud835\udc63", + "w": 
"\ud835\udc64", + "x": "\ud835\udc65", + "y": "\ud835\udc66", + "z": "\ud835\udc67", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "serifBoldItalic": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udc68", + "B": "\ud835\udc69", + "C": "\ud835\udc6a", + "D": "\ud835\udc6b", + "E": "\ud835\udc6c", + "F": "\ud835\udc6d", + "G": "\ud835\udc6e", + "H": "\ud835\udc6f", + "I": "\ud835\udc70", + "J": "\ud835\udc71", + "K": "\ud835\udc72", + "L": "\ud835\udc73", + "M": "\ud835\udc74", + "N": "\ud835\udc75", + "O": "\ud835\udc76", + "P": "\ud835\udc77", + "Q": "\ud835\udc78", + "R": "\ud835\udc79", + "S": "\ud835\udc7a", + "T": "\ud835\udc7b", + "U": "\ud835\udc7c", + "V": "\ud835\udc7d", + "W": "\ud835\udc7e", + "X": "\ud835\udc7f", + "Y": "\ud835\udc80", + "Z": "\ud835\udc81", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udc82", + "b": "\ud835\udc83", + "c": "\ud835\udc84", + "d": "\ud835\udc85", + "e": "\ud835\udc86", + "f": "\ud835\udc87", + "g": "\ud835\udc88", + "h": "\ud835\udc89", + "i": "\ud835\udc8a", + "j": "\ud835\udc8b", + "k": "\ud835\udc8c", + "l": "\ud835\udc8d", + "m": "\ud835\udc8e", + "n": "\ud835\udc8f", + "o": "\ud835\udc90", + "p": "\ud835\udc91", + "q": "\ud835\udc92", + "r": "\ud835\udc93", + "s": "\ud835\udc94", + "t": "\ud835\udc95", + "u": "\ud835\udc96", + "v": "\ud835\udc97", + "w": "\ud835\udc98", + "x": "\ud835\udc99", + "y": "\ud835\udc9a", + "z": "\ud835\udc9b", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "doubleStruck": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", 
+ "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "\ud835\udfd8", + "1": "\ud835\udfd9", + "2": "\ud835\udfda", + "3": "\ud835\udfdb", + "4": "\ud835\udfdc", + "5": "\ud835\udfdd", + "6": "\ud835\udfde", + "7": "\ud835\udfdf", + "8": "\ud835\udfe0", + "9": "\ud835\udfe1", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udd38", + "B": "\ud835\udd39", + "C": "\u2102", + "D": "\ud835\udd3b", + "E": "\ud835\udd3c", + "F": "\ud835\udd3d", + "G": "\ud835\udd3e", + "H": "\u210d", + "I": "\ud835\udd40", + "J": "\ud835\udd41", + "K": "\ud835\udd42", + "L": "\ud835\udd43", + "M": "\ud835\udd44", + "N": "\u2115", + "O": "\ud835\udd46", + "P": "\u2119", + "Q": "\u211a", + "R": "\u211d", + "S": "\ud835\udd4a", + "T": "\ud835\udd4b", + "U": "\ud835\udd4c", + "V": "\ud835\udd4d", + "W": "\ud835\udd4e", + "X": "\ud835\udd4f", + "Y": "\ud835\udd50", + "Z": "\u2124", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udd52", + "b": "\ud835\udd53", + "c": "\ud835\udd54", + "d": "\ud835\udd55", + "e": "\ud835\udd56", + "f": "\ud835\udd57", + "g": "\ud835\udd58", + "h": "\ud835\udd59", + "i": "\ud835\udd5a", + "j": "\ud835\udd5b", + "k": "\ud835\udd5c", + "l": "\ud835\udd5d", + "m": "\ud835\udd5e", + "n": "\ud835\udd5f", + "o": "\ud835\udd60", + "p": "\ud835\udd61", + "q": "\ud835\udd62", + "r": "\ud835\udd63", + "s": "\ud835\udd64", + "t": "\ud835\udd65", + "u": "\ud835\udd66", + "v": "\ud835\udd67", + "w": "\ud835\udd68", + "x": "\ud835\udd69", + "y": "\ud835\udd6a", + "z": "\ud835\udd6b", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "script": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + 
"7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udc9c", + "B": "\u212c", + "C": "\ud835\udc9e", + "D": "\ud835\udc9f", + "E": "\u2130", + "F": "\u2131", + "G": "\ud835\udca2", + "H": "\u210b", + "I": "\u2110", + "J": "\ud835\udca5", + "K": "\ud835\udca6", + "L": "\u2112", + "M": "\u2133", + "N": "\ud835\udca9", + "O": "\ud835\udcaa", + "P": "\ud835\udcab", + "Q": "\ud835\udcac", + "R": "\u211b", + "S": "\ud835\udcae", + "T": "\ud835\udcaf", + "U": "\ud835\udcb0", + "V": "\ud835\udcb1", + "W": "\ud835\udcb2", + "X": "\ud835\udcb3", + "Y": "\ud835\udcb4", + "Z": "\ud835\udcb5", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udcb6", + "b": "\ud835\udcb7", + "c": "\ud835\udcb8", + "d": "\ud835\udcb9", + "e": "\u212f", + "f": "\ud835\udcbb", + "g": "\u210a", + "h": "\ud835\udcbd", + "i": "\ud835\udcbe", + "j": "\ud835\udcbf", + "k": "\ud835\udcc0", + "l": "\ud835\udcc1", + "m": "\ud835\udcc2", + "n": "\ud835\udcc3", + "o": "\u2134", + "p": "\ud835\udcc5", + "q": "\ud835\udcc6", + "r": "\ud835\udcc7", + "s": "\ud835\udcc8", + "t": "\ud835\udcc9", + "u": "\ud835\udcca", + "v": "\ud835\udccb", + "w": "\ud835\udccc", + "x": "\ud835\udccd", + "y": "\ud835\udcce", + "z": "\ud835\udccf", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "boldScript": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud835\udcd0", + "B": "\ud835\udcd1", + "C": "\ud835\udcd2", + "D": "\ud835\udcd3", + "E": "\ud835\udcd4", + "F": "\ud835\udcd5", + "G": "\ud835\udcd6", + "H": "\ud835\udcd7", + "I": "\ud835\udcd8", + 
"J": "\ud835\udcd9", + "K": "\ud835\udcda", + "L": "\ud835\udcdb", + "M": "\ud835\udcdc", + "N": "\ud835\udcdd", + "O": "\ud835\udcde", + "P": "\ud835\udcdf", + "Q": "\ud835\udce0", + "R": "\ud835\udce1", + "S": "\ud835\udce2", + "T": "\ud835\udce3", + "U": "\ud835\udce4", + "V": "\ud835\udce5", + "W": "\ud835\udce6", + "X": "\ud835\udce7", + "Y": "\ud835\udce8", + "Z": "\ud835\udce9", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud835\udcea", + "b": "\ud835\udceb", + "c": "\ud835\udcec", + "d": "\ud835\udced", + "e": "\ud835\udcee", + "f": "\ud835\udcef", + "g": "\ud835\udcf0", + "h": "\ud835\udcf1", + "i": "\ud835\udcf2", + "j": "\ud835\udcf3", + "k": "\ud835\udcf4", + "l": "\ud835\udcf5", + "m": "\ud835\udcf6", + "n": "\ud835\udcf7", + "o": "\ud835\udcf8", + "p": "\ud835\udcf9", + "q": "\ud835\udcfa", + "r": "\ud835\udcfb", + "s": "\ud835\udcfc", + "t": "\ud835\udcfd", + "u": "\ud835\udcfe", + "v": "\ud835\udcff", + "w": "\ud835\udd00", + "x": "\ud835\udd01", + "y": "\ud835\udd02", + "z": "\ud835\udd03", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "circled": { + "\"": "\"", + "\\": "\u29b8", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "\u229b", + "+": "\u2295", + ",": ",", + "-": "\u2296", + ".": "\u2a00", + "/": "\u2298", + "0": "\u24ea", + "1": "\u2460", + "2": "\u2461", + "3": "\u2462", + "4": "\u2463", + "5": "\u2464", + "6": "\u2465", + "7": "\u2466", + "8": "\u2467", + "9": "\u2468", + ":": ":", + ";": ";", + "<": "\u29c0", + "=": "\u229c", + ">": "\u29c1", + "?": "?", + "@": "@", + "A": "\u24b6", + "B": "\u24b7", + "C": "\u24b8", + "D": "\u24b9", + "E": "\u24ba", + "F": "\u24bb", + "G": "\u24bc", + "H": "\u24bd", + "I": "\u24be", + "J": "\u24bf", + "K": "\u24c0", + "L": "\u24c1", + "M": "\u24c2", + "N": "\u24c3", + "O": "\u24c4", + "P": "\u24c5", + "Q": "\u24c6", + "R": "\u24c7", + "S": "\u24c8", + "T": "\u24c9", + "U": "\u24ca", + "V": "\u24cb", + 
"W": "\u24cc", + "X": "\u24cd", + "Y": "\u24ce", + "Z": "\u24cf", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\u24d0", + "b": "\u24d1", + "c": "\u24d2", + "d": "\u24d3", + "e": "\u24d4", + "f": "\u24d5", + "g": "\u24d6", + "h": "\u24d7", + "i": "\u24d8", + "j": "\u24d9", + "k": "\u24da", + "l": "\u24db", + "m": "\u24dc", + "n": "\u24dd", + "o": "\u24de", + "p": "\u24df", + "q": "\u24e0", + "r": "\u24e1", + "s": "\u24e2", + "t": "\u24e3", + "u": "\u24e4", + "v": "\u24e5", + "w": "\u24e6", + "x": "\u24e7", + "y": "\u24e8", + "z": "\u24e9", + "{": "{", + "|": "\u29b6", + "}": "}", + "~": "~" + }, + "circledNegative": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "\u24ff", + "1": "\u2776", + "2": "\u2777", + "3": "\u2778", + "4": "\u2779", + "5": "\u277a", + "6": "\u277b", + "7": "\u277c", + "8": "\u277d", + "9": "\u277e", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud83c\udd50", + "B": "\ud83c\udd51", + "C": "\ud83c\udd52", + "D": "\ud83c\udd53", + "E": "\ud83c\udd54", + "F": "\ud83c\udd55", + "G": "\ud83c\udd56", + "H": "\ud83c\udd57", + "I": "\ud83c\udd58", + "J": "\ud83c\udd59", + "K": "\ud83c\udd5a", + "L": "\ud83c\udd5b", + "M": "\ud83c\udd5c", + "N": "\ud83c\udd5d", + "O": "\ud83c\udd5e", + "P": "\ud83c\udd5f", + "Q": "\ud83c\udd60", + "R": "\ud83c\udd61", + "S": "\ud83c\udd62", + "T": "\ud83c\udd63", + "U": "\ud83c\udd64", + "V": "\ud83c\udd65", + "W": "\ud83c\udd66", + "X": "\ud83c\udd67", + "Y": "\ud83c\udd68", + "Z": "\ud83c\udd69", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud83c\udd50", + "b": "\ud83c\udd51", + "c": "\ud83c\udd52", + "d": "\ud83c\udd53", + "e": "\ud83c\udd54", + "f": "\ud83c\udd55", + "g": "\ud83c\udd56", + "h": "\ud83c\udd57", + "i": "\ud83c\udd58", + "j": "\ud83c\udd59", + 
"k": "\ud83c\udd5a", + "l": "\ud83c\udd5b", + "m": "\ud83c\udd5c", + "n": "\ud83c\udd5d", + "o": "\ud83c\udd5e", + "p": "\ud83c\udd5f", + "q": "\ud83c\udd60", + "r": "\ud83c\udd61", + "s": "\ud83c\udd62", + "t": "\ud83c\udd63", + "u": "\ud83c\udd64", + "v": "\ud83c\udd65", + "w": "\ud83c\udd66", + "x": "\ud83c\udd67", + "y": "\ud83c\udd68", + "z": "\ud83c\udd69", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "squared": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud83c\udd30", + "B": "\ud83c\udd31", + "C": "\ud83c\udd32", + "D": "\ud83c\udd33", + "E": "\ud83c\udd34", + "F": "\ud83c\udd35", + "G": "\ud83c\udd36", + "H": "\ud83c\udd37", + "I": "\ud83c\udd38", + "J": "\ud83c\udd39", + "K": "\ud83c\udd3a", + "L": "\ud83c\udd3b", + "M": "\ud83c\udd3c", + "N": "\ud83c\udd3d", + "O": "\ud83c\udd3e", + "P": "\ud83c\udd3f", + "Q": "\ud83c\udd40", + "R": "\ud83c\udd41", + "S": "\ud83c\udd42", + "T": "\ud83c\udd43", + "U": "\ud83c\udd44", + "V": "\ud83c\udd45", + "W": "\ud83c\udd46", + "X": "\ud83c\udd47", + "Y": "\ud83c\udd48", + "Z": "\ud83c\udd49", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud83c\udd30", + "b": "\ud83c\udd31", + "c": "\ud83c\udd32", + "d": "\ud83c\udd33", + "e": "\ud83c\udd34", + "f": "\ud83c\udd35", + "g": "\ud83c\udd36", + "h": "\ud83c\udd37", + "i": "\ud83c\udd38", + "j": "\ud83c\udd39", + "k": "\ud83c\udd3a", + "l": "\ud83c\udd3b", + "m": "\ud83c\udd3c", + "n": "\ud83c\udd3d", + "o": "\ud83c\udd3e", + "p": "\ud83c\udd3f", + "q": "\ud83c\udd40", + "r": "\ud83c\udd41", + "s": "\ud83c\udd42", + "t": "\ud83c\udd43", + "u": "\ud83c\udd44", + "v": 
"\ud83c\udd45", + "w": "\ud83c\udd46", + "x": "\ud83c\udd47", + "y": "\ud83c\udd48", + "z": "\ud83c\udd49", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "squaredNegative": { + "\"": "\"", + "\\": "\u29c5", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "\u29c6", + "+": "\u229e", + ",": ",", + "-": "\u229f", + ".": "\u22a1", + "/": "\u29c4", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\ud83c\udd70", + "B": "\ud83c\udd71", + "C": "\ud83c\udd72", + "D": "\ud83c\udd73", + "E": "\ud83c\udd74", + "F": "\ud83c\udd75", + "G": "\ud83c\udd76", + "H": "\ud83c\udd77", + "I": "\ud83c\udd78", + "J": "\ud83c\udd79", + "K": "\ud83c\udd7a", + "L": "\ud83c\udd7b", + "M": "\ud83c\udd7c", + "N": "\ud83c\udd7d", + "O": "\ud83c\udd7e", + "P": "\ud83c\udd7f", + "Q": "\ud83c\udd80", + "R": "\ud83c\udd81", + "S": "\ud83c\udd82", + "T": "\ud83c\udd83", + "U": "\ud83c\udd84", + "V": "\ud83c\udd85", + "W": "\ud83c\udd86", + "X": "\ud83c\udd87", + "Y": "\ud83c\udd88", + "Z": "\ud83c\udd89", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\ud83c\udd70", + "b": "\ud83c\udd71", + "c": "\ud83c\udd72", + "d": "\ud83c\udd73", + "e": "\ud83c\udd74", + "f": "\ud83c\udd75", + "g": "\ud83c\udd76", + "h": "\ud83c\udd77", + "i": "\ud83c\udd78", + "j": "\ud83c\udd79", + "k": "\ud83c\udd7a", + "l": "\ud83c\udd7b", + "m": "\ud83c\udd7c", + "n": "\ud83c\udd7d", + "o": "\ud83c\udd7e", + "p": "\ud83c\udd7f", + "q": "\ud83c\udd80", + "r": "\ud83c\udd81", + "s": "\ud83c\udd82", + "t": "\ud83c\udd83", + "u": "\ud83c\udd84", + "v": "\ud83c\udd85", + "w": "\ud83c\udd86", + "x": "\ud83c\udd87", + "y": "\ud83c\udd88", + "z": "\ud83c\udd89", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "parenthesized": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": 
"!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "\u2474", + "2": "\u2475", + "3": "\u2476", + "4": "\u2477", + "5": "\u2478", + "6": "\u2479", + "7": "\u247a", + "8": "\u247b", + "9": "\u247c", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "\u249c", + "B": "\u249d", + "C": "\u249e", + "D": "\u249f", + "E": "\u24a0", + "F": "\u24a1", + "G": "\u24a2", + "H": "\u24a3", + "I": "\u24a4", + "J": "\u24a5", + "K": "\u24a6", + "L": "\u24a7", + "M": "\u24a8", + "N": "\u24a9", + "O": "\u24aa", + "P": "\u24ab", + "Q": "\u24ac", + "R": "\u24ad", + "S": "\u24ae", + "T": "\u24af", + "U": "\u24b0", + "V": "\u24b1", + "W": "\u24b2", + "X": "\u24b3", + "Y": "\u24b4", + "Z": "\u24b5", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\u249c", + "b": "\u249d", + "c": "\u249e", + "d": "\u249f", + "e": "\u24a0", + "f": "\u24a1", + "g": "\u24a2", + "h": "\u24a3", + "i": "\u24a4", + "j": "\u24a5", + "k": "\u24a6", + "l": "\u24a7", + "m": "\u24a8", + "n": "\u24a9", + "o": "\u24aa", + "p": "\u24ab", + "q": "\u24ac", + "r": "\u24ad", + "s": "\u24ae", + "t": "\u24af", + "u": "\u24b0", + "v": "\u24b1", + "w": "\u24b2", + "x": "\u24b3", + "y": "\u24b4", + "z": "\u24b5", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "smallCaps": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "A", + "B": "B", + "C": "C", + "D": "D", + "E": "E", + "F": "F", + "G": "G", + "H": "H", + "I": "I", + "J": "J", + "K": "K", + "L": "L", + "M": "M", + "N": 
"N", + "O": "O", + "P": "P", + "Q": "Q", + "R": "R", + "S": "S", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "X", + "Y": "Y", + "Z": "Z", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\u1d00", + "b": "\u0299", + "c": "\u1d04", + "d": "\u1d05", + "e": "\u1d07", + "f": "\ua730", + "g": "\u0262", + "h": "\u029c", + "i": "\u026a", + "j": "\u1d0a", + "k": "\u1d0b", + "l": "\u029f", + "m": "\u1d0d", + "n": "\u0274", + "o": "\u1d0f", + "p": "\u1d29", + "q": "\ua7af", + "r": "\u0280", + "s": "\ua731", + "t": "\u1d1b", + "u": "\u1d1c", + "v": "\u1d20", + "w": "\u1d21", + "x": "x", + "y": "\u028f", + "z": "\u1d22", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "subscript": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "\u208d", + ")": "\u208e", + "*": "*", + "+": "\u208a", + ",": ",", + "-": "\u208b", + ".": ".", + "/": "/", + "0": "\u2080", + "1": "\u2081", + "2": "\u2082", + "3": "\u2083", + "4": "\u2084", + "5": "\u2085", + "6": "\u2086", + "7": "\u2087", + "8": "\u2088", + "9": "\u2089", + ":": ":", + ";": ";", + "<": "<", + "=": "\u208c", + ">": ">", + "?": "?", + "@": "@", + "A": "\u1d00", + "B": "\u0299", + "C": "\u1d04", + "D": "\u1d05", + "E": "\u1d07", + "F": "\ua730", + "G": "\u0262", + "H": "\u029c", + "I": "\u026a", + "J": "\u1d0a", + "K": "\u1d0b", + "L": "\u029f", + "M": "\u1d0d", + "N": "\u0274", + "O": "\u1d0f", + "P": "\u1d18", + "Q": "\ud83c\uddf6", + "R": "\u0280", + "S": "\ua731", + "T": "\u1d1b", + "U": "\u1d1c", + "V": "\u1d20", + "W": "\u1d21", + "X": "x", + "Y": "\u028f", + "Z": "\u1d22", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\u2090", + "b": "\u1d66", + "c": "\ud835\udcb8", + "d": "\ud835\udcb9", + "e": "\u2091", + "f": "\ud835\udcbb", + "g": "\ud835\udcf0", + "h": "\u2095", + "i": "\u1d62", + "j": "\u2c7c", + "k": "\u2096", + "l": "\u2097", + "m": "\u2098", + "n": "\u2099", + "o": "\u2092", + "p": 
"\u209a", + "q": "\u1d69", + "r": "\u1d63", + "s": "\u209b", + "t": "\u209c", + "u": "\u1d64", + "v": "\u1d65", + "w": "\ud835\udccc", + "x": "\u2093", + "y": "\u1d67", + "z": "\ud835\udccf", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "superscript": { + "\"": "\"", + "\\": "\\", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": "\u207d", + ")": "\u207e", + "*": "*", + "+": "\u207a", + ",": ",", + "-": "\u207b", + ".": ".", + "/": "/", + "0": "\u2070", + "1": "\u00b9", + "2": "\u00b2", + "3": "\u00b3", + "4": "\u2074", + "5": "\u2075", + "6": "\u2076", + "7": "\u2077", + "8": "\u2078", + "9": "\u2079", + ":": ":", + ";": ";", + "<": "<", + "=": "\u207c", + ">": ">", + "?": "?", + "@": "@", + "A": "\u1d2c", + "B": "\u1d2e", + "C": "\u1d9c", + "D": "\u1d30", + "E": "\u1d31", + "F": "\u1da0", + "G": "\u1d33", + "H": "\u1d34", + "I": "\u1d35", + "J": "\u1d36", + "K": "\u1d37", + "L": "\u1d38", + "M": "\u1d39", + "N": "\u1d3a", + "O": "\u1d3c", + "P": "\u1d3e", + "Q": "\u1d60", + "R": "\u1d3f", + "S": "\u02e2", + "T": "\u1d40", + "U": "\u1d41", + "V": "\u2c7d", + "W": "\u1d42", + "X": "\u02e3", + "Y": "\u02b8", + "Z": "\u1dbb", + "[": "[", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "\u1d43", + "b": "\u1d47", + "c": "\u1d9c", + "d": "\u1d48", + "e": "\u1d49", + "f": "\u1da0", + "g": "\u1d4d", + "h": "\u02b0", + "i": "\u2071", + "j": "\u02b2", + "k": "\u1d4f", + "l": "\u02e1", + "m": "\u1d50", + "n": "\u207f", + "o": "\u1d52", + "p": "\u1d56", + "q": "\u1d60", + "r": "\u02b3", + "s": "\u02e2", + "t": "\u1d57", + "u": "\u1d58", + "v": "\u1d5b", + "w": "\u02b7", + "x": "\u02e3", + "y": "\u02b8", + "z": "\u1dbb", + "{": "{", + "|": "|", + "}": "}", + "~": "~" + }, + "inverted": { + "\"": "\u201e", + "\\": "\\", + " ": " ", + "!": "\u00a1", + "#": "#", + "$": "$", + "%": "%", + "&": "\u214b", + "'": ",", + "(": ")", + ")": "(", + "*": "*", + "+": "+", + ",": "\u2018", + "-": "-", + ".": "\u02d9", + "/": "/", + 
"0": "0", + "1": "\u0196", + "2": "\u0547", + "3": "\u0190", + "4": "\u152d", + "5": "\u03db", + "6": "9", + "7": "\u2c62", + "8": "8", + "9": "6", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "\u00bf", + "@": "@", + "A": "\u2200", + "B": "\ua4ed", + "C": "\u2183", + "D": "\ua4f7", + "E": "\u018e", + "F": "\u2132", + "G": "\u2141", + "H": "H", + "I": "I", + "J": "\u017f", + "K": "\ua4d8", + "L": "\u2142", + "M": "W", + "N": "N", + "O": "O", + "P": "\u0500", + "Q": "\u1ff8", + "R": "\ua4e4", + "S": "S", + "T": "\u22a5", + "U": "\u2229", + "V": "\ua4e5", + "W": "M", + "X": "X", + "Y": "\u2144", + "Z": "Z", + "[": "]", + "]": "[", + "^": "^", + "_": "\u203e", + "`": "`", + "a": "\u0250", + "b": "q", + "c": "\u0254", + "d": "p", + "e": "\u01dd", + "f": "\u025f", + "g": "\u0183", + "h": "\u0265", + "i": "\u0131", + "j": "\u027e", + "k": "\u029e", + "l": "\u05df", + "m": "\u026f", + "n": "u", + "o": "o", + "p": "d", + "q": "b", + "r": "\u0279", + "s": "s", + "t": "\u0287", + "u": "n", + "v": "\u028c", + "w": "\u028d", + "x": "x", + "y": "\u028e", + "z": "z", + "{": "}", + "|": "|", + "}": "{", + "~": "~" + }, + "mirrored": { + "\"": "\"", + "\\": "/", + " ": " ", + "!": "!", + "#": "#", + "$": "$", + "%": "%", + "&": "&", + "'": "'", + "(": ")", + ")": "(", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "\\", + "0": "0", + "1": "\u07c1", + "2": "\u03c2", + "3": "\u0190", + "4": "\u07c2", + "5": "\u091f", + "6": "\u10db", + "7": "\u0662", + "8": "8", + "9": "\u0b67", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "\u2e2e", + "@": "@", + "A": "A", + "B": "\ua4ed", + "C": "\u2183", + "D": "\ua4f7", + "E": "\u018e", + "F": "\ua7fb", + "G": "\u04d8", + "H": "H", + "I": "I", + "J": "\u10b1", + "K": "\ua4d8", + "L": "\u2143", + "M": "M", + "N": "\u0418", + "O": "O", + "P": "\ua7fc", + "Q": "\u03d8", + "R": "\u042f", + "S": "\ua644", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "X", + "Y": "Y", + "Z": "Z", + 
"[": "]", + "]": "[", + "^": "^", + "_": "_", + "`": "`", + "a": "\u0252", + "b": "d", + "c": "\u2184", + "d": "b", + "e": "\u0258", + "f": "\u0287", + "g": "\u03f1", + "h": "\u029c", + "i": "i", + "j": "\u012f", + "k": "\u029e", + "l": "l", + "m": "m", + "n": "\u1d0e", + "o": "o", + "p": "q", + "q": "p", + "r": "\u1d19", + "s": "\ua645", + "t": "\u0248", + "u": "\u03c5", + "v": "v", + "w": "w", + "x": "x", + "y": "\u03b3", + "z": "z", + "{": "}", + "|": "|", + "}": "{", + "~": "~" + } +} \ No newline at end of file diff --git a/transformations/font_change/requirements.txt b/transformations/font_change/requirements.txt new file mode 100644 index 000000000..de3503dc8 --- /dev/null +++ b/transformations/font_change/requirements.txt @@ -0,0 +1 @@ +nltk==3.6.2 \ No newline at end of file diff --git a/transformations/font_change/test.json b/transformations/font_change/test.json new file mode 100644 index 000000000..1ce3e52d8 --- /dev/null +++ b/transformations/font_change/test.json @@ -0,0 +1,71 @@ +{ + "type": "font_change", + "test_cases": [ + { + "class": "Font_Change", + "inputs": { + "sentence": "Apple is looking at buying U.K. startup for $132 billion." + }, + "outputs": [ + { + "sentence": "Apple is looking at buying U.K. startup for $132 \ud83c\udd71\ud83c\udd78\ud83c\udd7b\ud83c\udd7b\ud83c\udd78\ud83c\udd7e\ud83c\udd7d." + } + ] + }, + { + "class": "Font_Change", + "inputs": { + "sentence": "We had to box part of the pizza to take it home and we were out the door by 6:42." + }, + "outputs": [ + { + "sentence": "We had to box part of the pizza to take it \ud835\ude5d\ud835\ude64\ud835\ude62\ud835\ude5a and we were out the \ud835\udcb9\u2134\u2134\ud835\udcc7 by 6:42." + } + ] + }, + { + "class": "Font_Change", + "inputs": { + "sentence": "The quick brown fox jumps over the lazy dog." + }, + "outputs": [ + { + "sentence": "The quick brown \ud83c\udd75\ud83c\udd7e\ud83c\udd87 \u027en\u026fds over the lazy \u1d05\u1d0f\u0262." 
+ } + ] + }, + { + "class": "Font_Change", + "inputs": { + "sentence": "Mumbai, Bengaluru, New Delhi are among the many famous places in India." + }, + "outputs": [ + { + "sentence": "Mumbai, Bengaluru, New \ud83c\udd73\ud83c\udd74\ud83c\udd7b\ud83c\udd77\ud83c\udd78 are \u0250\u026fou\u0183 the many famous places in I\u0274\u1d05\u026a\u1d00." + } + ] + }, + { + "class": "Font_Change", + "inputs": { + "sentence": "New Delhi is among the many famous places in India." + }, + "outputs": [ + { + "sentence": "New Delhi is among the \ud83c\udd7c\ud83c\udd70\ud83c\udd7d\ud83c\udd88 \u025f\u0250\u026fons places in I\u0274\u1d05\u026a\u1d00." + } + ] + }, + { + "class": "Font_Change", + "inputs": { + "sentence": "Oh, and their spring rolls and the accompanying peanuts and hot sauces were also delicious." + }, + "outputs": [ + { + "sentence": "Oh, and their \ud835\ude68\ud835\ude65\ud835\ude67\ud835\ude5e\ud835\ude63\ud835\ude5c rolls and the accompanying peanuts and hot \ud835\udcc8\ud835\udcb6\ud835\udcca\ud835\udcb8\u212f\ud835\udcc8 were \ud835\udcb6\ud835\udcc1\ud835\udcc8\u2134 delicious." 
+ } + ] + } + ] +} \ No newline at end of file diff --git a/transformations/font_change/transformation.py b/transformations/font_change/transformation.py new file mode 100644 index 000000000..6b5d47c0b --- /dev/null +++ b/transformations/font_change/transformation.py @@ -0,0 +1,122 @@ +import itertools +import json +import os +import random +import re + +from nltk import download as nltkdl +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType + +nltkdl("stopwords") +nltkdl("punkt") + + +# import spacy + +# from nltk import ne_chunk +# from nltk.tag import pos_tag +# from nltk.tree import Tree + +# nltkdl("words") +# nltkdl("maxent_ne_chunker") + +# nltkdl("averaged_perceptron_tagger") + + +# def find_candidates(doc): +# chunked = ne_chunk(pos_tag(word_tokenize(doc.text))) +# candidates = [] +# current_chunk = [] +# for i in chunked: +# if type(i) == tuple: +# if i[1] in [ +# "NN", +# "NNP", +# "NNPS", +# "VB", +# "VBD", +# "VBG", +# "VBN", +# "VBP", +# "VBZ", +# ]: +# if i[0] not in stopwords.words("english"): +# candidates.append(i[0]) +# +# if type(i) == Tree: +# current_chunk.append( +# " ".join([token for token, pos in i.leaves()]) +# ) +# +# if current_chunk: +# named_entity = " ".join(current_chunk) +# if named_entity not in candidates: +# candidates.append(named_entity) +# current_chunk = [] +# else: +# continue +# return candidates + + +def font_change(sentence, fonts, seed=666, max_outputs=1): + random.seed(seed) + perturbed_texts = [] + + for _ in itertools.repeat(None, max_outputs): + # Generate the sentence with randomly distributed fonts + tokens_escaped_list = [] + for token in word_tokenize(sentence): + if token not in stopwords.words("english"): + if token != ".": + tokens_escaped_list.extend( + list(re.finditer(re.escape(token), sentence)) + ) + + transformed_sentence = list(sentence) + + tokens_to_change = random.sample( + 
tokens_escaped_list, + random.randint(1, min(3, len(tokens_escaped_list) - 1)), + ) + for ttc in tokens_to_change: + while True: + font = random.sample(list(fonts.keys()), 1)[0] + if font != "normal": + break + + for i in range(ttc.start(), ttc.end()): + try: + transformed_sentence[i] = fonts[font][ + transformed_sentence[i] + ] + except KeyError: + transformed_sentence[i] = transformed_sentence[i] + + perturbed_texts.append("".join(transformed_sentence)) + return perturbed_texts + + +class Font_Change(SentenceOperation): + tasks = [TaskType.TEXT_CLASSIFICATION] + languages = ["en"] + + def __init__(self, seed=664, max_outputs=1): + super().__init__(seed, max_outputs=max_outputs) + dict_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "fonts.json" + ) + with open(dict_path) as f: + self.fonts = json.load(f) + + def generate(self, sentence: str): + perturbed_texts = font_change( + sentence, + self.fonts, + seed=self.seed, + max_outputs=self.max_outputs, + ) + return perturbed_texts From 5f951e0d67b49f8838dd0d3cb5b48a2fed65df5f Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Tue, 31 Aug 2021 21:08:33 -0400 Subject: [PATCH 2/9] readme and class name changed --- transformations/font_change/README.md | 25 ++++----- transformations/font_change/test.json | 12 ++-- transformations/font_change/transformation.py | 55 ++----------------- 3 files changed, 22 insertions(+), 70 deletions(-) diff --git a/transformations/font_change/README.md b/transformations/font_change/README.md index 927997b2e..352fcd17d 100644 --- a/transformations/font_change/README.md +++ b/transformations/font_change/README.md @@ -1,36 +1,31 @@ # Hashtagify -This transformation add noise to an input sentence by named entities and other common words and turning them into hashtags, as often used in social media. +Inspired by social media posts, this transformation add noise to an input sentence by randomly changing the font of words +in a sentence. 
-Authors: [Shahab Raji](mailto:shahab.raji@rutgers.edu) (Rutgers University) and [Gerard de Melo](http://gerard.demelo.org/) (Hasso Plattner Institute / University of Potsdam) +Authors: [Shahab Raji](mailto:shahab.raji@rutgers.edu) (Rutgers University) and [Gerard de Melo](http://gerard.demelo.org/) +(Hasso Plattner Institute / University of Potsdam) ## How does the transformation work? -Hashtagify uses named entity recognition and part-of-speech tagging to turn certain words or phrases in the sentence into hashtags. This transformation converts the sentence to a social media style text to support the generalizability of NLP models. - -In more detail, Hashtagify identifies named entities, nouns, and verbs and adds the hash character "#" prefix to turn them into hashtags. The hashtags are added to each candidate word according to a fixed probability. Stopwords are not hashtagged. Multi-word named entities are handled by removing the spaces and capitalizing the first letter of each word. The syntactic and semantic structure of the sentence is preserved during the transformation. +Font Change uses the mapping from [𝓾𝓷𝓲𝓬𝓸𝓭𝓮 𝙛𝙤𝙧𝙢𝙖𝙩𝙩𝙚𝙧](https://github.com/DenverCoder1/unicode-formatter) (MIT license) +to change the font of random words in the input sentence. Examples: -``` -New Delhi is among the many famous places in India. -``` +> The quick brown fox jumps over the lazy dog. to -``` -#NewDelhi is among the many famous places in India. -``` - +> The quick brown 🅵🅾🆇 ɾnɯds over the lazy ᴅᴏɢ. ## Target Tasks -This transformation can be used for augmenting the text in classification and generation tasks. +This transformation can be used for augmenting the text in classification task. ## Limitations -- Non-neural NER models sometimes fail to identify the named entities correctly. A fine-tuned model based on the input data can be used to improve the performance of the NER model. -- Hashtags are sometimes added to unusual words or based on some trends. 
+## Previous Work diff --git a/transformations/font_change/test.json b/transformations/font_change/test.json index 1ce3e52d8..957537834 100644 --- a/transformations/font_change/test.json +++ b/transformations/font_change/test.json @@ -2,7 +2,7 @@ "type": "font_change", "test_cases": [ { - "class": "Font_Change", + "class": "FontChange", "inputs": { "sentence": "Apple is looking at buying U.K. startup for $132 billion." }, @@ -13,7 +13,7 @@ ] }, { - "class": "Font_Change", + "class": "FontChange", "inputs": { "sentence": "We had to box part of the pizza to take it home and we were out the door by 6:42." }, @@ -24,7 +24,7 @@ ] }, { - "class": "Font_Change", + "class": "FontChange", "inputs": { "sentence": "The quick brown fox jumps over the lazy dog." }, @@ -35,7 +35,7 @@ ] }, { - "class": "Font_Change", + "class": "FontChange", "inputs": { "sentence": "Mumbai, Bengaluru, New Delhi are among the many famous places in India." }, @@ -46,7 +46,7 @@ ] }, { - "class": "Font_Change", + "class": "FontChange", "inputs": { "sentence": "New Delhi is among the many famous places in India." }, @@ -57,7 +57,7 @@ ] }, { - "class": "Font_Change", + "class": "FontChange", "inputs": { "sentence": "Oh, and their spring rolls and the accompanying peanuts and hot sauces were also delicious." 
}, diff --git a/transformations/font_change/transformation.py b/transformations/font_change/transformation.py index 6b5d47c0b..ee5c3e206 100644 --- a/transformations/font_change/transformation.py +++ b/transformations/font_change/transformation.py @@ -15,59 +15,12 @@ nltkdl("punkt") -# import spacy - -# from nltk import ne_chunk -# from nltk.tag import pos_tag -# from nltk.tree import Tree - -# nltkdl("words") -# nltkdl("maxent_ne_chunker") - -# nltkdl("averaged_perceptron_tagger") - - -# def find_candidates(doc): -# chunked = ne_chunk(pos_tag(word_tokenize(doc.text))) -# candidates = [] -# current_chunk = [] -# for i in chunked: -# if type(i) == tuple: -# if i[1] in [ -# "NN", -# "NNP", -# "NNPS", -# "VB", -# "VBD", -# "VBG", -# "VBN", -# "VBP", -# "VBZ", -# ]: -# if i[0] not in stopwords.words("english"): -# candidates.append(i[0]) -# -# if type(i) == Tree: -# current_chunk.append( -# " ".join([token for token, pos in i.leaves()]) -# ) -# -# if current_chunk: -# named_entity = " ".join(current_chunk) -# if named_entity not in candidates: -# candidates.append(named_entity) -# current_chunk = [] -# else: -# continue -# return candidates - - def font_change(sentence, fonts, seed=666, max_outputs=1): random.seed(seed) perturbed_texts = [] for _ in itertools.repeat(None, max_outputs): - # Generate the sentence with randomly distributed fonts + # Generate the sentence with random fonts tokens_escaped_list = [] for token in word_tokenize(sentence): if token not in stopwords.words("english"): @@ -100,12 +53,16 @@ def font_change(sentence, fonts, seed=666, max_outputs=1): return perturbed_texts -class Font_Change(SentenceOperation): +class FontChange(SentenceOperation): tasks = [TaskType.TEXT_CLASSIFICATION] languages = ["en"] def __init__(self, seed=664, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) + + # mapping tables based on unicode-formatter (MIT license) + # https://github.com/DenverCoder1/unicode-formatter + dict_path = os.path.join( 
os.path.dirname(os.path.abspath(__file__)), "fonts.json" ) From 0c3cca63a4dd85ca4cc5e276f23e9683a237dbff Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Tue, 31 Aug 2021 21:14:02 -0400 Subject: [PATCH 3/9] further changes in README --- transformations/font_change/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/transformations/font_change/README.md b/transformations/font_change/README.md index 352fcd17d..bfb84e755 100644 --- a/transformations/font_change/README.md +++ b/transformations/font_change/README.md @@ -27,5 +27,3 @@ This transformation can be used for augmenting the text in classification task. ## Limitations -## Previous Work - From cd61e02a5a445d86eebc80e9c9d439a9a1927987 Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Tue, 31 Aug 2021 21:25:21 -0400 Subject: [PATCH 4/9] further changes in README --- transformations/font_change/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/transformations/font_change/README.md b/transformations/font_change/README.md index bfb84e755..e65976ec6 100644 --- a/transformations/font_change/README.md +++ b/transformations/font_change/README.md @@ -24,6 +24,3 @@ to This transformation can be used for augmenting the text in classification task. - -## Limitations - From 599284ecd8ea7ddf1cd04f82f1b9337ac07d914a Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Wed, 1 Sep 2021 15:04:00 -0400 Subject: [PATCH 5/9] Even further changes in README --- transformations/font_change/README.md | 50 +++++++++---------- transformations/font_change/transformation.py | 2 +- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/transformations/font_change/README.md b/transformations/font_change/README.md index e65976ec6..5c4954c2e 100644 --- a/transformations/font_change/README.md +++ b/transformations/font_change/README.md @@ -1,26 +1,24 @@ -# Hashtagify - -Inspired by social media posts, this transformation add noise to an input sentence by randomly changing the font of words -in a sentence. 
- -Authors: [Shahab Raji](mailto:shahab.raji@rutgers.edu) (Rutgers University) and [Gerard de Melo](http://gerard.demelo.org/) -(Hasso Plattner Institute / University of Potsdam) - - -## How does the transformation work? - -Font Change uses the mapping from [𝓾𝓷𝓲𝓬𝓸𝓭𝓮 𝙛𝙤𝙧𝙢𝙖𝙩𝙩𝙚𝙧](https://github.com/DenverCoder1/unicode-formatter) (MIT license) -to change the font of random words in the input sentence. - -Examples: - -> The quick brown fox jumps over the lazy dog. - -to - -> The quick brown 🅵🅾🆇 ɾnɯds over the lazy ᴅᴏɢ. - -## Target Tasks - -This transformation can be used for augmenting the text in classification task. - +# Font Change + +The Font Change transformation modifies words in the input to have a stylized appearance using suitable Unicode characters, as often encountered in social media posts. + +Authors: [Shahab Raji](mailto:shahab.raji@rutgers.edu) (Rutgers University) and [Gerard de Melo](http://gerard.demelo.org/) +(Hasso Plattner Institute / University of Potsdam) + + +## How does the transformation work? + +Font Change adapts the appearance of randomly selected words in the input sentence. For each selected word, one of several possible appearance changes is chosen randomly. Such changes are achieved using Unicode characters based on mapping tables from the [𝓾𝓷𝓲𝓬𝓸𝓭𝓮 𝙛𝙤𝙧𝙢𝙖𝙩𝙩𝙚𝙧](https://github.com/DenverCoder1/unicode-formatter) (MIT license) tool. + +Examples: + +> The quick brown fox jumps over the lazy dog. + +to + +> The quick brown 🅵🅾🆇 ɾnɯds over the lazy ᴅᴏɢ. + +## Target Tasks + +This transformation can be used for data augmentation in text classification tasks. 
+ diff --git a/transformations/font_change/transformation.py b/transformations/font_change/transformation.py index ee5c3e206..6c20aaf48 100644 --- a/transformations/font_change/transformation.py +++ b/transformations/font_change/transformation.py @@ -60,7 +60,7 @@ class FontChange(SentenceOperation): def __init__(self, seed=664, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) - # mapping tables based on unicode-formatter (MIT license) + # Mapping tables based on unicode-formatter (MIT license) # https://github.com/DenverCoder1/unicode-formatter dict_path = os.path.join( From 2b2f60ccc4c52f237c68568656c4e06f2fe49c2f Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Sat, 18 Sep 2021 05:06:52 -0400 Subject: [PATCH 6/9] changes after the reviews --- transformations/font_change/transformation.py | 1040 ++++++++++++++++- 1 file changed, 1038 insertions(+), 2 deletions(-) diff --git a/transformations/font_change/transformation.py b/transformations/font_change/transformation.py index 6c20aaf48..089614ec6 100644 --- a/transformations/font_change/transformation.py +++ b/transformations/font_change/transformation.py @@ -54,8 +54,1044 @@ def font_change(sentence, fonts, seed=666, max_outputs=1): class FontChange(SentenceOperation): - tasks = [TaskType.TEXT_CLASSIFICATION] - languages = ["en"] + tasks = [ + TaskType.TEXT_CLASSIFICATION, + TaskType.TEXT_TAGGING, + ] + languages = [ + "aa", + "aai", + "aak", + "aau", + "abi", + "abr", + "abt", + "aby", + "acd", + "ace", + "ach", + "ada", + "ade", + "adj", + "adz", + "aey", + "af", + "agc", + "agd", + "agg", + "agm", + "ago", + "agq", + "aha", + "ahl", + "ajg", + "ak", + "ala", + "ali", + "aln", + "amm", + "amn", + "amo", + "amp", + "an", + "anc", + "ank", + "ann", + "any", + "aoj", + "aom", + "aoz", + "ape", + "apr", + "aps", + "apz", + "arh", + "arn", + "aro", + "asa", + "asg", + "aso", + "ast", + "ata", + "atg", + "atj", + "auy", + "avn", + "avt", + "avu", + "awb", + "awo", + "awx", + "ay", + "ayb", + "az", + 
"ban", + "bar", + "bas", + "bav", + "bba", + "bbb", + "bbc", + "bbd", + "bbj", + "bbp", + "bbr", + "bcf", + "bch", + "bci", + "bcm", + "bcn", + "bco", + "bcu", + "bdd", + "bef", + "beh", + "bem", + "bet", + "bew", + "bex", + "bez", + "bfd", + "bhg", + "bhl", + "bhy", + "bi", + "bib", + "big", + "bik", + "bim", + "bin", + "bio", + "biq", + "bjh", + "bjn", + "bjo", + "bjr", + "bjt", + "bjz", + "bkc", + "bkm", + "bkq", + "bku", + "bkv", + "bm", + "bmh", + "bmk", + "bmq", + "bmu", + "bng", + "bnm", + "bnp", + "boj", + "bom", + "bon", + "bqc", + "bqp", + "bqv", + "br", + "brz", + "bs", + "bsj", + "bss", + "bto", + "btt", + "buc", + "bud", + "bug", + "buk", + "bum", + "buo", + "bus", + "buu", + "bvb", + "bwd", + "bwr", + "bxh", + "bye", + "byr", + "bys", + "byv", + "byx", + "bza", + "bze", + "bzf", + "bzh", + "bzw", + "ca", + "cad", + "can", + "cbj", + "cch", + "ceb", + "cfa", + "cgg", + "ch", + "chk", + "cho", + "chp", + "cic", + "cjv", + "ckl", + "cko", + "cky", + "cla", + "cme", + "co", + "cps", + "crs", + "cs", + "csb", + "cy", + "da", + "dad", + "daf", + "dag", + "dah", + "dak", + "dav", + "dbd", + "dbq", + "ddn", + "de", + "ded", + "den", + "dga", + "dgh", + "dgi", + "dgr", + "dgz", + "dia", + "dje", + "dnj", + "dob", + "dop", + "dow", + "dri", + "dsb", + "dtm", + "dtp", + "dts", + "dua", + "duc", + "dud", + "dug", + "dva", + "dww", + "dyo", + "dyu", + "dzg", + "ebu", + "ee", + "efi", + "egl", + "eka", + "ema", + "emi", + "en", + "enn", + "enq", + "eo", + "eri", + "es", + "esu", + "et", + "etr", + "etu", + "etx", + "eu", + "ewo", + "ext", + "faa", + "fab", + "fag", + "fai", + "fan", + "ff", + "ffi", + "ffm", + "fi", + "fil", + "fit", + "fj", + "flr", + "fmp", + "fo", + "fod", + "fon", + "for", + "fpe", + "fqs", + "fr", + "frc", + "frp", + "frr", + "frs", + "fud", + "fue", + "fuf", + "fuh", + "fuq", + "fur", + "fuv", + "fuy", + "fvr", + "fy", + "ga", + "gaa", + "gaf", + "gag", + "gah", + "gaj", + "gam", + "gaw", + "gay", + "gba", + "gbf", + "gby", + "gcr", + "gd", + 
"gde", + "gdn", + "gdr", + "geb", + "gej", + "gel", + "gfk", + "ghs", + "gil", + "gim", + "gjn", + "gkn", + "gkp", + "gl", + "gmm", + "gn", + "gnd", + "gng", + "god", + "goi", + "gor", + "gos", + "grb", + "grw", + "gsw", + "gub", + "guc", + "gud", + "gur", + "guw", + "gux", + "guz", + "gv", + "gvf", + "gvs", + "gwi", + "gyi", + "ha", + "hag", + "ham", + "haw", + "hbb", + "hhy", + "hi-Latn", + "hia", + "hif", + "hig", + "hih", + "hil", + "hla", + "hmt", + "hnn", + "ho", + "hot", + "hr", + "hsb", + "ht", + "hu", + "hui", + "hz", + "ia", + "ian", + "iar", + "iba", + "ibb", + "iby", + "ica", + "ich", + "id", + "idd", + "idi", + "idu", + "ife", + "ig", + "igb", + "ige", + "ijj", + "ik", + "ikk", + "ikt", + "ikw", + "ikx", + "ilo", + "imo", + "in", + "io", + "iou", + "iri", + "is", + "it", + "iwm", + "iws", + "izh", + "izi", + "jab", + "jam", + "jbo", + "jbu", + "jen", + "jgk", + "jgo", + "jib", + "jmc", + "jra", + "jut", + "jv", + "jw", + "kab", + "kac", + "kad", + "kai", + "kaj", + "kam", + "kao", + "kbm", + "kbp", + "kbq", + "kbx", + "kcg", + "kck", + "kcl", + "kct", + "kde", + "kdl", + "kea", + "ken", + "kez", + "kfo", + "kg", + "kge", + "kgf", + "kgp", + "kha", + "khq", + "khs", + "khz", + "ki", + "kij", + "kiu", + "kiw", + "kj", + "kjd", + "kjs", + "kjy", + "kkc", + "kkj", + "kl", + "kln", + "klq", + "klt", + "klx", + "kmb", + "kmh", + "kmo", + "kms", + "kmu", + "kmw", + "knf", + "knp", + "kol", + "kos", + "koz", + "kpe", + "kpf", + "kpo", + "kpr", + "kpx", + "kqb", + "kqf", + "kqs", + "kr", + "kri", + "krj", + "krl", + "krs", + "ksb", + "ksd", + "ksf", + "ksh", + "ksj", + "ksr", + "ktm", + "kto", + "ktr", + "ku", + "kub", + "kud", + "kue", + "kuj", + "kun", + "kup", + "kus", + "kvg", + "kvr", + "kw", + "kwj", + "kwo", + "kwq", + "kxa", + "kxe", + "kxw", + "kxz", + "ky-Latn", + "ky-TR", + "kye", + "kyx", + "kzj", + "kzr", + "kzt", + "la", + "lag", + "laj", + "las", + "lb", + "lbu", + "lbw", + "lcm", + "ldb", + "led", + "lee", + "lem", + "leq", + "leu", + "lg", + 
"lgg", + "li", + "lia", + "lid", + "lig", + "lih", + "lij", + "ljp", + "lkt", + "lle", + "lln", + "lmo", + "lmp", + "ln", + "lns", + "lnu", + "loj", + "lok", + "lol", + "lor", + "los", + "loz", + "lt", + "ltg", + "lu", + "lua", + "luo", + "luy", + "lv", + "lzz", + "mad", + "maf", + "mak", + "man", + "mas", + "maw", + "maz", + "mbh", + "mbo", + "mbq", + "mbu", + "mbw", + "mci", + "mcp", + "mcq", + "mcr", + "mcu", + "mda", + "mdh", + "mdj", + "mdr", + "med", + "mee", + "mek", + "men", + "mer", + "met", + "meu", + "mfe", + "mfn", + "mfo", + "mfq", + "mg", + "mgh", + "mgl", + "mgo", + "mgy", + "mh", + "mhi", + "mhl", + "mi", + "mif", + "min", + "miw", + "mkl", + "mkp", + "mkw", + "mle", + "mlp", + "mls", + "mmo", + "mmu", + "mmx", + "mna", + "mnf", + "mo", + "moa", + "moe", + "moh", + "mos", + "mox", + "mpp", + "mps", + "mpt", + "mpx", + "mql", + "ms", + "ms-ID", + "mt", + "mtc", + "mtf", + "mti", + "mua", + "mur", + "mus", + "mva", + "mvn", + "mwk", + "mwv", + "mxc", + "mxm", + "myk", + "myw", + "myx", + "mzk", + "mzm", + "mzp", + "mzw", + "mzz", + "na", + "nac", + "naf", + "nak", + "nap", + "naq", + "nas", + "nb", + "nca", + "nce", + "ncf", + "nch", + "nco", + "ncu", + "nd", + "ndc", + "nds", + "neb", + "nex", + "nfr", + "ng", + "nga", + "ngb", + "ngl", + "nhb", + "nhe", + "nhw", + "nif", + "nii", + "nij", + "nin", + "niu", + "niy", + "niz", + "njo", + "nkg", + "nko", + "nl", + "nmg", + "nmz", + "nn", + "nnf", + "nnh", + "nnk", + "nnm", + "no", + "nop", + "nou", + "nr", + "nrb", + "nsn", + "nso", + "nss", + "ntm", + "ntr", + "nui", + "nup", + "nus", + "nuv", + "nux", + "nv", + "nwb", + "nxq", + "nxr", + "ny", + "nym", + "nyn", + "nzi", + "oc", + "ogc", + "okr", + "okv", + "om", + "ong", + "onn", + "ons", + "opm", + "oro", + "ozm", + "pag", + "pam", + "pap", + "pau", + "pbi", + "pcd", + "pcm", + "pdc", + "pdt", + "ped", + "pex", + "pfl", + "pil", + "pip", + "pko", + "pl", + "pla", + "pms", + "png", + "pnn", + "pon", + "ppo", + "prg", + "pss", + "pt", + "ptp", + "puu", 
+ "pwa", + "qu", + "quc", + "qug", + "rai", + "rao", + "rcf", + "rej", + "rel", + "res", + "rgn", + "ria", + "rif-NL", + "rm", + "rmf", + "rmo", + "rmu", + "rn", + "rna", + "rng", + "ro", + "rob", + "rof", + "roo", + "rro", + "rtm", + "rug", + "rw", + "rwk", + "rwo", + "saf", + "saq", + "sas", + "sav", + "sba", + "sbe", + "sbp", + "sc", + "scn", + "sco", + "scs", + "sdc", + "se", + "sef", + "seh", + "sei", + "ses", + "sg", + "sgs", + "sgz", + "shk", + "sid", + "sig", + "sil", + "sim", + "sjr", + "sk", + "skc", + "sks", + "sl", + "sld", + "sli", + "sll", + "sly", + "sm", + "sma", + "smj", + "smn", + "smq", + "sms", + "sn", + "snc", + "snk", + "snp", + "snx", + "sny", + "so", + "sok", + "soq", + "soy", + "spd", + "spl", + "sps", + "sq", + "sr-ME", + "sr-RO", + "sr-RU", + "sr-TR", + "srn", + "srr", + "ss", + "ssd", + "ssg", + "ssy", + "st", + "stk", + "stq", + "su", + "sua", + "sue", + "suk", + "sur", + "sus", + "sv", + "sw", + "swc", + "swg", + "swp", + "sxn", + "sxw", + "szl", + "tal", + "tan", + "taq", + "tbc", + "tbd", + "tbf", + "tbg", + "tbo", + "tbw", + "tbz", + "tci", + "tdu", + "ted", + "tem", + "teo", + "tet", + "tfi", + "tgc", + "tgo", + "tgu", + "tif", + "tik", + "tim", + "tio", + "tiv", + "tk", + "tkl", + "tkr", + "tl", + "tlf", + "tlx", + "tly", + "tmh", + "tmy", + "tn", + "tnh", + "to", + "tof", + "tog", + "toq", + "tpi", + "tpm", + "tpz", + "tqo", + "tr", + "tru", + "trv", + "ts", + "tsg", + "tsw", + "ttd", + "tte", + "ttj", + "ttr", + "ttt", + "tuh", + "tul", + "tum", + "tuq", + "tvd", + "tvl", + "tvu", + "twh", + "twq", + "ty", + "tya", + "tzm", + "ubu", + "uli", + "umb", + "und", + "uok", + "uri", + "urt", + "urw", + "usa", + "utr", + "uvh", + "uvl", + "uz", + "vag", + "van", + "ve", + "vec", + "vep", + "vi", + "vic", + "viv", + "vls", + "vmf", + "vmw", + "vo", + "vot", + "vro", + "vun", + "vut", + "wa", + "wae", + "waj", + "wan", + "war", + "wbp", + "wci", + "wer", + "wgi", + "whg", + "wib", + "wiu", + "wiv", + "wja", + "wji", + "wls", + "wmo", + 
"wnc", + "wnu", + "wo", + "wob", + "wos", + "wrs", + "wsk", + "wuv", + "wwa", + "xav", + "xbi", + "xes", + "xh", + "xla", + "xog", + "xon", + "xrb", + "xsi", + "xsm", + "xwe", + "yam", + "yao", + "yap", + "yas", + "yat", + "yav", + "yay", + "yaz", + "yba", + "ybb", + "yby", + "yer", + "ygr", + "ygw", + "yko", + "yle", + "ylg", + "yll", + "yml", + "yo", + "yon", + "yrb", + "yre", + "yrl", + "yss", + "yua", + "yuj", + "yut", + "yuw", + "za", + "zag", + "zea", + "zia", + "zlm", + "zmi", + "zne", + "zu", + "zza", + ] + keywords = [ + "noise", + "rule-based", + "written", + "visual", + "highly-meaning-preserving", + "high-precision", + "high-coverage", + "high-generations", + ] def __init__(self, seed=664, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) From a51022baad9ac9cf26730ae4389d47a11fcec4de Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Wed, 22 Sep 2021 00:59:04 -0400 Subject: [PATCH 7/9] constructor updated --- transformations/font_change/transformation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/transformations/font_change/transformation.py b/transformations/font_change/transformation.py index 089614ec6..d8d5f9882 100644 --- a/transformations/font_change/transformation.py +++ b/transformations/font_change/transformation.py @@ -11,9 +11,6 @@ from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType -nltkdl("stopwords") -nltkdl("punkt") - def font_change(sentence, fonts, seed=666, max_outputs=1): random.seed(seed) @@ -1094,6 +1091,8 @@ class FontChange(SentenceOperation): ] def __init__(self, seed=664, max_outputs=1): + nltkdl("stopwords") + nltkdl("punkt") super().__init__(seed, max_outputs=max_outputs) # Mapping tables based on unicode-formatter (MIT license) From 92b3f780a9b95285d50ffdbd05ad0f1d905d687b Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Fri, 1 Oct 2021 02:13:55 -0400 Subject: [PATCH 8/9] docstring added, list of languages added --- 
.../font_change/list_of_languages.txt | 1022 ++++++++++++++++ transformations/font_change/transformation.py | 1048 +---------------- 2 files changed, 1041 insertions(+), 1029 deletions(-) create mode 100644 transformations/font_change/list_of_languages.txt diff --git a/transformations/font_change/list_of_languages.txt b/transformations/font_change/list_of_languages.txt new file mode 100644 index 000000000..771a3045f --- /dev/null +++ b/transformations/font_change/list_of_languages.txt @@ -0,0 +1,1022 @@ +aa +aai +aak +aau +abi +abr +abt +aby +acd +ace +ach +ada +ade +adj +adz +aey +af +agc +agd +agg +agm +ago +agq +aha +ahl +ajg +ak +ala +ali +aln +amm +amn +amo +amp +an +anc +ank +ann +any +aoj +aom +aoz +ape +apr +aps +apz +arh +arn +aro +asa +asg +aso +ast +ata +atg +atj +auy +avn +avt +avu +awb +awo +awx +ay +ayb +az +ban +bar +bas +bav +bba +bbb +bbc +bbd +bbj +bbp +bbr +bcf +bch +bci +bcm +bcn +bco +bcu +bdd +bef +beh +bem +bet +bew +bex +bez +bfd +bhg +bhl +bhy +bi +bib +big +bik +bim +bin +bio +biq +bjh +bjn +bjo +bjr +bjt +bjz +bkc +bkm +bkq +bku +bkv +bm +bmh +bmk +bmq +bmu +bng +bnm +bnp +boj +bom +bon +bqc +bqp +bqv +br +brz +bs +bsj +bss +bto +btt +buc +bud +bug +buk +bum +buo +bus +buu +bvb +bwd +bwr +bxh +bye +byr +bys +byv +byx +bza +bze +bzf +bzh +bzw +ca +cad +can +cbj +cch +ceb +cfa +cgg +ch +chk +cho +chp +cic +cjv +ckl +cko +cky +cla +cme +co +cps +crs +cs +csb +cy +da +dad +daf +dag +dah +dak +dav +dbd +dbq +ddn +de +ded +den +dga +dgh +dgi +dgr +dgz +dia +dje +dnj +dob +dop +dow +dri +dsb +dtm +dtp +dts +dua +duc +dud +dug +dva +dww +dyo +dyu +dzg +ebu +ee +efi +egl +eka +ema +emi +en +enn +enq +eo +eri +es +esu +et +etr +etu +etx +eu +ewo +ext +faa +fab +fag +fai +fan +ff +ffi +ffm +fi +fil +fit +fj +flr +fmp +fo +fod +fon +for +fpe +fqs +fr +frc +frp +frr +frs +fud +fue +fuf +fuh +fuq +fur +fuv +fuy +fvr +fy +ga +gaa +gaf +gag +gah +gaj +gam +gaw +gay +gba +gbf +gby +gcr +gd +gde +gdn +gdr +geb +gej +gel +gfk +ghs +gil +gim +gjn +gkn +gkp 
+gl +gmm +gn +gnd +gng +god +goi +gor +gos +grb +grw +gsw +gub +guc +gud +gur +guw +gux +guz +gv +gvf +gvs +gwi +gyi +ha +hag +ham +haw +hbb +hhy +hi-Latn +hia +hif +hig +hih +hil +hla +hmt +hnn +ho +hot +hr +hsb +ht +hu +hui +hz +ia +ian +iar +iba +ibb +iby +ica +ich +id +idd +idi +idu +ife +ig +igb +ige +ijj +ik +ikk +ikt +ikw +ikx +ilo +imo +in +io +iou +iri +is +it +iwm +iws +izh +izi +jab +jam +jbo +jbu +jen +jgk +jgo +jib +jmc +jra +jut +jv +jw +kab +kac +kad +kai +kaj +kam +kao +kbm +kbp +kbq +kbx +kcg +kck +kcl +kct +kde +kdl +kea +ken +kez +kfo +kg +kge +kgf +kgp +kha +khq +khs +khz +ki +kij +kiu +kiw +kj +kjd +kjs +kjy +kkc +kkj +kl +kln +klq +klt +klx +kmb +kmh +kmo +kms +kmu +kmw +knf +knp +kol +kos +koz +kpe +kpf +kpo +kpr +kpx +kqb +kqf +kqs +kr +kri +krj +krl +krs +ksb +ksd +ksf +ksh +ksj +ksr +ktm +kto +ktr +ku +kub +kud +kue +kuj +kun +kup +kus +kvg +kvr +kw +kwj +kwo +kwq +kxa +kxe +kxw +kxz +ky-Latn +ky-TR +kye +kyx +kzj +kzr +kzt +la +lag +laj +las +lb +lbu +lbw +lcm +ldb +led +lee +lem +leq +leu +lg +lgg +li +lia +lid +lig +lih +lij +ljp +lkt +lle +lln +lmo +lmp +ln +lns +lnu +loj +lok +lol +lor +los +loz +lt +ltg +lu +lua +luo +luy +lv +lzz +mad +maf +mak +man +mas +maw +maz +mbh +mbo +mbq +mbu +mbw +mci +mcp +mcq +mcr +mcu +mda +mdh +mdj +mdr +med +mee +mek +men +mer +met +meu +mfe +mfn +mfo +mfq +mg +mgh +mgl +mgo +mgy +mh +mhi +mhl +mi +mif +min +miw +mkl +mkp +mkw +mle +mlp +mls +mmo +mmu +mmx +mna +mnf +mo +moa +moe +moh +mos +mox +mpp +mps +mpt +mpx +mql +ms +ms-ID +mt +mtc +mtf +mti +mua +mur +mus +mva +mvn +mwk +mwv +mxc +mxm +myk +myw +myx +mzk +mzm +mzp +mzw +mzz +na +nac +naf +nak +nap +naq +nas +nb +nca +nce +ncf +nch +nco +ncu +nd +ndc +nds +neb +nex +nfr +ng +nga +ngb +ngl +nhb +nhe +nhw +nif +nii +nij +nin +niu +niy +niz +njo +nkg +nko +nl +nmg +nmz +nn +nnf +nnh +nnk +nnm +no +nop +nou +nr +nrb +nsn +nso +nss +ntm +ntr +nui +nup +nus +nuv +nux +nv +nwb +nxq +nxr +ny +nym +nyn +nzi +oc +ogc +okr +okv +om +ong +onn +ons +opm +oro 
+ozm +pag +pam +pap +pau +pbi +pcd +pcm +pdc +pdt +ped +pex +pfl +pil +pip +pko +pl +pla +pms +png +pnn +pon +ppo +prg +pss +pt +ptp +puu +pwa +qu +quc +qug +rai +rao +rcf +rej +rel +res +rgn +ria +rif-NL +rm +rmf +rmo +rmu +rn +rna +rng +ro +rob +rof +roo +rro +rtm +rug +rw +rwk +rwo +saf +saq +sas +sav +sba +sbe +sbp +sc +scn +sco +scs +sdc +se +sef +seh +sei +ses +sg +sgs +sgz +shk +sid +sig +sil +sim +sjr +sk +skc +sks +sl +sld +sli +sll +sly +sm +sma +smj +smn +smq +sms +sn +snc +snk +snp +snx +sny +so +sok +soq +soy +spd +spl +sps +sq +sr-ME +sr-RO +sr-RU +sr-TR +srn +srr +ss +ssd +ssg +ssy +st +stk +stq +su +sua +sue +suk +sur +sus +sv +sw +swc +swg +swp +sxn +sxw +szl +tal +tan +taq +tbc +tbd +tbf +tbg +tbo +tbw +tbz +tci +tdu +ted +tem +teo +tet +tfi +tgc +tgo +tgu +tif +tik +tim +tio +tiv +tk +tkl +tkr +tl +tlf +tlx +tly +tmh +tmy +tn +tnh +to +tof +tog +toq +tpi +tpm +tpz +tqo +tr +tru +trv +ts +tsg +tsw +ttd +tte +ttj +ttr +ttt +tuh +tul +tum +tuq +tvd +tvl +tvu +twh +twq +ty +tya +tzm +ubu +uli +umb +und +uok +uri +urt +urw +usa +utr +uvh +uvl +uz +vag +van +ve +vec +vep +vi +vic +viv +vls +vmf +vmw +vo +vot +vro +vun +vut +wa +wae +waj +wan +war +wbp +wci +wer +wgi +whg +wib +wiu +wiv +wja +wji +wls +wmo +wnc +wnu +wo +wob +wos +wrs +wsk +wuv +wwa +xav +xbi +xes +xh +xla +xog +xon +xrb +xsi +xsm +xwe +yam +yao +yap +yas +yat +yav +yay +yaz +yba +ybb +yby +yer +ygr +ygw +yko +yle +ylg +yll +yml +yo +yon +yrb +yre +yrl +yss +yua +yuj +yut +yuw +za +zag +zea +zia +zlm +zmi +zne +zu +zza \ No newline at end of file diff --git a/transformations/font_change/transformation.py b/transformations/font_change/transformation.py index d8d5f9882..292bf7ebf 100644 --- a/transformations/font_change/transformation.py +++ b/transformations/font_change/transformation.py @@ -13,24 +13,36 @@ def font_change(sentence, fonts, seed=666, max_outputs=1): + """ + Randomly choose words and a font for each word and transform the characters one by one. 
+ + parameters: + sentence (str): input sentence + fonts (dict): dictionary containing character replacements for various fonts + max_outputs (int): number of outputs for each input + + returns: + perturbed_texts (list): a list of sentences where random words are in different fonts. + """ random.seed(seed) perturbed_texts = [] for _ in itertools.repeat(None, max_outputs): - # Generate the sentence with random fonts - tokens_escaped_list = [] + # tokens_match_list: a list of re.match objects of the words in the sentence. (No stop words) + tokens_match_list = [] for token in word_tokenize(sentence): if token not in stopwords.words("english"): if token != ".": - tokens_escaped_list.extend( + tokens_match_list.extend( list(re.finditer(re.escape(token), sentence)) ) transformed_sentence = list(sentence) + # tokens_to_change: a list of randomly chosen words from tokens_match_list (up to three words) tokens_to_change = random.sample( - tokens_escaped_list, - random.randint(1, min(3, len(tokens_escaped_list) - 1)), + tokens_match_list, + random.randint(1, min(3, len(tokens_match_list) - 1)), ) for ttc in tokens_to_change: while True: @@ -55,1030 +67,8 @@ class FontChange(SentenceOperation): TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TAGGING, ] - languages = [ - "aa", - "aai", - "aak", - "aau", - "abi", - "abr", - "abt", - "aby", - "acd", - "ace", - "ach", - "ada", - "ade", - "adj", - "adz", - "aey", - "af", - "agc", - "agd", - "agg", - "agm", - "ago", - "agq", - "aha", - "ahl", - "ajg", - "ak", - "ala", - "ali", - "aln", - "amm", - "amn", - "amo", - "amp", - "an", - "anc", - "ank", - "ann", - "any", - "aoj", - "aom", - "aoz", - "ape", - "apr", - "aps", - "apz", - "arh", - "arn", - "aro", - "asa", - "asg", - "aso", - "ast", - "ata", - "atg", - "atj", - "auy", - "avn", - "avt", - "avu", - "awb", - "awo", - "awx", - "ay", - "ayb", - "az", - "ban", - "bar", - "bas", - "bav", - "bba", - "bbb", - "bbc", - "bbd", - "bbj", - "bbp", - "bbr", - "bcf", - "bch", - "bci", - "bcm", - 
"bcn", - "bco", - "bcu", - "bdd", - "bef", - "beh", - "bem", - "bet", - "bew", - "bex", - "bez", - "bfd", - "bhg", - "bhl", - "bhy", - "bi", - "bib", - "big", - "bik", - "bim", - "bin", - "bio", - "biq", - "bjh", - "bjn", - "bjo", - "bjr", - "bjt", - "bjz", - "bkc", - "bkm", - "bkq", - "bku", - "bkv", - "bm", - "bmh", - "bmk", - "bmq", - "bmu", - "bng", - "bnm", - "bnp", - "boj", - "bom", - "bon", - "bqc", - "bqp", - "bqv", - "br", - "brz", - "bs", - "bsj", - "bss", - "bto", - "btt", - "buc", - "bud", - "bug", - "buk", - "bum", - "buo", - "bus", - "buu", - "bvb", - "bwd", - "bwr", - "bxh", - "bye", - "byr", - "bys", - "byv", - "byx", - "bza", - "bze", - "bzf", - "bzh", - "bzw", - "ca", - "cad", - "can", - "cbj", - "cch", - "ceb", - "cfa", - "cgg", - "ch", - "chk", - "cho", - "chp", - "cic", - "cjv", - "ckl", - "cko", - "cky", - "cla", - "cme", - "co", - "cps", - "crs", - "cs", - "csb", - "cy", - "da", - "dad", - "daf", - "dag", - "dah", - "dak", - "dav", - "dbd", - "dbq", - "ddn", - "de", - "ded", - "den", - "dga", - "dgh", - "dgi", - "dgr", - "dgz", - "dia", - "dje", - "dnj", - "dob", - "dop", - "dow", - "dri", - "dsb", - "dtm", - "dtp", - "dts", - "dua", - "duc", - "dud", - "dug", - "dva", - "dww", - "dyo", - "dyu", - "dzg", - "ebu", - "ee", - "efi", - "egl", - "eka", - "ema", - "emi", - "en", - "enn", - "enq", - "eo", - "eri", - "es", - "esu", - "et", - "etr", - "etu", - "etx", - "eu", - "ewo", - "ext", - "faa", - "fab", - "fag", - "fai", - "fan", - "ff", - "ffi", - "ffm", - "fi", - "fil", - "fit", - "fj", - "flr", - "fmp", - "fo", - "fod", - "fon", - "for", - "fpe", - "fqs", - "fr", - "frc", - "frp", - "frr", - "frs", - "fud", - "fue", - "fuf", - "fuh", - "fuq", - "fur", - "fuv", - "fuy", - "fvr", - "fy", - "ga", - "gaa", - "gaf", - "gag", - "gah", - "gaj", - "gam", - "gaw", - "gay", - "gba", - "gbf", - "gby", - "gcr", - "gd", - "gde", - "gdn", - "gdr", - "geb", - "gej", - "gel", - "gfk", - "ghs", - "gil", - "gim", - "gjn", - "gkn", - "gkp", - "gl", - "gmm", - 
"gn", - "gnd", - "gng", - "god", - "goi", - "gor", - "gos", - "grb", - "grw", - "gsw", - "gub", - "guc", - "gud", - "gur", - "guw", - "gux", - "guz", - "gv", - "gvf", - "gvs", - "gwi", - "gyi", - "ha", - "hag", - "ham", - "haw", - "hbb", - "hhy", - "hi-Latn", - "hia", - "hif", - "hig", - "hih", - "hil", - "hla", - "hmt", - "hnn", - "ho", - "hot", - "hr", - "hsb", - "ht", - "hu", - "hui", - "hz", - "ia", - "ian", - "iar", - "iba", - "ibb", - "iby", - "ica", - "ich", - "id", - "idd", - "idi", - "idu", - "ife", - "ig", - "igb", - "ige", - "ijj", - "ik", - "ikk", - "ikt", - "ikw", - "ikx", - "ilo", - "imo", - "in", - "io", - "iou", - "iri", - "is", - "it", - "iwm", - "iws", - "izh", - "izi", - "jab", - "jam", - "jbo", - "jbu", - "jen", - "jgk", - "jgo", - "jib", - "jmc", - "jra", - "jut", - "jv", - "jw", - "kab", - "kac", - "kad", - "kai", - "kaj", - "kam", - "kao", - "kbm", - "kbp", - "kbq", - "kbx", - "kcg", - "kck", - "kcl", - "kct", - "kde", - "kdl", - "kea", - "ken", - "kez", - "kfo", - "kg", - "kge", - "kgf", - "kgp", - "kha", - "khq", - "khs", - "khz", - "ki", - "kij", - "kiu", - "kiw", - "kj", - "kjd", - "kjs", - "kjy", - "kkc", - "kkj", - "kl", - "kln", - "klq", - "klt", - "klx", - "kmb", - "kmh", - "kmo", - "kms", - "kmu", - "kmw", - "knf", - "knp", - "kol", - "kos", - "koz", - "kpe", - "kpf", - "kpo", - "kpr", - "kpx", - "kqb", - "kqf", - "kqs", - "kr", - "kri", - "krj", - "krl", - "krs", - "ksb", - "ksd", - "ksf", - "ksh", - "ksj", - "ksr", - "ktm", - "kto", - "ktr", - "ku", - "kub", - "kud", - "kue", - "kuj", - "kun", - "kup", - "kus", - "kvg", - "kvr", - "kw", - "kwj", - "kwo", - "kwq", - "kxa", - "kxe", - "kxw", - "kxz", - "ky-Latn", - "ky-TR", - "kye", - "kyx", - "kzj", - "kzr", - "kzt", - "la", - "lag", - "laj", - "las", - "lb", - "lbu", - "lbw", - "lcm", - "ldb", - "led", - "lee", - "lem", - "leq", - "leu", - "lg", - "lgg", - "li", - "lia", - "lid", - "lig", - "lih", - "lij", - "ljp", - "lkt", - "lle", - "lln", - "lmo", - "lmp", - "ln", - "lns", - 
"lnu", - "loj", - "lok", - "lol", - "lor", - "los", - "loz", - "lt", - "ltg", - "lu", - "lua", - "luo", - "luy", - "lv", - "lzz", - "mad", - "maf", - "mak", - "man", - "mas", - "maw", - "maz", - "mbh", - "mbo", - "mbq", - "mbu", - "mbw", - "mci", - "mcp", - "mcq", - "mcr", - "mcu", - "mda", - "mdh", - "mdj", - "mdr", - "med", - "mee", - "mek", - "men", - "mer", - "met", - "meu", - "mfe", - "mfn", - "mfo", - "mfq", - "mg", - "mgh", - "mgl", - "mgo", - "mgy", - "mh", - "mhi", - "mhl", - "mi", - "mif", - "min", - "miw", - "mkl", - "mkp", - "mkw", - "mle", - "mlp", - "mls", - "mmo", - "mmu", - "mmx", - "mna", - "mnf", - "mo", - "moa", - "moe", - "moh", - "mos", - "mox", - "mpp", - "mps", - "mpt", - "mpx", - "mql", - "ms", - "ms-ID", - "mt", - "mtc", - "mtf", - "mti", - "mua", - "mur", - "mus", - "mva", - "mvn", - "mwk", - "mwv", - "mxc", - "mxm", - "myk", - "myw", - "myx", - "mzk", - "mzm", - "mzp", - "mzw", - "mzz", - "na", - "nac", - "naf", - "nak", - "nap", - "naq", - "nas", - "nb", - "nca", - "nce", - "ncf", - "nch", - "nco", - "ncu", - "nd", - "ndc", - "nds", - "neb", - "nex", - "nfr", - "ng", - "nga", - "ngb", - "ngl", - "nhb", - "nhe", - "nhw", - "nif", - "nii", - "nij", - "nin", - "niu", - "niy", - "niz", - "njo", - "nkg", - "nko", - "nl", - "nmg", - "nmz", - "nn", - "nnf", - "nnh", - "nnk", - "nnm", - "no", - "nop", - "nou", - "nr", - "nrb", - "nsn", - "nso", - "nss", - "ntm", - "ntr", - "nui", - "nup", - "nus", - "nuv", - "nux", - "nv", - "nwb", - "nxq", - "nxr", - "ny", - "nym", - "nyn", - "nzi", - "oc", - "ogc", - "okr", - "okv", - "om", - "ong", - "onn", - "ons", - "opm", - "oro", - "ozm", - "pag", - "pam", - "pap", - "pau", - "pbi", - "pcd", - "pcm", - "pdc", - "pdt", - "ped", - "pex", - "pfl", - "pil", - "pip", - "pko", - "pl", - "pla", - "pms", - "png", - "pnn", - "pon", - "ppo", - "prg", - "pss", - "pt", - "ptp", - "puu", - "pwa", - "qu", - "quc", - "qug", - "rai", - "rao", - "rcf", - "rej", - "rel", - "res", - "rgn", - "ria", - "rif-NL", - "rm", - 
"rmf", - "rmo", - "rmu", - "rn", - "rna", - "rng", - "ro", - "rob", - "rof", - "roo", - "rro", - "rtm", - "rug", - "rw", - "rwk", - "rwo", - "saf", - "saq", - "sas", - "sav", - "sba", - "sbe", - "sbp", - "sc", - "scn", - "sco", - "scs", - "sdc", - "se", - "sef", - "seh", - "sei", - "ses", - "sg", - "sgs", - "sgz", - "shk", - "sid", - "sig", - "sil", - "sim", - "sjr", - "sk", - "skc", - "sks", - "sl", - "sld", - "sli", - "sll", - "sly", - "sm", - "sma", - "smj", - "smn", - "smq", - "sms", - "sn", - "snc", - "snk", - "snp", - "snx", - "sny", - "so", - "sok", - "soq", - "soy", - "spd", - "spl", - "sps", - "sq", - "sr-ME", - "sr-RO", - "sr-RU", - "sr-TR", - "srn", - "srr", - "ss", - "ssd", - "ssg", - "ssy", - "st", - "stk", - "stq", - "su", - "sua", - "sue", - "suk", - "sur", - "sus", - "sv", - "sw", - "swc", - "swg", - "swp", - "sxn", - "sxw", - "szl", - "tal", - "tan", - "taq", - "tbc", - "tbd", - "tbf", - "tbg", - "tbo", - "tbw", - "tbz", - "tci", - "tdu", - "ted", - "tem", - "teo", - "tet", - "tfi", - "tgc", - "tgo", - "tgu", - "tif", - "tik", - "tim", - "tio", - "tiv", - "tk", - "tkl", - "tkr", - "tl", - "tlf", - "tlx", - "tly", - "tmh", - "tmy", - "tn", - "tnh", - "to", - "tof", - "tog", - "toq", - "tpi", - "tpm", - "tpz", - "tqo", - "tr", - "tru", - "trv", - "ts", - "tsg", - "tsw", - "ttd", - "tte", - "ttj", - "ttr", - "ttt", - "tuh", - "tul", - "tum", - "tuq", - "tvd", - "tvl", - "tvu", - "twh", - "twq", - "ty", - "tya", - "tzm", - "ubu", - "uli", - "umb", - "und", - "uok", - "uri", - "urt", - "urw", - "usa", - "utr", - "uvh", - "uvl", - "uz", - "vag", - "van", - "ve", - "vec", - "vep", - "vi", - "vic", - "viv", - "vls", - "vmf", - "vmw", - "vo", - "vot", - "vro", - "vun", - "vut", - "wa", - "wae", - "waj", - "wan", - "war", - "wbp", - "wci", - "wer", - "wgi", - "whg", - "wib", - "wiu", - "wiv", - "wja", - "wji", - "wls", - "wmo", - "wnc", - "wnu", - "wo", - "wob", - "wos", - "wrs", - "wsk", - "wuv", - "wwa", - "xav", - "xbi", - "xes", - "xh", - "xla", - "xog", 
- "xon", - "xrb", - "xsi", - "xsm", - "xwe", - "yam", - "yao", - "yap", - "yas", - "yat", - "yav", - "yay", - "yaz", - "yba", - "ybb", - "yby", - "yer", - "ygr", - "ygw", - "yko", - "yle", - "ylg", - "yll", - "yml", - "yo", - "yon", - "yrb", - "yre", - "yrl", - "yss", - "yua", - "yuj", - "yut", - "yuw", - "za", - "zag", - "zea", - "zia", - "zlm", - "zmi", - "zne", - "zu", - "zza", - ] + with open("list_of_languages.txt", "r") as f: + languages = [x.rstrip() for x in f.readlines()] keywords = [ "noise", "rule-based", From 9c82836ef812287ecb5f341732ef792c88886b7b Mon Sep 17 00:00:00 2001 From: Shahab Raji Date: Fri, 29 Oct 2021 14:35:21 -0400 Subject: [PATCH 9/9] data and code section added to README --- transformations/font_change/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/transformations/font_change/README.md b/transformations/font_change/README.md index 5c4954c2e..6d0a92001 100644 --- a/transformations/font_change/README.md +++ b/transformations/font_change/README.md @@ -8,7 +8,7 @@ Authors: [Shahab Raji](mailto:shahab.raji@rutgers.edu) (Rutgers University) and ## How does the transformation work? -Font Change adapts the appearance of randomly selected words in the input sentence. For each selected word, one of several possible appearance changes is chosen randomly. Such changes are achieved using Unicode characters based on mapping tables from the [𝓾𝓷𝓲𝓬𝓸𝓭𝓮 𝙛𝙤𝙧𝙢𝙖𝙩𝙩𝙚𝙧](https://github.com/DenverCoder1/unicode-formatter) (MIT license) tool. +Font Change adapts the appearance of randomly selected words in the input sentence. For each selected word, one of several possible appearance changes is chosen randomly. Examples: @@ -18,7 +18,13 @@ to > The quick brown 🅵🅾🆇 ɾnɯds over the lazy ᴅᴏɢ. -## Target Tasks +## Data and code provenance + +The changes in text are achieved using Unicode characters based on mapping tables from the [𝓾𝓷𝓲𝓬𝓸𝓭𝓮 𝙛𝙤𝙧𝙢𝙖𝙩𝙩𝙚𝙧](https://github.com/DenverCoder1/unicode-formatter) (MIT license) tool. 
+ +The code is implemented by the authors. + +## Target tasks This transformation can be used for data augmentation in text classification tasks.