diff --git a/transformations/space_between_characters/README.md b/transformations/space_between_characters/README.md new file mode 100644 index 000000000..44c54b353 --- /dev/null +++ b/transformations/space_between_characters/README.md @@ -0,0 +1,23 @@ +# Space Between Characters +This perturbation adds noise to all types of text sources (sentence, paragraph, etc.). + +Author name: Marco Di Giovanni +Author email: marco.digiovanni@polimi.it +Author Affiliation: Politecnico di Milano and University of Bologna + +## What type of a transformation is this? +This transformation acts like a perturbation to test robustness. A few words are picked at random and spaces are added between characters (e.g., "Marco" -> "M a r c o"). + +The probability of adding a space between characters can also be set (defaults to 1), allowing transformations like: "house" -> "h ouse" or "h o use". + +Generated transformations display high similarity to the source sentences, i.e., the code outputs highly precise and readable generations. + +## What tasks does it intend to benefit? +This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. + +It could also benefit tasks involving data from OCR systems. + +## What are the limitations of this transformation? +- The transformation's outputs are very simple. +- It is not capable of generating linguistically diverse text. +- This transformation will mainly affect the performance of token/word-level models, while character-level models should be much more robust. 
\ No newline at end of file diff --git a/transformations/space_between_characters/__init__.py b/transformations/space_between_characters/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/space_between_characters/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/space_between_characters/test.json b/transformations/space_between_characters/test.json new file mode 100644 index 000000000..84502713b --- /dev/null +++ b/transformations/space_between_characters/test.json @@ -0,0 +1,50 @@ +{ + "type": "space_between_characters", + "test_cases": [ + { + "class": "SpaceBetweenCharacters", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [{ + "sentence": "Andrew f i n a l l y returned the French book to C h r i s that I bought last w e e k" + }] + }, + { + "class": "SpaceBetweenCharacters", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [{ + "sentence": "Sentences w i t h gapping, such as Paul likes c o f f e e and M a r y tea, lack a n overt predicate to indicate the relation b e t w e e n two or more arguments." 
+ }] + }, + { + "class": "SpaceBetweenCharacters", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [{ + "sentence": "Alice i n Wonderland is a 2010 American l i v e - a c t i o n / a n i m a t e d dark f a n t a s y adventure film" + }] + }, + { + "class": "SpaceBetweenCharacters", + "inputs": { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + }, + "outputs": [{ + "sentence": "Ujjal D e v Dosanjh served as 33rd Premier o f British C o l u m b i a from 2000 t o 2001" + }] + }, + { + "class": "SpaceBetweenCharacters", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [{ + "sentence": "Neuroplasticity i s a continuous processing allowing short-term, m e d i u m - t e r m , and l o n g - t e r m remodeling of t h e neuronosynaptic organization." 
# transformations/space_between_characters/transformation.py
import random
from typing import List

from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType


def _space_out(word, prob_char):
    """Return ``word`` with spaces inserted between its characters.

    ``word`` must be non-empty. When ``prob_char`` is exactly 1 every gap
    receives a space and no random numbers are drawn; otherwise one draw
    from the module-level ``random`` generator is made per gap.
    """
    # Keep the fast path: it consumes no random draws, so removing it would
    # shift the random stream and change seeded outputs.
    if prob_char == 1:
        return " ".join(word)

    pieces = [word[0]]
    for letter in word[1:]:
        if random.random() <= prob_char:
            pieces.append(" ")
        pieces.append(letter)
    return "".join(pieces)


def add_spaces(text, prob_token=0.1, prob_char=1.0, seed=0, max_outputs=1):
    """Insert spaces between the characters of randomly selected words.

    Args:
        text: Input string; it is tokenized by splitting on single spaces.
        prob_token: A token is perturbed when a uniform draw is
            ``<= prob_token``.
        prob_char: For a perturbed token, a space is inserted before each
            character after the first when a uniform draw is
            ``<= prob_char`` (a value of 1 spaces out every character).
        seed: Value passed to ``random.seed``. Note that this reseeds the
            module-level ``random`` generator.
        max_outputs: Number of perturbed strings to produce. The generator
            is seeded once, before the first output, so later outputs
            continue the same random stream.

    Returns:
        A list of ``max_outputs`` perturbed strings.
    """
    random.seed(seed)

    words = text.split(" ")
    perturbed_texts = []
    for _ in range(max_outputs):
        perturbed_words = []
        for word in words:
            # Always draw first so the random stream does not depend on
            # whether the token is empty.
            selected = random.random() <= prob_token
            # Splitting on " " yields empty tokens for empty input or
            # consecutive spaces; there is nothing to space out in them, and
            # indexing word[0] would raise IndexError.
            if selected and word:
                perturbed_words.append(_space_out(word, prob_char))
            else:
                perturbed_words.append(word)
        perturbed_texts.append(" ".join(perturbed_words))
    return perturbed_texts


class SpaceBetweenCharacters(SentenceOperation):
    """Sentence perturbation that adds spaces inside randomly chosen words.

    Thin wrapper that forwards its configuration to ``add_spaces``.
    """

    # Task types this operation declares itself applicable to.
    tasks = [
        TaskType.TEXT_CLASSIFICATION,
        TaskType.TEXT_TO_TEXT_GENERATION,
        TaskType.TEXT_TAGGING,
    ]
    languages = ["All"]
    keywords = [
        "morphological",
        "noise",
        "rule-based",
        "highly-meaning-preserving",
        "high-precision",
        "high-coverage",
    ]

    def __init__(self, seed=42, max_outputs=1, prob_token=0.1, prob_char=1.0):
        """Store the perturbation probabilities and initialize the base class.

        Args:
            seed: Forwarded to ``SentenceOperation.__init__``.
            max_outputs: Forwarded to ``SentenceOperation.__init__``.
            prob_token: Per-word perturbation probability.
            prob_char: Per-gap space-insertion probability.
        """
        # NOTE(review): generate() reads self.seed and self.max_outputs,
        # which are presumably set by SentenceOperation.__init__ -- confirm.
        super().__init__(seed, max_outputs=max_outputs)
        self.prob_token = prob_token
        self.prob_char = prob_char

    def generate(self, sentence: str) -> List[str]:
        """Return the perturbed versions of ``sentence`` from ``add_spaces``."""
        return add_spaces(
            text=sentence,
            prob_token=self.prob_token,
            prob_char=self.prob_char,
            seed=self.seed,
            max_outputs=self.max_outputs,
        )