From beb75487922c9f51ce7a2c6a2335e4a23a456089 Mon Sep 17 00:00:00 2001 From: joseph Date: Sat, 24 Jul 2021 14:47:58 +0200 Subject: [PATCH 01/10] add synonym insertion transformation --- transformations/synonym_insertion/README.md | 68 ++++++++++ transformations/synonym_insertion/__init__.py | 1 + .../synonym_insertion/requirements.txt | 1 + transformations/synonym_insertion/test.json | 60 +++++++++ .../synonym_insertion/transformation.py | 116 ++++++++++++++++++ 5 files changed, 246 insertions(+) create mode 100644 transformations/synonym_insertion/README.md create mode 100644 transformations/synonym_insertion/__init__.py create mode 100644 transformations/synonym_insertion/requirements.txt create mode 100644 transformations/synonym_insertion/test.json create mode 100644 transformations/synonym_insertion/transformation.py diff --git a/transformations/synonym_insertion/README.md b/transformations/synonym_insertion/README.md new file mode 100644 index 000000000..989f9b961 --- /dev/null +++ b/transformations/synonym_insertion/README.md @@ -0,0 +1,68 @@ +# Synonym Insertion +This perturbation adds noise to all types of text sources (sentence, paragraph, etc.) by randomly inserting synonyms of randomly selected words excluding punctuations and stopwords. + +Author1 name: Tshephisho Sefara + +Author1 email: sefarat@gmail.com + +Author1 Affiliation: Council for Scientific and Industrial Research + +Author2 name: Vukosi Marivate + +Author2 email: vima@vima.co.za + +Author2 Affiliation: University of Pretoria + +## What type of a transformation is this? +This transformation could augment the semantic representation of the sentence as well as test model robustness by inserting synonyms of random words excluding punctuations and stopwords. + + +## What tasks does it intend to benefit? +This perturbation would benefit all tasks on text classification and generation. + +Benchmark results: + +- Text Classification: we run sentiment analysis on a 1% sample of the IMDB dataset. The original accuracy is 96.0 and the perturbed accuracy is 94.0. +```{'accuracy': 96.0, + 'dataset_name': 'imdb', + 'model_name': 'aychang/roberta-base-imdb', + 'no_of_examples': 250, + 'pt_accuracy': 94.0, + 'split': 'test[:1%]'} +``` +- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original bleu is 15.99 and the perturbed bleu is ???. + +## Related Work +This perturbation is adapted from our TextAugmentation library https://github.com/dsfsi/textaugment +```bibtex +@inproceedings{marivate2020improving, + title={Improving short text classification through global augmentation methods}, + author={Marivate, Vukosi and Sefara, Tshephisho}, + booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction}, + pages={385--399}, + year={2020}, + organization={Springer} +} +``` + +The synonyms are based on WordNet via NLTK + +```bibtex +@book{miller1998wordnet, + title={WordNet: An electronic lexical database}, + author={Miller, George A}, + year={1998}, + publisher={MIT press} +} +@inproceedings{bird2006nltk, + title={NLTK: the natural language toolkit}, + author={Bird, Steven}, + booktitle={Proceedings of the COLING/ACL 2006 Interactive Presentation Sessions}, + pages={69--72}, + year={2006} +} +``` + + +## What are the limitations of this transformation? +The space of synonyms depends on WordNet and could be limited. The transformation might introduce non-grammatical segments. diff --git a/transformations/synonym_insertion/__init__.py b/transformations/synonym_insertion/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/synonym_insertion/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/synonym_insertion/requirements.txt b/transformations/synonym_insertion/requirements.txt new file mode 100644 index 000000000..3e081137a --- /dev/null +++ b/transformations/synonym_insertion/requirements.txt @@ -0,0 +1 @@ +nltk>=3.4 diff --git a/transformations/synonym_insertion/test.json b/transformations/synonym_insertion/test.json new file mode 100644 index 000000000..3fcd73e59 --- /dev/null +++ b/transformations/synonym_insertion/test.json @@ -0,0 +1,60 @@ +{ + "type": "synonym_insertion", + "test_cases": [ + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [ + { + "sentence": "Andrew finally returned the French book script to Chris that I bought last final week" + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [ + { + "sentence": "Sentences with gapping, such as Paul likes coffee coffee bean and Mary tea, lack an overt predicate to indicate bespeak the relation between two or more arguments argumentation." + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [ + { + "sentence": "Alice in Wonderland is a 2010 American live - action / animated inspire dark fantasy illusion adventure film" + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + }, + "outputs": [ + { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [ + { + "sentence": "Neuroplasticity is a continuous processing allowing short - term terminus, medium - term terminus, and long - term condition remodeling of the neuronosynaptic organization." + } + ] + } + ] +} diff --git a/transformations/synonym_insertion/transformation.py b/transformations/synonym_insertion/transformation.py new file mode 100644 index 000000000..49dcb4215 --- /dev/null +++ b/transformations/synonym_insertion/transformation.py @@ -0,0 +1,116 @@ +import random +import re +from abc import ABC + +import nltk +import spacy +from nltk.corpus import wordnet, stopwords + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType +from initialize import spacy_nlp + +""" +Base Class for implementing the different input transformations a generation should be robust against. +""" + + +class InsertWordTransformation: + nlp = None + + def __init__(self, seed=0, max_outputs=1, prob=0.5): + self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + self.max_outputs = max_outputs + self.seed = seed + self.prob = prob + self.stopwords = stopwords.words('english') + + def untokenize(self, words: list): + """ + Untokenizing a text undoes the tokenizing operation, restoring + punctuation and spaces to the places that people expect them to be. + Ideally, `untokenize(tokenize(text))` should be identical to `text`, + except for line breaks. + ref: https://github.com/commonsense/metanl/blob/master/metanl/token_utils.py#L28 + """ + text = " ".join(words) + step1 = ( + text.replace("`` ", '"').replace(" ''", '"').replace(". . .", "...") + ) + step2 = step1.replace(" ( ", " (").replace(" ) ", ") ") + step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2) + step4 = re.sub(r" ([.,:;?!%]+)$", r"\1", step3) + step5 = ( + step4.replace(" '", "'") + .replace(" n't", "n't") + .replace("can not", "cannot") + ) + step6 = step5.replace(" ` ", " '") + return step6.strip() + + def transform(self, input_text: str): + random.seed(self.seed) + pos_wordnet_dict = { + "VERB": "v", + "NOUN": "n", + "ADV": "r", + "ADJ": "s", + } + doc = self.nlp(input_text) + results = set() + for _ in range(self.max_outputs): + result = [] + for token in doc: + word = token.text + wordnet_pos = pos_wordnet_dict.get(token.pos_) + if not wordnet_pos: + result.append(word) + elif word in self.stopwords: + result.append(word) + else: + synsets = wordnet.synsets(word, pos=wordnet_pos) + if len(synsets) > 0: + synsets = [syn.name().split(".")[0] for syn in synsets] + synsets = [syn for syn in synsets if syn.lower() != word.lower()] + synsets = list(set(synsets)) # remove duplicate synonyms + if len(synsets) > 0 and random.random() < self.prob: + syn = random.choice(synsets) + syn = syn.replace("_", " ") + result.append(word) + result.append(syn) + else: + result.append(word) + else: + result.append(word) + result = self.untokenize(result) # rebuild the sentence + results.add(result) + return list(results) + + +""" +Insert words such as synonyms from WordNet via nltk +""" + + +class SynonymInsertion(SentenceOperation, ABC): + """ + This class is an implementation of synonym insertion in the sentence. Created by the Authors of TextAugment + https://github.com/dsfsi/textaugment + """ + tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] + languages = ["en"] + + def __init__(self, seed=0, prob=0.5, max_outputs=1): + super().__init__(seed, max_outputs=max_outputs) + nltk.download(["wordnet", "stopwords"]) + self.insert_word_transformation = InsertWordTransformation( + seed, max_outputs, prob + ) + + def generate(self, sentence: str): + result = self.insert_word_transformation.transform( + input_text=sentence, + ) + if self.verbose: + print(f"Perturbed Input from {self.name()} : {result}") + return result From d0d9c800a49cf95c1c9e38273b7e154e6cc193db Mon Sep 17 00:00:00 2001 From: Vukosi Date: Sat, 24 Jul 2021 17:41:13 +0200 Subject: [PATCH 02/10] Update README.md --- transformations/synonym_insertion/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformations/synonym_insertion/README.md b/transformations/synonym_insertion/README.md index 989f9b961..c0189f161 100644 --- a/transformations/synonym_insertion/README.md +++ b/transformations/synonym_insertion/README.md @@ -9,9 +9,9 @@ Author1 Affiliation: Council for Scientific and Industrial Research Author2 name: Vukosi Marivate -Author2 email: vima@vima.co.za +Author2 email: vukosi.marivate@cs.up.ac.za, vima@vima.co.za -Author2 Affiliation: University of Pretoria +Author2 Affiliation: Department of Computer Science, University of Pretoria ## What type of a transformation is this? This transformation could augment the semantic representation of the sentence as well as test model robustness by inserting synonyms of random words excluding punctuations and stopwords. From f4201c841ab8fd380393de2121ef0cbada1cd8eb Mon Sep 17 00:00:00 2001 From: Vukosi Date: Sat, 24 Jul 2021 18:48:51 +0200 Subject: [PATCH 03/10] Update test.json --- transformations/synonym_insertion/test.json | 61 +-------------------- 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/transformations/synonym_insertion/test.json b/transformations/synonym_insertion/test.json index 3fcd73e59..8a8d974b1 100644 --- a/transformations/synonym_insertion/test.json +++ b/transformations/synonym_insertion/test.json @@ -1,60 +1 @@ -{ - "type": "synonym_insertion", - "test_cases": [ - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Andrew finally returned the French book to Chris that I bought last week" - }, - "outputs": [ - { - "sentence": "Andrew finally returned the French book script to Chris that I bought last final week" - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." - }, - "outputs": [ - { - "sentence": "Sentences with gapping, such as Paul likes coffee coffee bean and Mary tea, lack an overt predicate to indicate bespeak the relation between two or more arguments argumentation." - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" - }, - "outputs": [ - { - "sentence": "Alice in Wonderland is a 2010 American live - action / animated inspire dark fantasy illusion adventure film" - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" - }, - "outputs": [ - { - "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." - }, - "outputs": [ - { - "sentence": "Neuroplasticity is a continuous processing allowing short - term terminus, medium - term terminus, and long - term condition remodeling of the neuronosynaptic organization." - } - ] - } - ] -} +{"type": "synonym_insertion", "test_cases": [{"class": "SynonymInsertion", "inputs": {"sentence": "Andrew finally returned the French book to Chris that I bought last week"}, "outputs": [{"sentence": "Andrew finally returned the French book bible to Chris that I bought last final week"}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."}, "outputs": [{"sentence": "Sentences with gapping, such as Paul likes coffee coffee bean and Mary tea, lack an overt predicate to indicate bespeak the relation between two or more arguments argumentation."}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"}, "outputs": [{"sentence": "Alice in Wonderland is a 2010 American live - action / animated animate dark fantasy illusion adventure film"}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"}, "outputs": [{"sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."}, "outputs": [{"sentence": "Neuroplasticity is a continuous processing allowing short - term condition, medium - term condition, and long - term terminus remodeling of the neuronosynaptic organization."}]}]} From f8cd666ceee2e3eaa30040851777566aafa773a7 Mon Sep 17 00:00:00 2001 From: Vukosi Date: Sat, 24 Jul 2021 18:50:03 +0200 Subject: [PATCH 04/10] Update README.md --- transformations/synonym_insertion/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/transformations/synonym_insertion/README.md b/transformations/synonym_insertion/README.md index c0189f161..313a7c25d 100644 --- a/transformations/synonym_insertion/README.md +++ b/transformations/synonym_insertion/README.md @@ -30,7 +30,13 @@ Benchmark results: 'pt_accuracy': 94.0, 'split': 'test[:1%]'} ``` -- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original bleu is 15.99 and the perturbed bleu is ???. +- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original bleu is 15.99 and the perturbed bleu is 13.5. +```{'bleu': 16.0, + 'pt_bleu': 13.5, + 'model_name': 'sshleifer/distilbart-xsum-12-6', + 'split': 'test[:1%]', + 'dataset_name': 'xsum'} +``` ## Related Work This perturbation is adapted from our TextAugmentation library https://github.com/dsfsi/textaugment From b762f37a9a1f8b34be4779d20619d8570b80dfbb Mon Sep 17 00:00:00 2001 From: Vukosi Date: Sat, 24 Jul 2021 18:50:31 +0200 Subject: [PATCH 05/10] Update README.md --- transformations/synonym_insertion/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/transformations/synonym_insertion/README.md b/transformations/synonym_insertion/README.md index 313a7c25d..617f3bd09 100644 --- a/transformations/synonym_insertion/README.md +++ b/transformations/synonym_insertion/README.md @@ -23,7 +23,8 @@ This perturbation would benefit all tasks on text classification and generation. Benchmark results: - Text Classification: we run sentiment analysis on a 1% sample of the IMDB dataset. The original accuracy is 96.0 and the perturbed accuracy is 94.0. -```{'accuracy': 96.0, +``` + {'accuracy': 96.0, 'dataset_name': 'imdb', 'model_name': 'aychang/roberta-base-imdb', 'no_of_examples': 250, @@ -31,7 +32,8 @@ Benchmark results: 'split': 'test[:1%]'} ``` - Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original bleu is 15.99 and the perturbed bleu is 13.5. -```{'bleu': 16.0, +``` + {'bleu': 16.0, 'pt_bleu': 13.5, 'model_name': 'sshleifer/distilbart-xsum-12-6', 'split': 'test[:1%]', From b81cc9ee63e25644b435f5e4136cbbf6342f1f93 Mon Sep 17 00:00:00 2001 From: joseph Date: Sat, 24 Jul 2021 21:25:04 +0200 Subject: [PATCH 06/10] update readme --- transformations/synonym_insertion/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformations/synonym_insertion/README.md b/transformations/synonym_insertion/README.md index 617f3bd09..c79596abd 100644 --- a/transformations/synonym_insertion/README.md +++ b/transformations/synonym_insertion/README.md @@ -3,7 +3,7 @@ This perturbation adds noise to all types of text sources (sentence, paragraph, Author1 name: Tshephisho Sefara -Author1 email: sefarat@gmail.com +Author1 email: sefaratj@gmail.com Author1 Affiliation: Council for Scientific and Industrial Research From 1a56e0cae4876a4723a3190909afa49ed14598e8 Mon Sep 17 00:00:00 2001 From: joseph Date: Sat, 24 Jul 2021 21:25:24 +0200 Subject: [PATCH 07/10] format json --- transformations/synonym_insertion/test.json | 61 ++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/transformations/synonym_insertion/test.json b/transformations/synonym_insertion/test.json index 8a8d974b1..94d4f8595 100644 --- a/transformations/synonym_insertion/test.json +++ b/transformations/synonym_insertion/test.json @@ -1 +1,60 @@ -{"type": "synonym_insertion", "test_cases": [{"class": "SynonymInsertion", "inputs": {"sentence": "Andrew finally returned the French book to Chris that I bought last week"}, "outputs": [{"sentence": "Andrew finally returned the French book bible to Chris that I bought last final week"}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."}, "outputs": [{"sentence": "Sentences with gapping, such as Paul likes coffee coffee bean and Mary tea, lack an overt predicate to indicate bespeak the relation between two or more arguments argumentation."}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"}, "outputs": [{"sentence": "Alice in Wonderland is a 2010 American live - action / animated animate dark fantasy illusion adventure film"}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"}, "outputs": [{"sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"}]}, {"class": "SynonymInsertion", "inputs": {"sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."}, "outputs": [{"sentence": "Neuroplasticity is a continuous processing allowing short - term condition, medium - term condition, and long - term terminus remodeling of the neuronosynaptic organization."}]}]} +{ + "type": "synonym_insertion", + "test_cases": [ + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [ + { + "sentence": "Andrew finally returned the French book bible to Chris that I bought last final week" + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [ + { + "sentence": "Sentences with gapping, such as Paul likes coffee coffee bean and Mary tea, lack an overt predicate to indicate bespeak the relation between two or more arguments argumentation." + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [ + { + "sentence": "Alice in Wonderland is a 2010 American live - action / animated animate dark fantasy illusion adventure film" + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + }, + "outputs": [ + { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + } + ] + }, + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [ + { + "sentence": "Neuroplasticity is a continuous processing allowing short - term condition, medium - term condition, and long - term terminus remodeling of the neuronosynaptic organization." + } + ] + } + ] +} From 732b44db52e48b9a465e352f7bc484113ed72aab Mon Sep 17 00:00:00 2001 From: joseph Date: Tue, 5 Oct 2021 16:15:01 +0200 Subject: [PATCH 08/10] added list of keywords --- transformations/synonym_insertion/transformation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transformations/synonym_insertion/transformation.py b/transformations/synonym_insertion/transformation.py index 49dcb4215..3e49ea396 100644 --- a/transformations/synonym_insertion/transformation.py +++ b/transformations/synonym_insertion/transformation.py @@ -88,7 +88,7 @@ def transform(self, input_text: str): """ -Insert words such as synonyms from WordNet via nltk +Insert words such as synonyms from WordNet via nltk. """ @@ -99,6 +99,9 @@ class SynonymInsertion(SentenceOperation, ABC): """ tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] languages = ["en"] + keywords = [ + "tokenizer", "external-knowledge-based", "lexical", "low-precision", "low-coverage", "low-generations" + ] def __init__(self, seed=0, prob=0.5, max_outputs=1): super().__init__(seed, max_outputs=max_outputs) From 96c84d44212d9e5f269a0f5fcc9aa6197cd00d7c Mon Sep 17 00:00:00 2001 From: joseph Date: Tue, 5 Oct 2021 18:33:27 +0200 Subject: [PATCH 09/10] update robust evaluation results on readme and test.json --- transformations/synonym_insertion/README.md | 22 +++- transformations/synonym_insertion/test.json | 108 ++++++++++---------- 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/transformations/synonym_insertion/README.md b/transformations/synonym_insertion/README.md index c79596abd..1d318c29f 100644 --- a/transformations/synonym_insertion/README.md +++ b/transformations/synonym_insertion/README.md @@ -24,6 +24,12 @@ Benchmark results: - Text Classification: we run sentiment analysis on a 1% sample of the IMDB dataset. The original accuracy is 96.0 and the perturbed accuracy is 94.0. ``` +Applying transformation: +100%|██████████| 250/250 [00:18<00:00, 13.85it/s] +Finished transformation! 250 examples generated from 250 original examples, with 250 successfully transformed and 0 unchanged (1.0 perturb rate) +Here is the performance of the model on the transformed set +The accuracy on this subset which has 250 examples = 94.0 + {'accuracy': 96.0, 'dataset_name': 'imdb', 'model_name': 'aychang/roberta-base-imdb', @@ -31,13 +37,19 @@ Benchmark results: 'pt_accuracy': 94.0, 'split': 'test[:1%]'} ``` -- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original bleu is 15.99 and the perturbed bleu is 13.5. +- Text Generation: we run text generation on a 1% sample of the xsum dataset. The original bleu is 16 and the perturbed bleu is 13.85. ``` - {'bleu': 16.0, - 'pt_bleu': 13.5, +Applying transformation: +100%|██████████| 113/113 [00:12<00:00, 9.31it/s] +Finished transformation! 113 examples generated from 113 original examples, with 113 successfully transformed and 0 unchanged (1.0 perturb rate) +Here is the performance of the model on the transformed set +Length of Evaluation dataset is 113 +Predicted BLEU score = 13.849736846663058 +{'bleu': 16.0, + 'dataset_name': 'xsum', 'model_name': 'sshleifer/distilbart-xsum-12-6', - 'split': 'test[:1%]', - 'dataset_name': 'xsum'} + 'pt_bleu': 13.8, + 'split': 'test[:1%]'} ``` ## Related Work diff --git a/transformations/synonym_insertion/test.json b/transformations/synonym_insertion/test.json index 94d4f8595..bf1e34165 100644 --- a/transformations/synonym_insertion/test.json +++ b/transformations/synonym_insertion/test.json @@ -1,60 +1,60 @@ { - "type": "synonym_insertion", - "test_cases": [ - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Andrew finally returned the French book to Chris that I bought last week" + "type": "synonym_insertion", + "test_cases": [ + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [ + { + "sentence": "Andrew finally returned the French book koran to Chris that I bought last final week" + } + ] }, - "outputs": [ - { - "sentence": "Andrew finally returned the French book bible to Chris that I bought last final week" - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [ + { + "sentence": "Sentences with gapping, such as Paul likes coffee chocolate and Mary tea, lack an overt predicate to indicate argue the relation between two or more arguments controversy." + } + ] }, - "outputs": [ - { - "sentence": "Sentences with gapping, such as Paul likes coffee coffee bean and Mary tea, lack an overt predicate to indicate bespeak the relation between two or more arguments argumentation." - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [ + { + "sentence": "Alice in Wonderland is a 2010 American live - action / animated animize dark fantasy illusion adventure film" + } + ] }, - "outputs": [ - { - "sentence": "Alice in Wonderland is a 2010 American live - action / animated animate dark fantasy illusion adventure film" - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" - }, - "outputs": [ - { + { + "class": "SynonymInsertion", + "inputs": { "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" - } - ] - }, - { - "class": "SynonymInsertion", - "inputs": { - "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [ + { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + } + ] }, - "outputs": [ - { - "sentence": "Neuroplasticity is a continuous processing allowing short - term condition, medium - term condition, and long - term terminus remodeling of the neuronosynaptic organization." - } - ] - } - ] -} + { + "class": "SynonymInsertion", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [ + { + "sentence": "Neuroplasticity is a continuous processing allowing short - term terminus, medium - term terminus, and long - term condition remodeling of the neuronosynaptic organization." + } + ] + } + ] + } From 2126a52eb8649cf0dc3f3a378257205577c136d5 Mon Sep 17 00:00:00 2001 From: joseph Date: Tue, 5 Oct 2021 19:02:23 +0200 Subject: [PATCH 10/10] minor updates --- transformations/synonym_insertion/transformation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformations/synonym_insertion/transformation.py b/transformations/synonym_insertion/transformation.py index 3e49ea396..e393e544a 100644 --- a/transformations/synonym_insertion/transformation.py +++ b/transformations/synonym_insertion/transformation.py @@ -99,6 +99,7 @@ class SynonymInsertion(SentenceOperation, ABC): """ tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] languages = ["en"] + heavy = False keywords = [ "tokenizer", "external-knowledge-based", "lexical", "low-precision", "low-coverage", "low-generations" ]