From c04446057dba4f7c0ac089b273f61fb5103b4150 Mon Sep 17 00:00:00 2001
From: Nicolas REMOND
Date: Tue, 4 Jun 2019 16:40:54 +0900
Subject: [PATCH 1/3] rouge 1/2 added

---
 nlgeval/__init__.py                  |  2 +-
 nlgeval/pycocoevalcap/rouge/rouge.py | 99 ++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/nlgeval/__init__.py b/nlgeval/__init__.py
index a85db78..b23b05d 100644
--- a/nlgeval/__init__.py
+++ b/nlgeval/__init__.py
@@ -33,7 +33,7 @@ def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=Fa
     scorers = [
         (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
         (Meteor(), "METEOR"),
-        (Rouge(), "ROUGE_L"),
+        (Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]),
         (Cider(), "CIDEr")
     ]
     for scorer, method in scorers:
diff --git a/nlgeval/pycocoevalcap/rouge/rouge.py b/nlgeval/pycocoevalcap/rouge/rouge.py
index 3a10f5a..c023c0c 100755
--- a/nlgeval/pycocoevalcap/rouge/rouge.py
+++ b/nlgeval/pycocoevalcap/rouge/rouge.py
@@ -8,6 +8,7 @@
 # Author : Ramakrishna Vedantam

 import numpy as np
+import itertools
 import pdb

 def my_lcs(string, sub):
@@ -33,19 +34,97 @@ def my_lcs(string, sub):

     return lengths[len(string)][len(sub)]

+def _get_ngrams(n, text):
+    """TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py
+
+    Calculates n-grams.
+    Args:
+      n: which n-grams to calculate
+      text: An array of tokens
+    Returns:
+      A set of n-grams
+    """
+    ngram_set = set()
+    text_length = len(text)
+    max_index_ngram_start = text_length - n
+    for i in range(max_index_ngram_start + 1):
+        ngram_set.add(tuple(text[i:i + n]))
+    return ngram_set
+
+def _split_into_words(sentences):
+    """TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py
+
+    Splits multiple sentences into words and flattens the result"""
+    return list(itertools.chain(*[_.split(" ") for _ in sentences]))
+
+def _get_word_ngrams(n, sentences):
+    """TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py
+
+    Calculates word n-grams for multiple sentences.
+    """
+    assert len(sentences) > 0
+    assert n > 0
+
+    words = _split_into_words(sentences)
+    return _get_ngrams(n, words)
+
+def rouge_n(evaluated_sentences, reference_sentences, n=2):
+    """ TAKEN FROM https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py
+
+    Computes ROUGE-N of two text collections of sentences.
+    Source: http://research.microsoft.com/en-us/um/people/cyl/download/
+    papers/rouge-working-note-v1.3.1.pdf
+    Args:
+      evaluated_sentences: The sentences that have been picked by the summarizer
+      reference_sentences: The sentences from the reference set
+      n: Size of ngram. Defaults to 2.
+    Returns:
+      A tuple (f1, precision, recall) for ROUGE-N
+    Raises:
+      ValueError: raises exception if a param has len <= 0
+    """
+    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+        raise ValueError("Collections must contain at least 1 sentence.")
+
+    evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
+    reference_ngrams = _get_word_ngrams(n, reference_sentences)
+    reference_count = len(reference_ngrams)
+    evaluated_count = len(evaluated_ngrams)
+
+    # Gets the overlapping ngrams between evaluated and reference
+    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
+    overlapping_count = len(overlapping_ngrams)
+
+    # Handle edge case. This isn't mathematically correct, but it's good enough
+    if evaluated_count == 0:
+        precision = 0.0
+    else:
+        precision = overlapping_count / evaluated_count
+
+    if reference_count == 0:
+        recall = 0.0
+    else:
+        recall = overlapping_count / reference_count
+
+    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
+
+    # return overlapping_count / reference_count
+    return f1_score, precision, recall
+
 class Rouge():
     '''
     Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set

     '''
-    def __init__(self):
+    def __init__(self, n=2):
         # vrama91: updated the value below based on discussion with Hovey
         self.beta = 1.2
+        self._n = n

     def calc_score(self, candidate, refs):
         """
         Compute ROUGE-L score given one candidate and references for an image
-        :param candidate: str : candidate sentence to be evaluated
+        :param candidate: list of str : candidate sentence to be evaluated
         :param refs: list of str : COCO reference sentences for the particular image to be evaluated
         :returns score: int (ROUGE-L score for the candidate evaluated against references)
         """
@@ -54,6 +133,12 @@ def calc_score(self, candidate, refs):
         prec = []
         rec = []

+        # Compute ROUGE-n scores
+        rouge_n_scores = []
+        for n in range(1, self._n + 1):
+            f_score, _, _ = rouge_n(candidate, refs, n)
+            rouge_n_scores.append(f_score)
+
         # split into tokens
         token_c = candidate[0].split(" ")

@@ -72,7 +157,7 @@ def calc_score(self, candidate, refs):
             score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
         else:
             score = 0.0
-        return score
+        return rouge_n_scores + [score]

     def compute_score(self, gts, res):
         """
@@ -98,8 +183,12 @@ def compute_score(self, gts, res):
             assert(type(ref) is list)
             assert(len(ref) > 0)

-        average_score = np.mean(np.array(score))
-        return average_score, np.array(score)
+        score_type = []
+        for s_idx, s_type in enumerate(score[0]):
+            score_type.append([s[s_idx] for s in score])
+
+        average_score = [np.mean(np.array(s)) for s in score_type]
+        return average_score, [np.array(s) for s in score_type]

     def method(self):
         return "Rouge"
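Note (editor's sketch, not part of the patch series): the snippet below illustrates what the modified scorer returns after PATCH 1/3, assuming the patched nlgeval/pycocoevalcap/rouge/rouge.py is importable; the candidate and reference strings are invented toy data.

    from nlgeval.pycocoevalcap.rouge.rouge import Rouge, rouge_n

    candidate = ["the cat sat on the mat"]     # hypothesis, as a one-element list of str
    refs = ["the cat was sitting on the mat"]  # reference sentences for the same item

    # rouge_n() returns (f1, precision, recall) for a single n-gram order.
    f1, precision, recall = rouge_n(candidate, refs, n=1)

    # calc_score() now returns the list [ROUGE_1, ROUGE_2, ROUGE_L]
    # instead of the single ROUGE-L float it returned before the patch.
    rouge_scores = Rouge(2).calc_score(candidate, refs)
    assert len(rouge_scores) == 3

compute_score() aggregates per-example results the same way: it now returns a list of three averages (and three per-example arrays) in the order ROUGE_1, ROUGE_2, ROUGE_L.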
From 49464bff6b8b2c863dc2aaf3bdd2ede674a6d406 Mon Sep 17 00:00:00 2001
From: Nicolas REMOND
Date: Wed, 5 Jun 2019 08:51:27 +0900
Subject: [PATCH 2/3] other functions updated to take ROUGE N as well

---
 nlgeval/__init__.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/nlgeval/__init__.py b/nlgeval/__init__.py
index b23b05d..be917db 100644
--- a/nlgeval/__init__.py
+++ b/nlgeval/__init__.py
@@ -98,7 +98,7 @@ def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False
     scorers = [
         (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
         (Meteor(), "METEOR"),
-        (Rouge(), "ROUGE_L"),
+        (Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]),
         (Cider(), "CIDEr")
     ]
     for scorer, method in scorers:
@@ -151,7 +151,7 @@ class NLGEval(object):
         # Overlap
         'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4',
         'METEOR',
-        'ROUGE_L',
+        'ROUGE_1', 'ROUGE_2', 'ROUGE_L',
         'CIDEr',

         # Skip-thought
@@ -210,8 +210,18 @@ def load_scorers(self):

         if 'METEOR' not in self.metrics_to_omit:
             self.scorers.append((Meteor(), "METEOR"))
+
         if 'ROUGE_L' not in self.metrics_to_omit:
-            self.scorers.append((Rouge(), "ROUGE_L"))
+            omit_rouge_i = False
+            for i in range(1, 2 + 1):
+                if 'ROUGE_{}'.format(i) in self.metrics_to_omit:
+                    omit_rouge_i = True
+                    if i > 1:
+                        self.scorers.append((Rouge(i - 1), ['ROUGE_{}'.format(j) for j in range(1, i)] + ["ROUGE_L"]))
+                    break
+            if not omit_rouge_i:
+                self.scorers.append((Rouge(2), ["ROUGE_1", "ROUGE_2", "ROUGE_L"]))
+
         if 'CIDEr' not in self.metrics_to_omit:
             self.scorers.append((Cider(), "CIDEr"))

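Note (editor's sketch, not part of the patch series): with PATCH 2/3 applied, the new keys surface through the public API roughly as below. The constructor flags (no_skipthoughts, no_glove, metrics_to_omit) are the existing nlg-eval options, and the example strings are invented.

    from nlgeval import NLGEval

    n = NLGEval(no_skipthoughts=True, no_glove=True)  # overlap metrics only
    scores = n.compute_individual_metrics(ref=["the cat was sitting on the mat"],
                                          hyp="the cat sat on the mat")
    print(sorted(k for k in scores if k.startswith("ROUGE")))
    # expected: ['ROUGE_1', 'ROUGE_2', 'ROUGE_L']

    # Per the loop in load_scorers(), omitting ROUGE_2 keeps the lower-order
    # scores: the scorer falls back to (Rouge(1), ['ROUGE_1', 'ROUGE_L']).
    # Omitting ROUGE_1 or ROUGE_L drops the ROUGE scorer entirely.
    n2 = NLGEval(no_skipthoughts=True, no_glove=True, metrics_to_omit=['ROUGE_2'])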
"ROUGE_L"])) + if 'CIDEr' not in self.metrics_to_omit: self.scorers.append((Cider(), "CIDEr")) From 01f73db17235599f4345401d1dfbc51272c49c69 Mon Sep 17 00:00:00 2001 From: Nicolas REMOND Date: Wed, 5 Jun 2019 10:26:34 +0900 Subject: [PATCH 3/3] Tests updated --- nlgeval/tests/test_nlgeval.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nlgeval/tests/test_nlgeval.py b/nlgeval/tests/test_nlgeval.py index 08b5d60..867bee1 100644 --- a/nlgeval/tests/test_nlgeval.py +++ b/nlgeval/tests/test_nlgeval.py @@ -28,7 +28,7 @@ def test_compute_metrics_oo(self): self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5) self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5) self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5) - self.assertEqual(11, len(scores)) + self.assertEqual(13, len(scores)) scores = n.compute_metrics(ref_list=[ [ @@ -56,7 +56,7 @@ def test_compute_metrics_oo(self): self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5) self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5) self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5) - self.assertEqual(11, len(scores)) + self.assertEqual(13, len(scores)) # Non-ASCII tests. scores = n.compute_individual_metrics(ref=["Test en français.", @@ -73,7 +73,7 @@ def test_compute_metrics_oo(self): self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5) self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5) self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5) - self.assertEqual(11, len(scores)) + self.assertEqual(13, len(scores)) scores = n.compute_individual_metrics(ref=["テスト"], hyp="テスト") @@ -83,7 +83,7 @@ def test_compute_metrics_oo(self): self.assertAlmostEqual(0.0, scores['CIDEr'], places=3) self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3) self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3) - self.assertEqual(11, len(scores)) + self.assertEqual(13, len(scores)) def test_compute_metrics_omit(self): n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilairty']) @@ -99,7 +99,7 @@ def test_compute_metrics_omit(self): self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5) self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5) self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5) - self.assertEqual(7, len(scores)) + self.assertEqual(9, len(scores)) def test_compute_metrics(self): # The example from the README. @@ -118,4 +118,4 @@ def test_compute_metrics(self): self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5) self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5) self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5) - self.assertEqual(11, len(scores)) + self.assertEqual(13, len(scores))