sentence-bleu-nbest

Matthias Huck · Matthias Huck · commit 34d1d3a9043d · 2015-04-30T19:44:29.000+01:00
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
@@ -45,14 +45,14 @@ BleuScorer::BleuScorer(const string& config)
   } else if (reflen == REFLEN_CLOSEST) {
     m_ref_length_type = CLOSEST;
   } else {
-    throw runtime_error("Unknown reference length strategy: " + reflen);
+    UTIL_THROW2("Unknown reference length strategy: " + reflen);
   }
 }
 
 BleuScorer::~BleuScorer() {}
 
 size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
-                               unsigned int n, bool is_testing)
+                               unsigned int n, bool is_testing) const
 {
   assert(n > 0);
   vector<int> encoded_tokens;
@@ -94,41 +94,46 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
   mert::VocabularyFactory::GetVocabulary()->clear();
 
   //load reference data
-  for (size_t i = 0; i < referenceFiles.size(); ++i) {
+  for (size_t i = 0; i < referenceFiles.size(); ++i) 
+  {
     TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
 
-    if (!OpenReference(referenceFiles[i].c_str(), i)) {
-      throw runtime_error("Unable to open " + referenceFiles[i]);
+    ifstream ifs(referenceFiles[i].c_str());
+    UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
+    if (!OpenReferenceStream(&ifs, i)) {
+      UTIL_THROW2("Unable to open " + referenceFiles[i]);
     }
   }
 }
 
-bool BleuScorer::OpenReference(const char* filename, size_t file_id)
-{
-  ifstream ifs(filename);
-  if (!ifs) {
-    cerr << "Cannot open " << filename << endl;
-    return false;
-  }
-  return OpenReferenceStream(&ifs, file_id);
-}
-
 bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
 {
   if (is == NULL) return false;
 
   string line;
   size_t sid = 0;
   while (getline(*is, line)) {
+    // TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
+    //  (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
     line = preprocessSentence(line);
     if (file_id == 0) {
       Reference* ref = new Reference;
       m_references.push_back(ref);    // Take ownership of the Reference object.
     }
-    if (m_references.size() <= sid) {
-      cerr << "Reference " << file_id << "has too many sentences." << endl;
-      return false;
+    UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
+
+    ProcessReferenceLine(line, m_references[sid]);
+
+    if (sid > 0 && sid % 100 == 0) {
+      TRACE_ERR(".");
     }
+    ++sid;
+  }
+  return true;
+}
+
+void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
+{
     NgramCounts counts;
     size_t length = CountNgrams(line, counts, kBleuNgramOrder);
 
@@ -138,35 +143,30 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
       const NgramCounts::Value newcount = ci->second;
 
       NgramCounts::Value oldcount = 0;
-      m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
+      ref->get_counts()->Lookup(ngram, &oldcount);
       if (newcount > oldcount) {
-        m_references[sid]->get_counts()->operator[](ngram) = newcount;
+        ref->get_counts()->operator[](ngram) = newcount;
       }
     }
     //add in the length
-    m_references[sid]->push_back(length);
-    if (sid > 0 && sid % 100 == 0) {
-      TRACE_ERR(".");
-    }
-    ++sid;
-  }
-  return true;
+    ref->push_back(length);
 }
 
 void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
 {
-  if (sid >= m_references.size()) {
-    stringstream msg;
-    msg << "Sentence id (" << sid << ") not found in reference set";
-    throw runtime_error(msg.str());
-  }
+  UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");  // sid must index an entry of m_references
+  CalcBleuStats(m_references[sid], text, entry);  // the statistics for text are computed by CalcBleuStats against that reference
+}
+
+void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
+{
   NgramCounts testcounts;
   // stats for this line
   vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
   string sentence = preprocessSentence(text);
   const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);
 
-  const int reference_len = CalcReferenceLength(sid, length);
+  const int reference_len = CalcReferenceLength(ref, length);
   stats.push_back(reference_len);
 
   //precision on each ngram type
@@ -177,7 +177,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
     NgramCounts::Value correct = 0;
 
     NgramCounts::Value v = 0;
-    if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
+    if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
       correct = min(v, guess);
     }
     stats[len * 2 - 2] += correct;
@@ -207,21 +207,20 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
   return exp(logbleu);
 }
 
-int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
+int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const  // picks the length statistic of ref selected by m_ref_length_type
 {
   switch (m_ref_length_type) {
   case AVERAGE:
-    return m_references[sentence_id]->CalcAverage();
+    return ref->CalcAverage();
     break;
   case CLOSEST:
-    return m_references[sentence_id]->CalcClosest(length);
+    return ref->CalcClosest(length);  // the only strategy that takes length into account
     break;
   case SHORTEST:
-    return m_references[sentence_id]->CalcShortest();
+    return ref->CalcShortest();
     break;
   default:
-    cerr << "unknown reference types." << endl;
-    exit(1);
+    UTIL_THROW2("Unknown reference types");  // reached only when m_ref_length_type matches none of the cases above
   }
 }
 
@@ -298,29 +297,23 @@ vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string&
 
   vector<FeatureDataIterator> featureDataIters;
   vector<ScoreDataIterator> scoreDataIters;
-  for (size_t i = 0; i < featureFiles.size(); ++i) {
+  for (size_t i = 0; i < featureFiles.size(); ++i) 
+  {
     featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
     scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
   }
 
   vector<pair<size_t,size_t> > hypotheses;
-  if (featureDataIters[0] == FeatureDataIterator::end()) {
-    cerr << "Error: at the end of feature data iterator" << endl;
-    exit(1);
-  }
-  for (size_t i = 0; i < featureFiles.size(); ++i) {
-    if (featureDataIters[i] == FeatureDataIterator::end()) {
-      cerr << "Error: Feature file " << i << " ended prematurely" << endl;
-      exit(1);
-    }
-    if (scoreDataIters[i] == ScoreDataIterator::end()) {
-      cerr << "Error: Score file " << i << " ended prematurely" << endl;
-      exit(1);
-    }
-    if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
-      cerr << "Error: features and scores have different size" << endl;
-      exit(1);
-    }
+  UTIL_THROW_IF2(featureDataIters[0] == FeatureDataIterator::end(), 
+                 "At the end of feature data iterator");
+  for (size_t i = 0; i < featureFiles.size(); ++i) 
+  {
+    UTIL_THROW_IF2(featureDataIters[i] == FeatureDataIterator::end(), 
+                   "Feature file " << i << " ended prematurely");
+    UTIL_THROW_IF2(scoreDataIters[i] == ScoreDataIterator::end(), 
+                   "Score file " << i << " ended prematurely");
+    UTIL_THROW_IF2(featureDataIters[i]->size() != scoreDataIters[i]->size(), 
+                   "Features and scores have different size");
     for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
       hypotheses.push_back(pair<size_t,size_t>(i,j));
     }
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
@@ -42,11 +42,14 @@ class BleuScorer: public StatisticsBasedScorer
     return 2 * kBleuNgramOrder + 1;
   }
 
-  int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
+  void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
+
+  int CalcReferenceLength(const Reference* ref, std::size_t length) const;
 
   ReferenceLengthType GetReferenceLengthType() const {
     return m_ref_length_type;
   }
+
   void SetReferenceLengthType(ReferenceLengthType type) {
     m_ref_length_type = type;
   }
@@ -62,14 +65,14 @@ class BleuScorer: public StatisticsBasedScorer
   /**
    * Count the ngrams of each type, up to the given length in the input line.
    */
-  std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false);
+  std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
 
   void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
 
-  bool OpenReference(const char* filename, std::size_t file_id);
-
   // NOTE: this function is used for unit testing.
-  virtual bool OpenReferenceStream(std::istream* is, std::size_t file_id);
+  bool OpenReferenceStream(std::istream* is, std::size_t file_id);
+
+  void ProcessReferenceLine(const std::string& line, Reference* ref) const;
 
   //private:
 protected:
diff --git a/mert/Jamfile b/mert/Jamfile
@@ -66,11 +66,13 @@ exe evaluator : evaluator.cpp mert_lib ;
 
 exe sentence-bleu : sentence-bleu.cpp mert_lib ;
 
+exe sentence-bleu-nbest : sentence-bleu-nbest.cpp mert_lib ;
+
 exe pro : pro.cpp mert_lib ..//boost_program_options ;
 
 exe kbmira : kbmira.cpp mert_lib ..//boost_program_options ..//boost_filesystem ;
 
-alias programs : mert extractor evaluator pro kbmira sentence-bleu ;
+alias programs : mert extractor evaluator pro kbmira sentence-bleu sentence-bleu-nbest ;
 
 unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
 unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp
@@ -64,7 +64,7 @@ void Scorer::InitConfig(const string& config)
   }
 }
 
-void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
+void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) const
 {
   for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
        it; ++it) {
@@ -81,7 +81,7 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
   }
 }
 
-void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
+void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) const
 {
   for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
        it; ++it) {
diff --git a/mert/Scorer.h b/mert/Scorer.h
@@ -187,12 +187,12 @@ class Scorer
    * Tokenise line and encode.
    * Note: We assume that all tokens are separated by whitespaces.
    */
-  void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded);
+  void TokenizeAndEncode(const std::string& line, std::vector<int>& encoded) const;
 
   /*
    * Tokenize functions for testing only.
    */
-  void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded);
+  void TokenizeAndEncodeTesting(const std::string& line, std::vector<int>& encoded) const;
 
   /**
    * Every inherited scorer should call this function for each sentence
diff --git a/mert/sentence-bleu-nbest.cpp b/mert/sentence-bleu-nbest.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "BleuScorer.h"
+#include "moses/Util.h"
+
+using namespace MosesTuning;
+
+// Scores every "id ||| hypothesis [||| ...]" line read from stdin against the reference files named on the command line.
+int main(int argc, char **argv)
+{
+  if (argc == 1) {
+    std::cerr << "Usage: ./sentence-bleu-nbest ref1 [ref2 ...] < plain-nbest > bleu-scores" << std::endl;
+    return 1;
+  }
+  std::vector<std::string> refFiles(argv + 1, argv + argc);
+
+  // TODO all of these are empty for now
+  std::string config;
+  std::string factors;
+  std::string filter;
+
+  BleuScorer scorer(config);
+  scorer.setFactors(factors);
+  scorer.setFilter(filter);
+  scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
+
+  // Loading sentences and preparing statistics
+  std::string nbestLine;
+  while ( getline(std::cin, nbestLine) ) {
+    std::vector<std::string> items;
+    Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
+    if (items.size() < 2) { // items[0] and items[1] are read below; refuse lines that lack either field
+      std::cerr << "Malformed n-best line (expected \"id ||| hypothesis\"): " << nbestLine << std::endl;
+      return 1;
+    }
+    size_t sid = Moses::Scan<size_t>(items[0]);
+    ScoreStats scoreStats;
+    scorer.prepareStats(sid, items[1], scoreStats);
+    std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
+    std::cout << smoothedSentenceBleu(stats) << std::endl;
+  }
+  return 0;
+}
diff --git a/mert/sentence-bleu.cpp b/mert/sentence-bleu.cpp
@@ -23,22 +23,19 @@ int main(int argc, char **argv)
   BleuScorer scorer(config);
   scorer.setFactors(factors);
   scorer.setFilter(filter);
-  scorer.setReferenceFiles(refFiles);
-
-  vector<ScoreStats> entries;
+  scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
 
   // Loading sentences and preparing statistics
-  ScoreStats scoreentry;
-  string line;
-  while (getline(cin, line)) {
-    scorer.prepareStats(entries.size(), line, scoreentry);
-    entries.push_back(scoreentry);
+  string hypothesisLine;
+  size_t sid = 0;
+  while (getline(std::cin, hypothesisLine)) 
+  {
+    ScoreStats scoreStats;
+    scorer.prepareStats(sid, hypothesisLine, scoreStats);
+    vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
+    std::cout << smoothedSentenceBleu(stats) << std::endl;
+    ++sid;
   }
 
-  vector<ScoreStats>::const_iterator sentIt;
-  for (sentIt = entries.begin(); sentIt != entries.end(); sentIt++) {
-    vector<float> stats(sentIt->getArray(), sentIt->getArray() + sentIt->size());
-    cout << smoothedSentenceBleu(stats) << "\n";
-  }
   return 0;
 }
diff --git a/moses/Util.cpp b/moses/Util.cpp
@@ -90,13 +90,6 @@ bool FileExists(const std::string& filePath)
   return !ifs.fail();
 }
 
-const std::string Trim(const std::string& str, const std::string dropChars)
-{
-  std::string res = str;
-  res.erase(str.find_last_not_of(dropChars)+1);
-  return res.erase(0, res.find_first_not_of(dropChars));
-}
-
 void ResetUserTime()
 {
   g_timer.start();
diff --git a/moses/Util.h b/moses/Util.h

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ void Scorer::InitConfig(const string& config)`
`64`	`64`	`}`
`65`	`65`	`}`
`66`	`66`
`67`		`-void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)`
	`67`	`+void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) const`
`68`	`68`	`{`
`69`	`69`	`for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));`
`70`	`70`	`it; ++it) {`
`@@ -81,7 +81,7 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)`
`81`	`81`	`}`
`82`	`82`	`}`
`83`	`83`
`84`		`-void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)`
	`84`	`+void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) const`
`85`	`85`	`{`
`86`	`86`	`for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));`
`87`	`87`	`it; ++it) {`
Original file line number	Diff line number	Diff line change
`@@ -90,13 +90,6 @@ bool FileExists(const std::string& filePath)`
`90`	`90`	`return !ifs.fail();`
`91`	`91`	`}`
`92`	`92`
`93`		`-const std::string Trim(const std::string& str, const std::string dropChars)`
`94`		`-{`
`95`		`- std::string res = str;`
`96`		`- res.erase(str.find_last_not_of(dropChars)+1);`
`97`		`- return res.erase(0, res.find_first_not_of(dropChars));`
`98`		`-}`
`99`		`-`
`100`	`93`	`void ResetUserTime()`
`101`	`94`	`{`
`102`	`95`	`g_timer.start();`