@@ -45,14 +45,14 @@ BleuScorer::BleuScorer(const string& config)
45
45
} else if (reflen == REFLEN_CLOSEST) {
46
46
m_ref_length_type = CLOSEST;
47
47
} else {
48
- throw runtime_error (" Unknown reference length strategy: " + reflen);
48
+ UTIL_THROW2 (" Unknown reference length strategy: " + reflen);
49
49
}
50
50
}
51
51
52
52
BleuScorer::~BleuScorer () {}
53
53
54
54
size_t BleuScorer::CountNgrams (const string& line, NgramCounts& counts,
55
- unsigned int n, bool is_testing)
55
+ unsigned int n, bool is_testing) const
56
56
{
57
57
assert (n > 0 );
58
58
vector<int > encoded_tokens;
@@ -94,41 +94,46 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
94
94
mert::VocabularyFactory::GetVocabulary ()->clear ();
95
95
96
96
// load reference data
97
- for (size_t i = 0 ; i < referenceFiles.size (); ++i) {
97
+ for (size_t i = 0 ; i < referenceFiles.size (); ++i)
98
+ {
98
99
TRACE_ERR (" Loading reference from " << referenceFiles[i] << endl);
99
100
100
- if (!OpenReference (referenceFiles[i].c_str (), i)) {
101
- throw runtime_error (" Unable to open " + referenceFiles[i]);
101
+ ifstream ifs (referenceFiles[i].c_str ());
102
+ UTIL_THROW_IF2 (!ifs, " Cannot open " << referenceFiles[i]);
103
+ if (!OpenReferenceStream (&ifs, i)) {
104
+ UTIL_THROW2 (" Unable to open " + referenceFiles[i]);
102
105
}
103
106
}
104
107
}
105
108
106
- bool BleuScorer::OpenReference (const char * filename, size_t file_id)
107
- {
108
- ifstream ifs (filename);
109
- if (!ifs) {
110
- cerr << " Cannot open " << filename << endl;
111
- return false ;
112
- }
113
- return OpenReferenceStream (&ifs, file_id);
114
- }
115
-
116
109
bool BleuScorer::OpenReferenceStream (istream* is, size_t file_id)
117
110
{
118
111
if (is == NULL ) return false ;
119
112
120
113
string line;
121
114
size_t sid = 0 ;
122
115
while (getline (*is, line)) {
116
+ // TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
117
+ // (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
123
118
line = preprocessSentence (line);
124
119
if (file_id == 0 ) {
125
120
Reference* ref = new Reference;
126
121
m_references.push_back (ref); // Take ownership of the Reference object.
127
122
}
128
- if (m_references.size () <= sid) {
129
- cerr << " Reference " << file_id << " has too many sentences." << endl;
130
- return false ;
123
+ UTIL_THROW_IF2 (m_references.size () <= sid, " Reference " << file_id << " has too many sentences." );
124
+
125
+ ProcessReferenceLine (line, m_references[sid]);
126
+
127
+ if (sid > 0 && sid % 100 == 0 ) {
128
+ TRACE_ERR (" ." );
131
129
}
130
+ ++sid;
131
+ }
132
+ return true ;
133
+ }
134
+
135
+ void BleuScorer::ProcessReferenceLine (const std::string& line, Reference* ref) const
136
+ {
132
137
NgramCounts counts;
133
138
size_t length = CountNgrams (line, counts, kBleuNgramOrder );
134
139
@@ -138,35 +143,30 @@ bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
138
143
const NgramCounts::Value newcount = ci->second ;
139
144
140
145
NgramCounts::Value oldcount = 0 ;
141
- m_references[sid] ->get_counts ()->Lookup (ngram, &oldcount);
146
+ ref ->get_counts ()->Lookup (ngram, &oldcount);
142
147
if (newcount > oldcount) {
143
- m_references[sid] ->get_counts ()->operator [](ngram) = newcount;
148
+ ref ->get_counts ()->operator [](ngram) = newcount;
144
149
}
145
150
}
146
151
// add in the length
147
- m_references[sid]->push_back (length);
148
- if (sid > 0 && sid % 100 == 0 ) {
149
- TRACE_ERR (" ." );
150
- }
151
- ++sid;
152
- }
153
- return true ;
152
+ ref->push_back (length);
154
153
}
155
154
156
155
void BleuScorer::prepareStats (size_t sid, const string& text, ScoreStats& entry)
157
156
{
158
- if (sid >= m_references.size ()) {
159
- stringstream msg;
160
- msg << " Sentence id (" << sid << " ) not found in reference set" ;
161
- throw runtime_error (msg.str ());
162
- }
157
+ UTIL_THROW_IF2 (sid >= m_references.size (), " Sentence id (" << sid << " ) not found in reference set" );
158
+ CalcBleuStats (m_references[sid], text, entry);
159
+ }
160
+
161
+ void BleuScorer::CalcBleuStats (const Reference* ref, const std::string& text, ScoreStats& entry) const
162
+ {
163
163
NgramCounts testcounts;
164
164
// stats for this line
165
165
vector<ScoreStatsType> stats (kBleuNgramOrder * 2 );
166
166
string sentence = preprocessSentence (text);
167
167
const size_t length = CountNgrams (sentence, testcounts, kBleuNgramOrder , true );
168
168
169
- const int reference_len = CalcReferenceLength (sid , length);
169
+ const int reference_len = CalcReferenceLength (ref , length);
170
170
stats.push_back (reference_len);
171
171
172
172
// precision on each ngram type
@@ -177,7 +177,7 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
177
177
NgramCounts::Value correct = 0 ;
178
178
179
179
NgramCounts::Value v = 0 ;
180
- if (m_references[sid] ->get_counts ()->Lookup (testcounts_it->first , &v)) {
180
+ if (ref ->get_counts ()->Lookup (testcounts_it->first , &v)) {
181
181
correct = min (v, guess);
182
182
}
183
183
stats[len * 2 - 2 ] += correct;
@@ -207,21 +207,20 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
207
207
return exp (logbleu);
208
208
}
209
209
210
- int BleuScorer::CalcReferenceLength (size_t sentence_id, size_t length)
210
+ int BleuScorer::CalcReferenceLength (const Reference* ref, std:: size_t length) const
211
211
{
212
212
switch (m_ref_length_type) {
213
213
case AVERAGE:
214
- return m_references[sentence_id] ->CalcAverage ();
214
+ return ref ->CalcAverage ();
215
215
break ;
216
216
case CLOSEST:
217
- return m_references[sentence_id] ->CalcClosest (length);
217
+ return ref ->CalcClosest (length);
218
218
break ;
219
219
case SHORTEST:
220
- return m_references[sentence_id] ->CalcShortest ();
220
+ return ref ->CalcShortest ();
221
221
break ;
222
222
default :
223
- cerr << " unknown reference types." << endl;
224
- exit (1 );
223
+ UTIL_THROW2 (" Unknown reference types" );
225
224
}
226
225
}
227
226
@@ -298,29 +297,23 @@ vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string&
298
297
299
298
vector<FeatureDataIterator> featureDataIters;
300
299
vector<ScoreDataIterator> scoreDataIters;
301
- for (size_t i = 0 ; i < featureFiles.size (); ++i) {
300
+ for (size_t i = 0 ; i < featureFiles.size (); ++i)
301
+ {
302
302
featureDataIters.push_back (FeatureDataIterator (featureFiles[i]));
303
303
scoreDataIters.push_back (ScoreDataIterator (scoreFiles[i]));
304
304
}
305
305
306
306
vector<pair<size_t ,size_t > > hypotheses;
307
- if (featureDataIters[0 ] == FeatureDataIterator::end ()) {
308
- cerr << " Error: at the end of feature data iterator" << endl;
309
- exit (1 );
310
- }
311
- for (size_t i = 0 ; i < featureFiles.size (); ++i) {
312
- if (featureDataIters[i] == FeatureDataIterator::end ()) {
313
- cerr << " Error: Feature file " << i << " ended prematurely" << endl;
314
- exit (1 );
315
- }
316
- if (scoreDataIters[i] == ScoreDataIterator::end ()) {
317
- cerr << " Error: Score file " << i << " ended prematurely" << endl;
318
- exit (1 );
319
- }
320
- if (featureDataIters[i]->size () != scoreDataIters[i]->size ()) {
321
- cerr << " Error: features and scores have different size" << endl;
322
- exit (1 );
323
- }
307
+ UTIL_THROW_IF2 (featureDataIters[0 ] == FeatureDataIterator::end (),
308
+ " At the end of feature data iterator" );
309
+ for (size_t i = 0 ; i < featureFiles.size (); ++i)
310
+ {
311
+ UTIL_THROW_IF2 (featureDataIters[i] == FeatureDataIterator::end (),
312
+ " Feature file " << i << " ended prematurely" );
313
+ UTIL_THROW_IF2 (scoreDataIters[i] == ScoreDataIterator::end (),
314
+ " Score file " << i << " ended prematurely" );
315
+ UTIL_THROW_IF2 (featureDataIters[i]->size () != scoreDataIters[i]->size (),
316
+ " Features and scores have different size" );
324
317
for (size_t j = 0 ; j < featureDataIters[i]->size (); ++j) {
325
318
hypotheses.push_back (pair<size_t ,size_t >(i,j));
326
319
}
0 commit comments