Skip to content

Commit b9f7953

Browse files
committed
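Consistency improvements in the internal feature vector representation.
An array with integer keys is now passed through array_count_values() to turn it into a frequency vector. Bonus: by doing that in multinomial_nb_classifier we shave off a few function calls and additions, because the log-probability of each distinct feature is computed once and multiplied by its count instead of being added once per occurrence.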
1 parent d28bade commit b9f7953

File tree

6 files changed

+46
-15
lines changed

6 files changed

+46
-15
lines changed

classifier/multinomial_nb_classifier.php

+4-2
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,11 @@ public function classify(array $classes, Document $d) {
6060
public function getScore($class, Document $d) {
6161
$score = log($this->model->getPrior($class));
6262
$features = $this->feature_factory->getFeatureArray($class,$d);
63-
foreach ($features as $f)
63+
if (is_int(key($features)))
64+
$features = array_count_values($features);
65+
foreach ($features as $f=>$fcnt)
6466
{
65-
$score += log($this->model->getCondProb($f,$class));
67+
$score += $fcnt*log($this->model->getCondProb($f,$class));
6668
}
6769
return $score;
6870
}

clustering/euclidean_centroid.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class Euclidean implements CentroidFactory
1818
* @return array A sparse vector representing the document to the n-dimensional euclidean space
1919
*/
2020
protected function getVector(array $doc) {
21-
if (key($doc)===0)
21+
if (is_int(key($doc)))
2222
return array_count_values($doc);
2323
else
2424
return $doc;

feature_factories/callables_as_features.php

+31-4
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,33 @@
99
* (function names, closures, array($object,'func_name'), etc.) and
1010
* calls them consecutively using the return value as a feature's unique
1111
* string.
12+
*
13+
* The class can model both feature frequency and presence
1214
*/
1315
class FunctionFeatures implements FeatureFactory
1416
{
15-
17+
1618
protected $functions;
19+
protected $frequency;
20+
21+
/**
22+
* @param array $f An array of feature functions
23+
*/
1724
public function __construct(array $f=array()) {
1825
$this->functions=$f;
26+
$this->frequency=false;
27+
}
28+
/**
29+
* Set the feature factory to model frequency instead of presence
30+
*/
31+
public function modelFrequency() {
32+
$this->frequency = true;
33+
}
34+
/**
35+
* Set the feature factory to model presence instead of frequency
36+
*/
37+
public function modelPresence() {
38+
$this->frequency = false;
1939
}
2040
/**
2141
* Add a function as a feature
@@ -52,15 +72,22 @@ public function getFeatureArray($class, Document $d) {
5272
{
5373
foreach ($f as $ff)
5474
{
55-
$set[$ff] = 1;
75+
if (!isset($set[$ff]))
76+
$set[$ff] = 0;
77+
$set[$ff]++;
5678
}
5779
}
5880
else
5981
{
60-
$set[$f] = 1;
82+
if (!isset($set[$f]))
83+
$set[$f] = 0;
84+
$set[$f]++;
6185
}
6286
}
63-
return array_keys($set);
87+
if ($this->frequency)
88+
return $set;
89+
else
90+
return array_keys($set);
6491
}
6592

6693
}

models/feature_based_nb.php

+6-4
Original file line numberDiff line numberDiff line change
@@ -139,16 +139,18 @@ protected function countTrainingSet(FeatureFactory $ff, TrainingSet $tset, array
139139
$c = $tdoc->getClass();
140140
$ndocs_per_class[$c]++;
141141
$features = $ff->getFeatureArray($c,$tdoc);
142-
foreach ($features as $f)
142+
if (is_int(key($features)))
143+
$features = array_count_values($features);
144+
foreach ($features as $f=>$fcnt)
143145
{
144146
if (!isset($voc[$f]))
145147
$voc[$f] = 0;
146148

147-
$termcount_per_class[$c]++;
149+
$termcount_per_class[$c]+=$fcnt;
148150
if (isset($termcount[$c][$f]))
149-
$termcount[$c][$f]++;
151+
$termcount[$c][$f]+=$fcnt;
150152
else
151-
$termcount[$c][$f] = 1;
153+
$termcount[$c][$f] = $fcnt;
152154
}
153155
}
154156
}

similarity/cosine_similarity.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,11 @@ class CosineSimilarity implements Similarity, Distance
4545
public function similarity(&$A, &$B) {
4646
// This means they are simple text vectors
4747
// so we need to count to make them vectors
48-
if (key($A)===0)
48+
if (is_int(key($A)))
4949
$v1 = array_count_values($A);
5050
else
5151
$v1 = &$A;
52-
if (key($B)===0)
52+
if (is_int(key($B)))
5353
$v2 = array_count_values($B);
5454
else
5555
$v2 = &$B;

similarity/euclidean.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@ class Euclidean implements Distance
1515
* @return float The euclidean distance between $A and $B
1616
*/
1717
public function dist(&$A, &$B) {
18-
if (key($A)===0)
18+
if (is_int(key($A)))
1919
$v1 = array_count_values($A);
2020
else
2121
$v1 = &$A;
22-
if (key($B)===0)
22+
if (is_int(key($B)))
2323
$v2 = array_count_values($B);
2424
else
2525
$v2 = &$B;

0 commit comments

Comments
 (0)