Skip to content

Commit 0c2dff6

Browse files
committed
Initial commit for the psr-0 compatibility changes
1 parent b9f7953 commit 0c2dff6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+3316
-0
lines changed
+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace NlpTools\Classifiers;
4+
5+
interface Classifier
{
    /**
     * Pick, out of the candidate classes, the one that the document $d
     * fits best.
     *
     * @param  array    $classes The candidate classes to choose from
     * @param  Document $d       The document to be classified
     * @return string   The chosen class
     */
    public function classify(array $classes, \NlpTools\Documents\Document $d);
}
16+
17+
?>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
<?php
2+
3+
namespace NlpTools\Classifiers;
4+
5+
use \NlpTools\Documents\Document;
6+
use \NlpTools\FeatureFactories\FeatureFactory;
7+
use \NlpTools\Models\LinearModel;
8+
9+
/**
 * Classify using a linear model: a model that assigns a weight l to
 * each feature f.
 */
class FeatureBasedLinearClassifier implements Classifier
{
    // The feature factory
    protected $feature_factory;
    // The LinearModel
    protected $model;

    /**
     * @param FeatureFactory $ff Produces the features of a (class, document) pair
     * @param LinearModel    $m  Provides the weight of each feature
     */
    public function __construct(FeatureFactory $ff, LinearModel $m)
    {
        $this->feature_factory = $ff;
        $this->model = $m;
    }

    /**
     * Compute the vote for every class and return the class that
     * receives the maximum vote. On a tie the class that appears first
     * in $classes wins.
     *
     * @param  array    $classes A set of classes
     * @param  Document $d       A Document
     * @return string   A class
     * @throws \InvalidArgumentException If $classes is empty
     */
    public function classify(array $classes, Document $d)
    {
        if (empty($classes)) {
            throw new \InvalidArgumentException(
                'At least one class is required to classify a document'
            );
        }

        // foreach (instead of current()/next()) visits every class even when
        // a label is falsy ("0", "", 0) and does not depend on the position
        // of the array's internal pointer.
        $maxclass = null;
        $maxvote = null;
        $first = true;
        foreach ($classes as $class) {
            $v = $this->getVote($class, $d);
            // strict > keeps the earliest class on equal votes
            if ($first || $v > $maxvote) {
                $maxclass = $class;
                $maxvote = $v;
                $first = false;
            }
        }

        return $maxclass;
    }

    /**
     * Compute the features that fire for the Document $d. The sum of
     * the weights of the features is the vote.
     *
     * @param  string   $class The class whose vote is computed
     * @param  Document $d     The document being voted on
     * @return float    The vote of the model for class $class and Document $d
     */
    public function getVote($class, Document $d)
    {
        $v = 0;
        $features = $this->feature_factory->getFeatureArray($class, $d);
        foreach ($features as $f) {
            $v += $this->model->getWeight($f);
        }

        return $v;
    }
}
66+
67+
?>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
<?php
2+
3+
namespace NlpTools\Classifiers;
4+
5+
use \NlpTools\Documents\Document;
6+
use \NlpTools\FeatureFactories\FeatureFactory;
7+
use \NlpTools\Models\MultinomialNBModel;
8+
9+
/**
 * Use a multinomial NB model to classify a document
 */
class MultinomialNBClassifier implements Classifier
{
    // The feature factory
    protected $feature_factory;
    // The NBModel
    protected $model;

    /**
     * @param FeatureFactory     $ff Produces the features of a (class, document) pair
     * @param MultinomialNBModel $m  Provides the priors and conditional probabilities
     */
    public function __construct(FeatureFactory $ff, MultinomialNBModel $m)
    {
        $this->feature_factory = $ff;
        $this->model = $m;
    }

    /**
     * Compute the score of $d belonging to each class successively
     * and return the class that has the maximum score. On a tie the
     * class that appears first in $classes wins.
     *
     * @param  array    $classes The classes from which to choose
     * @param  Document $d       The document to classify
     * @return string   The class that has the maximum probability
     * @throws \InvalidArgumentException If $classes is empty
     */
    public function classify(array $classes, Document $d)
    {
        if (empty($classes)) {
            throw new \InvalidArgumentException(
                'At least one class is required to classify a document'
            );
        }

        // foreach (instead of current()/next()) visits every class even when
        // a label is falsy ("0", "", 0) and does not depend on the position
        // of the array's internal pointer.
        $maxclass = null;
        $maxscore = null;
        $first = true;
        foreach ($classes as $class) {
            $score = $this->getScore($class, $d);
            // strict > keeps the earliest class on equal scores
            if ($first || $score > $maxscore) {
                $maxclass = $class;
                $maxscore = $score;
                $first = false;
            }
        }

        return $maxclass;
    }

    /**
     * Compute the log of the probability of the Document $d belonging
     * to class $class. We compute the log so that we can sum over the
     * logarithms instead of multiplying each probability.
     *
     * @todo perhaps MultinomialNBModel should have precomputed the logs
     *       ex.: getLogPrior() and getLogCondProb()
     *
     * @param  string   $class The class for which we are getting a score
     * @param  Document $d     The document whose score we are getting
     * @return float    The log of the probability of $d belonging to $class
     */
    public function getScore($class, Document $d)
    {
        $score = log($this->model->getPrior($class));
        $features = $this->feature_factory->getFeatureArray($class, $d);
        // A plain list of features (integer keys) is collapsed into
        // feature => count pairs; any other array is used as
        // feature => count as it is.
        if (is_int(key($features))) {
            $features = array_count_values($features);
        }
        foreach ($features as $f => $fcnt) {
            $score += $fcnt * log($this->model->getCondProb($f, $class));
        }

        return $score;
    }
}
73+
74+
?>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
interface CentroidFactory
{
    /**
     * Go through the provided docs and build a doc that, for a given
     * distance metric, is the centroid of those docs.
     *
     * The second array selects which of the provided docs take part in
     * the computation of the centroid.
     *
     * @param  array $docs   The docs from which the centroid will be computed
     * @param  array $choose The indexes of the docs to use (if empty, all the docs are used)
     * @return mixed The centroid. It can be any form of data, a number or a vector (the same form as the data provided in docs)
     */
    public function getCentroid(array &$docs, array $choose=array());
}
20+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
/**
 * Computes the euclidean centroid of the provided sparse vectors
 */
class Euclidean implements CentroidFactory
{
    /**
     * If the document is a collection of tokens or features transform it to
     * a sparse vector with frequency information.
     *
     * Ex.: If 'A' appears twice in the doc the dimension 'A' will have value 2
     * in the resulting vector
     *
     * A doc is treated as a collection of tokens when its first key is an
     * integer; any other doc is returned untouched.
     *
     * @param  array $doc The doc data to transform to sparse vector
     * @return array A sparse vector representing the document to the n-dimensional euclidean space
     */
    protected function getVector(array $doc)
    {
        // key() reads the internal pointer, so rewind it first; otherwise a
        // doc whose pointer was left past the end would not be recognised.
        reset($doc);
        if (is_int(key($doc))) {
            return array_count_values($doc);
        }

        return $doc;
    }

    /**
     * Compute the mean value for each dimension.
     *
     * @param  array $docs   The docs from which the centroid will be computed
     * @param  array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
     * @return array The centroid as a sparse vector (an empty array when there are no docs)
     */
    public function getCentroid(array &$docs, array $choose=array())
    {
        // array_keys() instead of range(0, count-1): range(0, -1) would
        // yield array(0, -1) for an empty $docs, and the real keys also work
        // when $docs is not indexed 0..n-1.
        if (empty($choose)) {
            $choose = array_keys($docs);
        }
        $cnt = count($choose);
        if ($cnt === 0) {
            return array();
        }

        // accumulate the per-dimension sums
        $v = array();
        foreach ($choose as $idx) {
            $doc = $this->getVector($docs[$idx]);
            foreach ($doc as $k => $w) {
                if (!isset($v[$k])) {
                    $v[$k] = $w;
                } else {
                    $v[$k] += $w;
                }
            }
        }

        // turn the sums into means; written by key so that no dangling
        // reference is left behind
        foreach ($v as $k => $sum) {
            $v[$k] = $sum / $cnt;
        }

        return $v;
    }
}
54+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
/**
 * This class computes the centroid, under the hamming distance, of strings
 * that are the binary representations of integers (the strings are supposed
 * to only contain the characters 1 and 0).
 */
class Hamming implements CentroidFactory
{
    /**
     * Return a number in binary encoding in a string such that the sum of its
     * hamming distances to each document is minimized.
     *
     * Every bit of the result is decided by majority vote among the chosen
     * docs; on a tie the bit is '0'.
     *
     * Assumptions: The docs array should contain strings that are properly padded
     * binary (they should all be the same length).
     *
     * @param  array  $docs   The binary strings from which the centroid will be computed
     * @param  array  $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
     * @return string The centroid as a binary string (an empty string when there are no docs or the docs are empty strings)
     */
    public function getCentroid(array &$docs, array $choose=array())
    {
        // array_keys() instead of range(0, count-1): range(0, -1) would
        // yield array(0, -1) for an empty $docs.
        if (empty($choose)) {
            $choose = array_keys($docs);
        }
        if (empty($choose)) {
            return '';
        }

        // All docs are assumed to share one length, so take it from the
        // first doc that actually takes part in the computation.
        $bitl = strlen($docs[reset($choose)]);
        if ($bitl === 0) {
            return '';
        }

        // one counter per bit position: +1 for every '1', -1 for anything else
        $buckets = array_fill(0, $bitl, 0);
        foreach ($choose as $idx) {
            $s = $docs[$idx];
            for ($i = 0; $i < $bitl; $i++) {
                $buckets[$i] += ($s[$i] === '1') ? 1 : -1;
            }
        }

        // a positive counter means that '1' was the majority for that bit
        $centroid = '';
        foreach ($buckets as $balance) {
            $centroid .= ($balance > 0) ? '1' : '0';
        }

        return $centroid;
    }
}
49+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
/**
 * MeanAngle averages the directions of the given vectors: every vector
 * is scaled to unit length and the centroid is the mean of those unit
 * vectors (the mean itself is not rescaled to unit length). The purpose
 * is to compute a vector M such that sum(cosine_similarity(M,x_i)) is
 * maximized.
 */
class MeanAngle extends Euclidean
{
    /**
     * Scale the vector $v to unit euclidean length.
     *
     * @param  array $v A sparse vector
     * @return array The vector divided by its euclidean norm; a vector whose norm is zero is returned unchanged
     */
    protected function normalize(array $v)
    {
        $norm = 0.0;
        foreach ($v as $w) {
            $norm += $w * $w;
        }
        $norm = sqrt($norm);

        // An empty or all-zero vector has no direction; returning it as it
        // is avoids a division by zero.
        if ($norm == 0) {
            return $v;
        }

        return array_map(
            function ($vi) use ($norm) {
                return $vi / $norm;
            },
            $v
        );
    }

    /**
     * Compute the mean of the normalized vectors.
     *
     * @param  array $docs   The docs from which the centroid will be computed
     * @param  array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
     * @return array The centroid as a sparse vector (an empty array when there are no docs)
     */
    public function getCentroid(array &$docs, array $choose=array())
    {
        // array_keys() instead of range(0, count-1): range(0, -1) would
        // yield array(0, -1) for an empty $docs.
        if (empty($choose)) {
            $choose = array_keys($docs);
        }
        $cnt = count($choose);
        if ($cnt === 0) {
            return array();
        }

        // sum the unit vectors dimension by dimension
        $v = array();
        foreach ($choose as $idx) {
            $d = $this->normalize($this->getVector($docs[$idx]));
            foreach ($d as $i => $vi) {
                if (!isset($v[$i])) {
                    $v[$i] = $vi;
                } else {
                    $v[$i] += $vi;
                }
            }
        }

        return array_map(
            function ($vi) use ($cnt) {
                return $vi / $cnt;
            },
            $v
        );
    }
}
50+

src/NlpTools/Clustering/Clusterer.php

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering;
4+
5+
use NlpTools\FeatureFactories\FeatureFactory;
6+
use NlpTools\Documents\TrainingSet;
7+
8+
abstract class Clusterer
{
    /**
     * Split the documents into groups.
     *
     * @param  TrainingSet    $documents The documents to be clustered
     * @param  FeatureFactory $ff        A feature factory used to transform the given documents
     * @return array          The clusters, an array containing arrays of offsets for the documents
     */
    abstract public function cluster(TrainingSet $documents, FeatureFactory $ff);

    /**
     * Helper that turns a TrainingSet into an array of feature vectors.
     *
     * Each document is handed to the feature factory together with an
     * empty string as its class, and the results are collected in
     * iteration order.
     *
     * @param  TrainingSet    $documents The documents to transform
     * @param  FeatureFactory $ff        The feature factory applied to each document
     * @return array          One feature array per document
     */
    protected function getDocumentArray(TrainingSet $documents, FeatureFactory $ff)
    {
        $vectors = array();
        foreach ($documents as $document) {
            $vectors[] = $ff->getFeatureArray('', $document);
        }

        return $vectors;
    }
}
30+

0 commit comments

Comments
 (0)