Skip to content

Commit 6322b82

Browse files
committed
Merge branch 'clustering'
2 parents c038432 + c28d0d7 commit 6322b82

21 files changed

+639
-53
lines changed

autoloader.php

+10-2
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,22 @@
2525
"NlpTools\Random\Distributions\Gamma"=>"/random/src/NlpTools/Random/Distributions/Gamma.php",
2626
"NlpTools\Random\Distributions\Dirichlet"=>"/random/src/NlpTools/Random/Distributions/Dirichlet.php",
2727
"NlpTools\Random\Distributions\AbstractDistribution"=>"/random/src/NlpTools/Random/Distributions/AbstractDistribution.php",
28+
"NlpTools\Clustering\CentroidFactories\Hamming"=>"/clustering/hamming_centroid.php",
29+
"NlpTools\Clustering\CentroidFactories\Euclidean"=>"/clustering/euclidean_centroid.php",
30+
"NlpTools\Clustering\CentroidFactories\CentroidFactory"=>"/clustering/centroid_factory.php",
31+
"NlpTools\Clustering\Clusterer"=>"/clustering/cluster.php",
32+
"NlpTools\Clustering\KMeans"=>"/clustering/k_means.php",
33+
"NlpTools\Clustering\CentroidFactories\MeanAngle"=>"/clustering/mean_angle_centroid.php",
2834
"NlpTools\Classifiers\MultinomialNBClassifier"=>"/classifier/multinomial_nb_classifier.php",
2935
"NlpTools\Classifiers\Classifier"=>"/classifier/classifier.php",
3036
"NlpTools\Classifiers\FeatureBasedLinearClassifier"=>"/classifier/feature_based_linear_classifier.php",
3137
"NlpTools\Similarity\JaccardIndex"=>"/similarity/jaccard_index.php",
38+
"NlpTools\Similarity\Euclidean"=>"/similarity/euclidean.php",
39+
"NlpTools\Similarity\Distance"=>"/similarity/distance.php",
3240
"NlpTools\Similarity\Simhash"=>"/similarity/simhash.php",
3341
"NlpTools\Similarity\CosineSimilarity"=>"/similarity/cosine_similarity.php",
34-
"NlpTools\Similarity\SetSimilarity"=>"/similarity/set_similarity.php",
35-
"NlpTools\Similarity\SetDistance"=>"/similarity/set_distance.php",
42+
"NlpTools\Similarity\Similarity"=>"/similarity/similarity.php",
43+
"NlpTools\Similarity\HammingDistance"=>"/similarity/hamming.php",
3644
"NlpTools\Stemmers\RegexStemmer"=>"/stemmers/regex_stemmer.php",
3745
"NlpTools\Stemmers\Stemmer"=>"/stemmers/stemmer.php",
3846
"NlpTools\Stemmers\PorterStemmer"=>"/stemmers/porter_stemmer.php",

clustering/centroid_factory.php

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
interface CentroidFactory
{
    /**
     * Build the centroid of a collection of documents.
     *
     * What "centroid" means is left to the implementation; it is expected to
     * match the distance metric the caller pairs this factory with.
     *
     * A subset of the documents can be selected through $choose, which holds
     * the offsets (into $docs) of the documents that should take part in the
     * computation. Passing an empty array selects every document.
     *
     * @param  array $docs   The documents the centroid is computed from
     * @param  array $choose Offsets of the documents to use; empty means "use all of them"
     * @return mixed The centroid, in the same representation as the entries of $docs (a number, a vector, ...)
     */
    public function getCentroid(array &$docs, array $choose=array());
}
20+

clustering/cluster.php

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering;
4+
5+
use NlpTools\FeatureFactories\FeatureFactory;
6+
use NlpTools\Documents\TrainingSet;
7+
8+
abstract class Clusterer
{
    /**
     * Partition the given documents into groups.
     *
     * @param  TrainingSet    $documents The documents that should be clustered
     * @param  FeatureFactory $ff        The feature factory used to transform each document
     * @return array The clusters: an array of arrays holding document offsets
     */
    abstract public function cluster(TrainingSet $documents, FeatureFactory $ff);

    /**
     * Convert a TrainingSet into a plain, sequentially indexed array with one
     * feature representation per document.
     *
     * Every document is handed to the feature factory together with an empty
     * string as the first argument.
     *
     * @param  TrainingSet    $documents The documents to transform
     * @param  FeatureFactory $ff        The feature factory applied to each document
     * @return array One entry per document, in iteration order
     */
    protected function getDocumentArray(TrainingSet $documents, FeatureFactory $ff)
    {
        $vectors = array();

        foreach ($documents as $document) {
            array_push($vectors, $ff->getFeatureArray('', $document));
        }

        return $vectors;
    }
}
30+

clustering/euclidean_centroid.php

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
/**
6+
* Computes the euclidean centroid of the provided sparse vectors
7+
*/
8+
class Euclidean implements CentroidFactory
{
    /**
     * If the document is a collection of tokens or features transform it to
     * a sparse vector with frequency information.
     *
     * Ex.: If 'A' appears twice in the doc the dimension 'A' will have value 2
     * in the resulting vector
     *
     * A document whose first key is the integer 0 is treated as a token list;
     * anything else (including an empty array) is returned untouched because
     * it is assumed to already be a sparse vector.
     *
     * @param  array $doc The doc data to transform to sparse vector
     * @return array A sparse vector representing the document to the n-dimensional euclidean space
     */
    protected function getVector(array $doc) {
        if (key($doc)===0)
            return array_count_values($doc);

        return $doc;
    }

    /**
     * Compute the mean value for each dimension.
     *
     * Dimensions that are missing from a document count as 0 for that
     * document: the sum of each dimension is always divided by the number of
     * documents used, not by the number of documents having that dimension.
     *
     * @param  array $docs   The docs from which the centroid will be computed
     * @param  array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
     * @return array The centroid as a sparse vector (dimension => mean value); empty if there are no docs
     */
    public function getCentroid(array &$docs, array $choose=array()) {
        // array_keys() instead of range(0,count-1): range(0,-1) would yield
        // array(0,-1) for an empty $docs and non sequential keys would be
        // silently mis-addressed.
        if (empty($choose))
            $choose = array_keys($docs);
        $cnt = count($choose);

        // sum every dimension over the chosen documents
        $sums = array();
        foreach ($choose as $idx) {
            foreach ($this->getVector($docs[$idx]) as $k=>$w) {
                if (isset($sums[$k]))
                    $sums[$k] += $w;
                else
                    $sums[$k] = $w;
            }
        }

        // divide into a fresh array so the returned value never contains a
        // dangling reference (as a by-reference foreach would leave behind)
        $centroid = array();
        foreach ($sums as $k=>$sum) {
            $centroid[$k] = $sum/$cnt;
        }

        return $centroid;
    }
}
54+

clustering/hamming_centroid.php

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
/**
6+
This class computes the centroid of the hamming distance between two strings
7+
that are the binary representations of two integers (the strings are supposed
8+
to only contain the characters 1 and 0).
9+
*/
10+
class Hamming implements CentroidFactory
{

    /**
     * Return a number in binary encoding in a string such that the sum of its
     * hamming distances of each document is minimized.
     *
     * Each bit of the result is decided by majority vote among the chosen
     * documents; when a position has as many '0' as '1' the result holds '0'.
     *
     * Assumptions: The docs array should contain strings that are properly padded
     * binary (they should all be the same length).
     *
     * @param  array  $docs   The binary strings from which the centroid will be computed
     * @param  array  $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
     * @return string The centroid as a binary string; the empty string if there are no docs
     */
    public function getCentroid(array &$docs, array $choose=array()) {
        // array_keys() instead of range(0,count-1): range(0,-1) would yield
        // array(0,-1) for an empty $docs
        if (empty($choose))
            $choose = array_keys($docs);
        if (empty($choose))
            return '';

        // take the bit length from a document that is actually used, index 0
        // is not guaranteed to be among the chosen ones (or to exist at all)
        $bitl = strlen($docs[reset($choose)]);
        if ($bitl===0)
            return '';

        // one counter per bit position: +1 for every '1', -1 for anything else
        $buckets = array_fill(0,$bitl,0);
        foreach ($choose as $idx) {
            $s = $docs[$idx];
            for ($i=0;$i<$bitl;$i++) {
                if ($s[$i]==='1')
                    $buckets[$i] += 1;
                else
                    $buckets[$i] -= 1;
            }
        }

        // a positive counter means the majority of the docs had '1' there
        $centroid = '';
        foreach ($buckets as $v) {
            $centroid .= ($v>0) ? '1' : '0';
        }

        return $centroid;
    }

}
49+

clustering/k_means.php

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering;
4+
5+
use NlpTools\Similarity\Distance;
6+
use NlpTools\Clustering\CentroidFactories\CentroidFactory;
7+
use NlpTools\Documents\TrainingSet;
8+
use NlpTools\FeatureFactories\FeatureFactory;
9+
10+
/**
11+
* This clusterer uses the KMeans algorithm for clustering documents.
12+
* It accepts as parameters the number of clusters and the distance metric
13+
* as well as the methodology for computing the new centroids (thus it
14+
* can be used to cluster documents in spaces other than the euclidean
15+
* vector space).
16+
* A description of this algorithm can be found at
17+
* http://en.wikipedia.org/wiki/K-means_clustering
18+
*/
19+
class KMeans extends Clusterer
{
    // the distance metric, an object exposing dist($a,$b)
    protected $dist;
    // the centroid factory, an object exposing getCentroid($docs,$choose)
    protected $centroidF;
    // the number of clusters to compute
    protected $n;
    // stop iterating when no centroid moved more than this
    protected $cutoff;

    /**
     * Initialize the K Means clusterer
     *
     * @param int             $n      The number of clusters to compute
     * @param Distance        $d      The distance metric to be used (Euclidean, Hamming, ...)
     * @param CentroidFactory $cf     This parameter will be used to create the new centroids from a set of documents
     * @param float           $cutoff When the maximum change of the centroids is smaller than that stop iterating
     */
    public function __construct($n, Distance $d, CentroidFactory $cf, $cutoff=1e-5) {
        $this->dist = $d;
        $this->n = $n;
        $this->cutoff = $cutoff;
        $this->centroidF = $cf;
    }

    /**
     * Apply the feature factory to the documents and then cluster the resulting array
     * using the provided distance metric and centroid factory.
     *
     * @param  TrainingSet    $documents The documents to be clustered
     * @param  FeatureFactory $ff        A feature factory to transform the documents given
     * @return array array($clusters,$centroids,$distances) where $clusters maps
     *               a centroid offset to the offsets of its documents,
     *               $centroids are the centroids those assignments were made
     *               against and $distances is the documents x centroids
     *               distance matrix
     * @throws \InvalidArgumentException If the number of clusters is smaller than 1
     *                                   or larger than the number of documents
     */
    public function cluster(TrainingSet $documents, FeatureFactory $ff) {
        // transform the documents according to the FeatureFactory
        $docs = $this->getDocumentArray($documents,$ff);

        // array_rand() cannot pick less than one or more than count($docs) keys
        if ($this->n<1 || $this->n>count($docs)) {
            throw new \InvalidArgumentException(
                'The number of clusters must be between 1 and the number of documents'
            );
        }

        // choose N centroids at random
        // array_rand() returns a bare key instead of an array when asked
        // for a single key, hence the cast
        $centroids = array();
        foreach ((array) array_rand($docs,$this->n) as $key) {
            $centroids[] = $docs[$key];
        }

        // The metric and the factory are called directly with plain variables.
        // Call-time pass-by-reference (f(&$x)) is a fatal error since PHP 5.4
        // and call_user_func() cannot forward references.
        $metric = $this->dist;
        $factory = $this->centroidF;

        // looooooooop
        while (true)
        {
            // compute the distance each document has from our centroids
            // the array is MxN where M = count($docs) and N = count($centroids)
            $distances = array();
            foreach ($docs as $idx=>$doc) {
                $row = array();
                foreach ($centroids as $ci=>$c) {
                    $row[$ci] = $metric->dist($c,$doc);
                }
                $distances[$idx] = $row;
            }

            // initialize the empty clusters
            $clusters = array_fill_keys(
                array_keys($centroids),
                array()
            );
            foreach ($distances as $idx=>$d) {
                // assign document idx to the closest centroid
                $clusters[array_search(min($d),$d,true)][] = $idx;
            }

            // compute the new centroids from the assigned documents
            // using the centroid factory
            $new_centroids = array();
            foreach ($clusters as $ci=>$cluster) {
                if (empty($cluster)) {
                    // An empty $choose means "use every document" to the
                    // factory, which would turn an empty cluster into the
                    // centroid of the whole collection. Keep the centroid
                    // where it was instead.
                    $new_centroids[$ci] = $centroids[$ci];
                } else {
                    $new_centroids[$ci] = $factory->getCentroid($docs,$cluster);
                }
            }

            // compute the change each centroid had from the previous one
            $changes = array();
            foreach ($new_centroids as $ci=>$new) {
                $old = $centroids[$ci];
                $changes[$ci] = $metric->dist($new,$old);
            }

            // if the largest change is small enough we are done
            if (max($changes)<$this->cutoff) {
                // return the clusters, the centroids and the distances
                return array($clusters,$centroids,$distances);
            }

            // update the centroids and loooooop again
            $centroids = $new_centroids;
        }
    }
}
115+

clustering/mean_angle_centroid.php

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<?php
2+
3+
namespace NlpTools\Clustering\CentroidFactories;
4+
5+
/**
6+
* MeanAngle computes the unit vector with angle the average of all
7+
* the given vectors. The purpose is to compute a vector M such that
8+
* sum(cosine_similarity(M,x_i)) is maximized
9+
*/
10+
class MeanAngle extends Euclidean
{
    /**
     * Scale a vector to unit (euclidean) length.
     *
     * A vector with zero length (all zeros or no dimensions at all) has no
     * direction and is returned unchanged instead of being divided by zero.
     *
     * @param  array $v The vector to normalize (dimension => value)
     * @return array The vector with the same keys scaled to length 1
     */
    protected function normalize(array $v) {
        $squares = 0;
        foreach ($v as $w) {
            $squares += $w*$w;
        }
        $norm = sqrt($squares);

        if ($norm==0)
            return $v;

        // array_map() with a single array keeps the keys intact
        return array_map(
            function ($vi) use($norm) {
                return $vi/$norm;
            },
            $v
        );
    }

    /**
     * Compute the mean of the chosen documents after each one has been scaled
     * to unit length, so that every document contributes only its direction.
     *
     * The returned vector is the plain average of those unit vectors; it is
     * not scaled back to unit length.
     *
     * @param  array $docs   The docs from which the centroid will be computed
     * @param  array $choose The indexes from which the centroid will be computed (if empty all the docs will be used)
     * @return array The centroid as a sparse vector (dimension => value); empty if there are no docs
     */
    public function getCentroid(array &$docs, array $choose=array()) {
        // array_keys() instead of range(0,count-1): range(0,-1) would yield
        // array(0,-1) for an empty $docs
        if (empty($choose))
            $choose = array_keys($docs);
        $cnt = count($choose);

        // sum the unit vectors dimension by dimension
        $v = array();
        foreach ($choose as $idx) {
            $d = $this->normalize($this->getVector($docs[$idx]));
            foreach ($d as $i=>$vi) {
                if (!isset($v[$i]))
                    $v[$i] = $vi;
                else
                    $v[$i] += $vi;
            }
        }

        return array_map(
            function ($vi) use($cnt) {
                return $vi/$cnt;
            },
            $v
        );
    }
}
50+

0 commit comments

Comments
 (0)