Skip to content

Commit 4f6668c

Browse files
authored
k-means clustering implementation (#46)
1 parent 2b42296 commit 4f6668c

File tree

8 files changed

+454
-51
lines changed

8 files changed

+454
-51
lines changed

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ A Go language port of [Turfjs](http://turfjs.org/docs/)
77

88
Turf for Go is a ported library in GoLang ported from the Turf.js library.
99

10+
## Extra modules
11+
This version also include the clustering module that doesn't exist in the official turf library.
12+
1013
# Ported functions
1114

1215
## measurement
@@ -33,6 +36,9 @@ Turf for Go is a ported library in GoLang ported from the Turf.js library.
3336
- [ ] square
3437
- [ ] greatCircle
3538

39+
## clustering
40+
- [x] kmeans
41+
3642
## Coordinate Mutation
3743
- [ ] cleanCoords
3844
- [ ] flip

assert/assert.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -82,14 +82,14 @@ func isNil(object interface{}) bool {
8282

8383
// Nil asserts that the specified object is nil.
8484
//
85-
// assert.Nil(t, err)
85+
// assert.Nil(t, err)
8686
func Nil(t *testing.T, object interface{}) bool {
8787
return isNil(object)
8888
}
8989

9090
// NotNil asserts that the specified object is not nil.
9191
//
92-
// assert.NotNil(t, err)
92+
// assert.NotNil(t, err)
9393
func NotNil(t *testing.T, object interface{}) bool {
9494
return !isNil(object)
9595
}

clustering/clustering.go

+268
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
package clustering
2+
3+
import (
4+
"errors"
5+
"fmt"
6+
"math"
7+
"math/rand"
8+
"time"
9+
10+
"github.com/tomchavakis/geojson/geometry"
11+
"github.com/tomchavakis/turf-go/measurement"
12+
meta "github.com/tomchavakis/turf-go/meta/coordAll"
13+
)
14+
15+
type Distance string
16+
17+
const (
18+
Euclidean Distance = "Euclidean"
19+
Haversine Distance = "Haversine"
20+
)
21+
22+
// Parameters for the KMean Clustering
23+
type Parameters struct {
24+
k int // number of clusters
25+
points []geometry.Point // pointSet
26+
distanceType Distance
27+
}
28+
29+
// KMeans initialisation
30+
// http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf
31+
func KMeans(params Parameters) (map[geometry.Point][]geometry.Point, error) {
32+
33+
if params.k < 2 {
34+
return nil, fmt.Errorf("at least 2 centroids required")
35+
}
36+
37+
if params.k > len(params.points) {
38+
return nil, fmt.Errorf("clusters can't be more than the length of the points")
39+
}
40+
41+
return getClusters(params)
42+
43+
}
44+
45+
func initialisation(tmpCluster map[geometry.Point][]geometry.Point, centroids []geometry.Point, ctrIdx map[int]bool, params Parameters) (map[geometry.Point][]geometry.Point, error) {
46+
// create a cluster of points based on random centroids
47+
for i, p := range params.points {
48+
if _, isCentroid := ctrIdx[i]; isCentroid {
49+
//tmpCluster[p] = tmpCluster[p]
50+
continue
51+
}
52+
53+
// get the distance from all the centroids
54+
ptCtrsDistances, err := getDistance(p, centroids, params.distanceType)
55+
if err != nil {
56+
return nil, err
57+
}
58+
59+
// get the minimum distance index for this point
60+
minDistanceIdx := minDistanceIdx(ptCtrsDistances)
61+
nearestCentroid := centroids[minDistanceIdx]
62+
tmpCluster[nearestCentroid] = append(tmpCluster[nearestCentroid], p)
63+
}
64+
return tmpCluster, nil
65+
}
66+
67+
func getClusters(params Parameters) (map[geometry.Point][]geometry.Point, error) {
68+
ctrIdx, centroids := getCentroids(params)
69+
70+
tmpCluster := make(map[geometry.Point][]geometry.Point)
71+
meanCluster := make(map[geometry.Point][]geometry.Point)
72+
73+
init := false
74+
for {
75+
tmpMeanCluster := make(map[geometry.Point][]geometry.Point)
76+
77+
if !init {
78+
tmpCl, err := initialisation(tmpCluster, centroids, ctrIdx, params)
79+
if err != nil {
80+
return nil, err
81+
}
82+
tmpCluster = tmpCl
83+
}
84+
85+
// calculate the mass mean of each cluster
86+
clusterMeanPoints := []geometry.Point{}
87+
for i, c := range tmpCluster {
88+
// median included
89+
if !init {
90+
c = append(c, i)
91+
init = true
92+
}
93+
meanClusterPoint, err := meanClusterPoint(c)
94+
if err != nil {
95+
return nil, err
96+
}
97+
clusterMeanPoints = append(clusterMeanPoints, *meanClusterPoint)
98+
}
99+
100+
for _, p := range params.points {
101+
102+
// get the distance from all the cluster mean
103+
meanCtrsDistances, err := getDistance(p, clusterMeanPoints, params.distanceType)
104+
if err != nil {
105+
return nil, err
106+
}
107+
108+
// get the minimum distance index for this point
109+
minDistanceIdx := minDistanceIdx(meanCtrsDistances)
110+
nearestMean := clusterMeanPoints[minDistanceIdx]
111+
tmpMeanCluster[nearestMean] = append(tmpMeanCluster[nearestMean], p)
112+
}
113+
tmpCluster = tmpMeanCluster
114+
// exit point
115+
if isEqual(meanCluster, tmpMeanCluster) {
116+
break
117+
}
118+
119+
meanCluster = tmpMeanCluster
120+
}
121+
122+
//getDistance(d)
123+
return meanCluster, nil
124+
}
125+
126+
// TODO: Find the centroid
127+
128+
// https://sites.google.com/site/yangdingqi/home/foursquare-dataset
129+
// https://desktop.arcgis.com/en/arcmap/latest/tools/spatial-statistics-toolbox/h-how-mean-center-spatial-statistics-works.html
130+
// https://postgis.net/docs/ST_Centroid.html
131+
// https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_centroid
132+
// https://stackoverflow.com/questions/30299267/geometric-median-of-multidimensional-points
133+
// https://www.pnas.org/content/pnas/97/4/1423.full.pdf
134+
func meanClusterPoint(cluster []geometry.Point) (*geometry.Point, error) {
135+
var pointSet = []geometry.Point{}
136+
excludeWrapCoord := true
137+
for _, v := range cluster {
138+
coords, err := meta.CoordAll(&v, &excludeWrapCoord)
139+
if err != nil {
140+
return nil, err
141+
}
142+
pointSet = append(pointSet, coords...)
143+
}
144+
145+
coordsLength := len(pointSet)
146+
if coordsLength < 1 {
147+
return nil, errors.New("no coordinates found")
148+
}
149+
150+
xSum := 0.0
151+
ySum := 0.0
152+
153+
for i := 0; i < coordsLength; i++ {
154+
xSum += pointSet[i].Lng
155+
ySum += pointSet[i].Lat
156+
}
157+
158+
finalCenterLongtitude := xSum / float64(coordsLength)
159+
finalCenterLatitude := ySum / float64(coordsLength)
160+
161+
return &geometry.Point{
162+
Lat: finalCenterLatitude,
163+
Lng: finalCenterLongtitude,
164+
}, nil
165+
166+
}
167+
168+
func getCentroids(params Parameters) (map[int]bool, []geometry.Point) {
169+
var centroids []geometry.Point
170+
ctrIdx := make(map[int]bool)
171+
172+
idx := getRandoms(len(params.points), params.k)
173+
for _, v := range idx {
174+
centroids = append(centroids, params.points[v])
175+
ctrIdx[v] = true
176+
}
177+
178+
return ctrIdx, centroids
179+
}
180+
181+
// l length, k number of clusters
182+
func getRandoms(l int, k int) []int {
183+
r := rand.New(rand.NewSource(time.Now().UnixNano()))
184+
return r.Perm(l)[:k]
185+
}
186+
187+
func euclideanDistance(lat1 float64, lon1 float64, lat2 float64, lon2 float64) float64 {
188+
dist := math.Sqrt(math.Pow(lat2-lat1, 2) + math.Pow(lon2-lon1, 2))
189+
return dist
190+
}
191+
192+
// getDistance returns the distance between the point and the centroids
193+
func getDistance(p geometry.Point, centroids []geometry.Point, dt Distance) (map[int]float64, error) {
194+
ds := make(map[int]float64)
195+
196+
if dt == Euclidean {
197+
for i, c := range centroids {
198+
d := euclideanDistance(p.Lat, p.Lng, c.Lat, c.Lng)
199+
ds[i] = d
200+
}
201+
} else { // haversine distance implemented
202+
for i, c := range centroids {
203+
d, err := measurement.Distance(p.Lng, p.Lat, c.Lng, c.Lat, "")
204+
if err != nil {
205+
return nil, err
206+
}
207+
ds[i] = d
208+
}
209+
}
210+
return ds, nil
211+
}
212+
213+
func memoizeCluster(key geometry.Point, cluster []geometry.Point) map[string]bool {
214+
mr := make(map[string]bool)
215+
for _, v := range cluster {
216+
s := memoizationSignature(key, v)
217+
mr[s] = true
218+
}
219+
return mr
220+
}
221+
222+
func memoizationSignature(key geometry.Point, p geometry.Point) string {
223+
return fmt.Sprintf("%f_%f_%f", key, p.Lat, p.Lng)
224+
}
225+
226+
func mergeMaps(maps ...map[string]bool) map[string]bool {
227+
result := make(map[string]bool)
228+
for _, m := range maps {
229+
for k, v := range m {
230+
result[k] = v
231+
}
232+
}
233+
return result
234+
}
235+
236+
func isEqual(clusterA map[geometry.Point][]geometry.Point, clusterB map[geometry.Point][]geometry.Point) bool {
237+
if len(clusterA) != len(clusterB) {
238+
return false
239+
}
240+
241+
memo := make(map[string]bool)
242+
for i, v1 := range clusterA {
243+
tmp := memoizeCluster(i, v1)
244+
memo = mergeMaps(memo, tmp)
245+
}
246+
247+
for j, arrB := range clusterB {
248+
for _, v := range arrB {
249+
if !memo[memoizationSignature(j, v)] {
250+
return false
251+
}
252+
}
253+
}
254+
255+
return true
256+
}
257+
258+
func minDistanceIdx(ptCtrsDistances map[int]float64) int {
259+
minDistanceIdx := 0
260+
minD := ptCtrsDistances[minDistanceIdx]
261+
for i, d := range ptCtrsDistances {
262+
if d < minD {
263+
minD = d
264+
minDistanceIdx = i
265+
}
266+
}
267+
return minDistanceIdx
268+
}

0 commit comments

Comments
 (0)