Skip to content

Commit 165b29d

Browse files
authored
Create dbscan.py
1 parent d69b5c9 commit 165b29d

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

Day-14-DBSCAN/dbscan.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""
2+
Check out this blog post on Density-Based Spatial Clustering (DBSCAN):
3+
https://blog.dominodatalab.com/topology-and-density-based-clustering/
4+
5+
- Compared to centroid-based clustering like k-means, density-based clustering works by
6+
identifying “dense” clusters of points, allowing it to learn clusters of arbitrary shape
7+
and identify outliers in the data.
8+
"""
9+
import torch
10+
from sklearn.datasets import load_iris
11+
from sklearn.model_selection import train_test_split
12+
from sklearn.metrics import accuracy_score
13+
from sklearn import datasets
14+
15+
class DBScan:
    """Density-based clustering (DBSCAN-style) over the rows of a torch tensor.

    A sample counts as a core point when at least ``min_points`` *other*
    samples lie strictly closer than ``eps`` to it. Clusters are grown
    outwards from core points. Samples that end up in no cluster receive the
    label ``len(self.clusters)``.
    """

    def __init__(self, eps=2.5, min_points=30):
        """
        :param eps: radius around a sample within which other samples count
            as its neighbours (strict ``<`` comparison).
        :param min_points: minimum number of neighbours (the sample itself
            excluded) required for a sample to be a core point.
        """
        self.eps = eps
        self.minimum_points = min_points

    def euclidean_distance(self, x1, x2):
        """
        :param x1: input tensor of shape (n, d)
        :param x2: input tensor of shape (m, d)
        :return: (n, m) tensor of pairwise Euclidean distances
        """
        # Force the exact (non matrix-multiply) kernel so that comparisons
        # against eps are not perturbed by the faster approximate path.
        return torch.cdist(
            x1, x2, compute_mode='donot_use_mm_for_euclid_dist')

    def direct_neighbours(self, sample):
        """
        :param sample: index (int or 0-dim tensor) of the sample in ``self.X``
            whose neighbours are wanted
        :return: 1-D long tensor with the indices, into ``self.X``, of every
            other sample closer than ``eps``
        """
        sample = int(sample)
        distances = self.euclidean_distance(
            self.X[sample].unsqueeze(0), self.X).squeeze(0)
        within_eps = distances < self.eps
        # A sample is never its own neighbour. Masking it here (instead of
        # filtering X first) keeps the returned indices aligned with self.X.
        within_eps[sample] = False
        return torch.nonzero(within_eps).flatten()

    def density_neighbors(self, sample, neighbors):
        """
        Expand a cluster from a core point until the border of the dense
        area (density determined by ``eps`` and ``min_points``) is reached.

        The traversal is depth-first in the same order as a recursive
        expansion, but uses an explicit stack so that large clusters cannot
        exhaust Python's recursion limit.

        :param sample: index of the core point the cluster starts from
        :param neighbors: indices of the samples within ``eps`` of ``sample``
        :return: list of int indices belonging to the cluster, starting with
            ``sample``. Every newly reached sample is added to
            ``self.visited_samples`` and its neighbours are cached in
            ``self.neighbors``.
        """
        cluster = [int(sample)]
        # Each stack entry is an iterator over the neighbours of one core
        # point; iterators remember their position, so popping resumes the
        # parent exactly where it left off.
        pending = [iter(torch.as_tensor(neighbors).tolist())]
        while pending:
            for neighbor_i in pending[-1]:
                neighbor_i = int(neighbor_i)
                if neighbor_i in self.visited_samples:
                    continue
                self.visited_samples.add(neighbor_i)
                reachable = self.direct_neighbours(neighbor_i)
                self.neighbors[neighbor_i] = reachable
                cluster.append(neighbor_i)
                if len(reachable) >= self.minimum_points:
                    # Core point: descend into its neighbourhood first.
                    pending.append(iter(reachable.tolist()))
                    break
                # Otherwise it is a border point: it joins the cluster but
                # is not expanded further.
            else:
                # Current neighbourhood exhausted; go back to the parent.
                pending.pop()

        return cluster

    def get_cluster_label(self):
        """
        :return: float tensor with one label per sample of ``self.X``; the
            label is the index of the cluster containing the sample, or
            ``len(self.clusters)`` for samples that belong to no cluster
        """
        noise_label = float(len(self.clusters))
        labels = torch.full((self.X.shape[0],), noise_label)
        for cluster_i, cluster in enumerate(self.clusters):
            members = torch.tensor(
                [int(i) for i in cluster], dtype=torch.long)
            labels[members] = float(cluster_i)

        return labels

    def predict(self, X):
        """
        :param X: input tensor of shape (n_samples, n_features)
        :return: float tensor of cluster labels, one per row of ``X``
        """
        self.X = X
        self.clusters = []
        # A set of plain ints gives O(1) membership tests.
        self.visited_samples = set()
        self.neighbors = {}
        n_samples = X.shape[0]

        for sample_i in range(n_samples):
            if sample_i in self.visited_samples:
                continue
            self.neighbors[sample_i] = self.direct_neighbours(sample_i)
            if len(self.neighbors[sample_i]) >= self.minimum_points:
                # Only core points seed a cluster; non-core points stay
                # unvisited so a later cluster may still claim them.
                self.visited_samples.add(sample_i)
                new_cluster = self.density_neighbors(
                    sample_i, self.neighbors[sample_i])
                self.clusters.append(new_cluster)

        cluster_labels = self.get_cluster_label()
        return cluster_labels
107+
108+
if __name__ == '__main__':
    # Demo run: cluster the training split of the Iris data set and print
    # how the cluster labels compare with the true class labels.
    # NOTE(review): train_test_split receives no random_state, so the split
    # is presumably not fixed by torch.manual_seed -- confirm.
    torch.manual_seed(0)
    dataset = load_iris()
    features = torch.tensor(dataset.data, dtype=torch.float)
    targets = torch.tensor(dataset.target)
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.3)
    model = DBScan(eps=0.25, min_points=20)
    ypred = model.predict(x_train)
    score = accuracy_score(y_train, ypred)
    print(f'Accuracy Score: {score}')

0 commit comments

Comments
 (0)