DBSCAN_Version_2
"""
Created on Sat Jan 25 16:36:46 2020
@author: Amrit Raj
@Acknowledgment: Arpit, Tarun, and Ankit
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_excel('Final Data.xlsx')
#check = dataset.iloc[:, 13].values
x = dataset.iloc[:, 2:12].values
print(x)

#Getting rid of categorical data: each block one-hot encodes a column with
#pd.get_dummies, appends the dummy columns, and deletes the original column.
#(Uncomment x = pd.DataFrame(x) after any step to view x as a data frame.)

#Region...
category_hot_encoded = pd.get_dummies(x[:, 6])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 6, 1)  # drop the original Region column, now replaced by its dummies

#Gender...
category_hot_encoded = pd.get_dummies(x[:, 3])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 3, 1)  # drop the original Gender column, now replaced by its dummies
#Remove the Female dummy column to avoid the dummy variable trap
x = np.delete(x, 13, 1)

#Goals...
category_hot_encoded = pd.get_dummies(x[:, 1])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 1, 1)  # drop the original Goals column, now replaced by its dummies

#Branch...
category_hot_encoded = pd.get_dummies(x[:, 1])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 1, 1)  # drop the original Branch column, now replaced by its dummies

#Next categorical column (index 5)...
category_hot_encoded = pd.get_dummies(x[:, 5])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 5, 1)

#Drop three more leftover columns from the encoding steps
x = np.delete(x, 3, 1)
x = np.delete(x, 3, 1)
x = np.delete(x, 1, 1)

x_copy = x  # keep a reference to the NumPy array before x is rebound to a DataFrame
x = pd.DataFrame(x)
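
# A more compact alternative (a sketch, not part of the pipeline above): apply
# pd.get_dummies to the whole feature frame so every categorical column is
# encoded in one call, with drop_first=True dropping one dummy per column to
# avoid the dummy variable trap. This assumes the categorical columns arrive
# from read_excel as object/category dtype; x_alt is an illustrative name.
x_alt = pd.get_dummies(dataset.iloc[:, 2:12], drop_first=True)
print(x_alt.head())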


"""
'''Feature Scaling"'''

##Scholar ID
y=dataset.iloc[:, 1:2].values
#type(y[1][1])
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
y = sc_X.fit_transform(y)
x=np.append(x, y, axis=1)
x=pd.DataFrame(x)
x=x.drop(labels=0, axis=1)


##Expenditure
y=dataset.iloc[:, 3:4].values
#type(y[1][1])
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
y = sc_X.fit_transform(y)
x=np.append(x, y, axis=1)
x=pd.DataFrame(x)
x=x.drop(labels=0, axis=1)

'''Scaling Done'''
"""
"""
import statsmodels.formula.api as sm
reg_OLS=sm.OLS(endog=y_train_cluster, exog=x_train_cluster).fit();
reg_OLS.summary()

'''Dimensionality Reduction'''
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
x = pca.fit_transform(x)
explained_variance = pca.explained_variance_ratio_


#Splitting the data set into train and test set.................................
"""
'''Dimensionality Reduction'''
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x = pca.fit_transform(x)
explained_variance = pca.explained_variance_ratio_
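
# Quick check (an addition): how much variance do the two retained components
# actually explain? A low total means the 2-D projection loses a lot of signal.
print('PCA explained variance ratio:', explained_variance,
      '(total: %.3f)' % explained_variance.sum())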

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
x_train_cluster, x_test_cluster = train_test_split(x, test_size=0.2, random_state=1)


#x_train_cluster = x[50:325, :]
#x_test_cluster = x[0:50, :]



'''Model'''
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# #############################################################################
# Generate sample data
# Note: this overwrites the PCA-projected training split from above with
# synthetic 2-D blobs; the DBSCAN demo below runs on these blobs.
centers = [[1, 1], [-1, -1], [1, -1]]
x_train_cluster, labels_true = make_blobs(n_samples=270, centers=centers,
                                          cluster_std=0.3, random_state=0)


# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(x_train_cluster)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
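
# Hedged sketch (an addition): eps=0.3 above is hard-coded. A common way to
# choose eps is the k-distance "elbow": sort each point's distance to its
# min_samples-th neighbour and look for the knee. nn/k_dist are illustrative
# names, not from the original.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=10).fit(x_train_cluster)
distances, _ = nn.kneighbors(x_train_cluster)
k_dist = np.sort(distances[:, -1])  # distance to the 10th neighbour (self included as the first), ascending
plt.plot(k_dist)
plt.xlabel('points (sorted)')
plt.ylabel('10-NN distance')
plt.title('k-distance plot for choosing eps')
plt.show()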

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(x_train_cluster, labels))

# #############################################################################
# Plot result

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples: large markers.
    xy = x_train_cluster[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Border and noise samples: small markers.
    xy = x_train_cluster[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

#Now we use the DBSCAN cluster labels as targets for a supervised classifier
#(logistic regression below; a random forest variant is kept commented out).

y_labels = labels

"""
#Splitting the data set into train and test set.................................

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x, y_labels, test_size=0.2, random_state=0)
"""
y_train_cluster = y_labels
#y_test_cluster = y_labels[275:326, :]

#Applying Random Forest (alternative classifier, kept commented out)..........
"""
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1, criterion='entropy', random_state=0)
classifier.fit(x_train_cluster, y_train_cluster)
"""

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train_cluster, y_train_cluster)
xx = pd.DataFrame(x_train_cluster[:, :1])  # first feature column, for inspection
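
# Hedged sketch (an addition): DBSCAN marks noise points with label -1, and
# the fit above learns -1 as just another class. If noise should be excluded
# instead, train on clustered points only; mask/classifier_no_noise are
# illustrative names.
mask = y_train_cluster != -1
classifier_no_noise = LogisticRegression(random_state=0)
classifier_no_noise.fit(x_train_cluster[mask], y_train_cluster[mask])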

#The test-set prediction happens after the synthetic test set is generated
#below, so that y_pred and y_test_cluster describe the same samples.



#Obtaining reference cluster labels for the test points by running DBSCAN on them.........



# #############################################################################
# Generate sample data
# A second synthetic blob sample stands in for the test set here as well.
centers = [[1, 1], [-1, -1], [1, -1]]
x_test_cluster, labels_true = make_blobs(n_samples=64, centers=centers,
                                         cluster_std=0.3, random_state=0)


# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.45, min_samples=10).fit(x_test_cluster)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(x_test_cluster, labels))

# #############################################################################
# Plot result

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples: large markers.
    xy = x_test_cluster[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Border and noise samples: small markers.
    xy = x_test_cluster[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()


y_test_cluster = labels

#Predicting the group of the test set.....
y_pred = classifier.predict(x_test_cluster)



#Making the confusion matrix...
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_cluster, y_pred)

#Statistics for the results of the logistic regression classifier...
from sklearn.metrics import classification_report
print(classification_report(y_test_cluster, y_pred))
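
# Optional sketch (an addition): label the confusion matrix axes for easier
# reading; all_labels/cm_df are illustrative names. By default confusion_matrix
# orders classes as the sorted union of the true and predicted labels.
all_labels = sorted(set(y_test_cluster) | set(y_pred))
cm_df = pd.DataFrame(cm,
                     index=['true %d' % c for c in all_labels],
                     columns=['pred %d' % c for c in all_labels])
print(cm_df)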