DBSCAN_Version_2
"""
Created on Sat Jan 25 16:36:46 2020
@author: Amrit Raj
@Acknowledgment: Arpit, Tarun, and Ankit
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_excel('Final Data.xlsx')
#check = dataset.iloc[:, 13].values
x = dataset.iloc[:, 2:12].values
print(x)

#Getting rid of categorical data: each block one-hot encodes a column with
#pd.get_dummies, appends the dummy columns, and deletes the original column.
#(Uncomment x = pd.DataFrame(x) after any step to view x as a data frame.)

#Region...
category_hot_encoded = pd.get_dummies(x[:, 6])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 6, 1)  # drop the original Region column, now replaced by its dummies

#Gender...
category_hot_encoded = pd.get_dummies(x[:, 3])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 3, 1)  # drop the original Gender column, now replaced by its dummies
#Remove the Female dummy column to avoid the dummy variable trap
x = np.delete(x, 13, 1)

#Goals...
category_hot_encoded = pd.get_dummies(x[:, 1])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 1, 1)  # drop the original Goals column, now replaced by its dummies

#Branch...
category_hot_encoded = pd.get_dummies(x[:, 1])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 1, 1)  # drop the original Branch column, now replaced by its dummies

#Next categorical column (index 5)...
category_hot_encoded = pd.get_dummies(x[:, 5])
x = np.append(x, category_hot_encoded, axis=1)
print(x)
x = np.delete(x, 5, 1)

#Drop three more leftover columns from the encoding steps
x = np.delete(x, 3, 1)
x = np.delete(x, 3, 1)
x = np.delete(x, 1, 1)

x_copy = x  # keep a reference to the NumPy array before x is rebound to a DataFrame
x = pd.DataFrame(x)
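
# A more compact alternative (a sketch, not part of the pipeline above): apply
# pd.get_dummies to the whole feature frame so every categorical column is
# encoded in one call, with drop_first=True dropping one dummy per column to
# avoid the dummy variable trap. This assumes the categorical columns arrive
# from read_excel as object/category dtype; x_alt is an illustrative name.
x_alt = pd.get_dummies(dataset.iloc[:, 2:12], drop_first=True)
print(x_alt.head())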


"""
'''Feature Scaling"'''

##Scholar ID
y=dataset.iloc[:, 1:2].values
#type(y[1][1])
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
y = sc_X.fit_transform(y)
x=np.append(x, y, axis=1)
x=pd.DataFrame(x)
x=x.drop(labels=0, axis=1)


##Expenditure
y=dataset.iloc[:, 3:4].values
#type(y[1][1])
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
y = sc_X.fit_transform(y)
x=np.append(x, y, axis=1)
x=pd.DataFrame(x)
x=x.drop(labels=0, axis=1)

'''Scaling Done'''
"""
"""
import statsmodels.formula.api as sm
reg_OLS=sm.OLS(endog=y_train_cluster, exog=x_train_cluster).fit();
reg_OLS.summary()

'''Dimensionality Reduction'''
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
x = pca.fit_transform(x)
explained_variance = pca.explained_variance_ratio_


#Splitting the data set into train and test set.................................
"""
'''Dimensionality Reduction'''
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x = pca.fit_transform(x)
explained_variance = pca.explained_variance_ratio_
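
# Quick check (an addition): how much variance do the two retained components
# actually explain? A low total means the 2-D projection loses a lot of signal.
print('PCA explained variance ratio:', explained_variance,
      '(total: %.3f)' % explained_variance.sum())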

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
x_train_cluster, x_test_cluster = train_test_split(x, test_size=0.2, random_state=1)


#x_train_cluster = x[50:325, :]
#x_test_cluster = x[0:50, :]



'''Model'''
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# #############################################################################
# Generate sample data
# Note: this overwrites the PCA-projected training split from above with
# synthetic 2-D blobs; the DBSCAN demo below runs on these blobs.
centers = [[1, 1], [-1, -1], [1, -1]]
x_train_cluster, labels_true = make_blobs(n_samples=270, centers=centers,
                                          cluster_std=0.3, random_state=0)


# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(x_train_cluster)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
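
# Hedged sketch (an addition): eps=0.3 above is hard-coded. A common way to
# choose eps is the k-distance "elbow": sort each point's distance to its
# min_samples-th neighbour and look for the knee. nn/k_dist are illustrative
# names, not from the original.
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=10).fit(x_train_cluster)
distances, _ = nn.kneighbors(x_train_cluster)
k_dist = np.sort(distances[:, -1])  # distance to the 10th neighbour (self included as the first), ascending
plt.plot(k_dist)
plt.xlabel('points (sorted)')
plt.ylabel('10-NN distance')
plt.title('k-distance plot for choosing eps')
plt.show()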

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(x_train_cluster, labels))

# #############################################################################
# Plot result

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples: large markers.
    xy = x_train_cluster[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Border and noise samples: small markers.
    xy = x_train_cluster[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

#Now we use the DBSCAN cluster labels as targets for a supervised classifier
#(logistic regression below; a random forest variant is kept commented out).

y_labels = labels

"""
#Splitting the data set into train and test set.................................

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x, y_labels, test_size=0.2, random_state=0)
"""
y_train_cluster = y_labels
#y_test_cluster = y_labels[275:326, :]

#Applying Random Forest (alternative classifier, kept commented out)..........
"""
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=1, criterion='entropy', random_state=0)
classifier.fit(x_train_cluster, y_train_cluster)
"""

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train_cluster, y_train_cluster)
xx = pd.DataFrame(x_train_cluster[:, :1])  # first feature column, for inspection
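
# Hedged sketch (an addition): DBSCAN marks noise points with label -1, and
# the fit above learns -1 as just another class. If noise should be excluded
# instead, train on clustered points only; mask/classifier_no_noise are
# illustrative names.
mask = y_train_cluster != -1
classifier_no_noise = LogisticRegression(random_state=0)
classifier_no_noise.fit(x_train_cluster[mask], y_train_cluster[mask])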

#The test-set prediction happens after the synthetic test set is generated
#below, so that y_pred and y_test_cluster describe the same samples.



#Obtaining reference cluster labels for the test points by running DBSCAN on them.........



# #############################################################################
# Generate sample data
# A second synthetic blob sample stands in for the test set here as well.
centers = [[1, 1], [-1, -1], [1, -1]]
x_test_cluster, labels_true = make_blobs(n_samples=64, centers=centers,
                                         cluster_std=0.3, random_state=0)


# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.45, min_samples=10).fit(x_test_cluster)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(x_test_cluster, labels))

# #############################################################################
# Plot result

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples: large markers.
    xy = x_test_cluster[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Border and noise samples: small markers.
    xy = x_test_cluster[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()


y_test_cluster = labels

#Predicting the group of the test set.....
y_pred = classifier.predict(x_test_cluster)



#Making the confusion matrix...
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_cluster, y_pred)

#Statistics for the results of the logistic regression classifier...
from sklearn.metrics import classification_report
print(classification_report(y_test_cluster, y_pred))
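
# Optional sketch (an addition): label the confusion matrix axes for easier
# reading; all_labels/cm_df are illustrative names. By default confusion_matrix
# orders classes as the sorted union of the true and predicted labels.
all_labels = sorted(set(y_test_cluster) | set(y_pred))
cm_df = pd.DataFrame(cm,
                     index=['true %d' % c for c in all_labels],
                     columns=['pred %d' % c for c in all_labels])
print(cm_df)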