-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans_all_participant_2axis_summed_nodes_unnormalized.py
182 lines (143 loc) · 6.96 KB
/
kmeans_all_participant_2axis_summed_nodes_unnormalized.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# grab data
import csv
import matplotlib
from DataSet import DataSet #custom class
from OptimalClusterFinder import OptimalClusterFinder #custom class
import numpy as np #scikit-learn requires this
import itertools
#kmeans
from sklearn.cluster import KMeans #sci-kit learn
import matplotlib.pyplot as plt # plotting
from mpl_toolkits.mplot3d import Axes3D #3D MatPlotLib - if you have matplotlib, you have this
from sklearn.metrics import silhouette_samples
#benchmark tutorial
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
#save
import os
#debugging
from pprint import pprint
def load_csv(file_name:str, directory:str="Data/")->list:
    """Load a CSV file and return its rows.

    :param file_name: Name of the CSV file inside ``directory``.
    :param directory: Directory where the file is stored (trailing separator optional).
    :returns: CSV data as a list containing one list of strings per row. Each row represents a group.
    :raises OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Honor the caller-supplied directory; previously the path was always "Data/".
    file_path = os.path.join(directory, file_name)
    # newline="" is what the csv module expects so it can handle line endings
    # inside quoted fields itself.
    with open(file_path, 'r', newline="") as file:
        return list(csv.reader(file))
def clean_compeletion_csv(data:list)->list:
    """Clean up the time/accuracy CSV for convenient use.

    The first row of ``data`` is skipped. For every remaining row, column 1 must
    hold exactly three whitespace-separated tokens whose last token is a clock
    value ``H:M:S``, and column 2 must hold a number.

    :param data: List of lists from load_csv(file_name) where each row is a group.
    :returns: List of tuples (time in seconds: float, value of column 2: float), one per row.
    :raises ValueError: If column 1 does not split into three tokens, the clock
        value is not ``H:M:S``, or a field is not numeric.
    """
    clean_data=[]
    for row in data[1:]:
        # Only the third token (the clock value) is needed; the first two are discarded.
        _, _, clock = row[1].split()
        hours, minutes, seconds = map(float, clock.split(':'))
        # Include the hours component so durations of an hour or more are not truncated.
        in_seconds = hours * 3600 + minutes * 60 + seconds
        clean_data.append((in_seconds, float(row[2])))
    return clean_data
#k-means documentation: https://scikit-learn.org/1.5/modules/generated/sklearn.cluster.KMeans.html
#k-means tutorial: https://scikit-learn.org/1.5/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py
def get_names(num_groups)->list:
    """Build the point labels used to annotate the matplotlib charts.

    :param num_groups: Number of groups; each group contributes four labels.
    :returns: list of strings of the form '<group number><letter>', ordered by
        group (starting at 1) and then by person letter a, b, c, d.
    """
    person_letters = "abcd"
    return [
        f'{group_number}{letter}'
        for group_number in range(1, num_groups + 1)
        for letter in person_letters
    ]
def main():
    """Run 2-feature K-means on every pair of graph types and plot the clusters.

    Loads the proximity, conversation and shared-attention data sets, and for
    each pair of them fills a (participants x 2) matrix with the per-participant
    values returned by ``get_sum_all_nodes()``. A KMeans model with
    ``num_clusters`` clusters is fitted on that matrix, the members of every
    cluster are printed together with their silhouette score, and one scatter
    subplot per pair is drawn. The combined figure is saved as a PNG under
    ``<cwd>/Graphs/All_Participant/Summed_Nodes/`` and then shown.
    """
    num_groups=11
    total_participants=num_groups*4  # four labels per group, matching get_names()
    directory="/Graphs/All_Participant/Summed_Nodes/"
    graph_name="new_unnormalized_summed_nodes_of_for_2features"

    # Load JSON data - a file name is required; DataSet can also take another
    # folder relative to the location of the file that calls it.
    prox_data=DataSet("proximity_graphs.json", my_directed=False)
    convo_data=DataSet("conversation_graphs.json", my_directed=True)
    atten_data=DataSet("shared_attention_graphs.json", my_directed=False)

    fig = plt.figure(figsize=(15, 5))
    data_sets=2
    num_clusters = 3
    data_features=np.zeros((total_participants,data_sets))

    # Overhead so every feature pair can be handled in one loop.
    features=[prox_data,convo_data, atten_data]
    axises=[(0,1),(0,2),(1,2)]  # index pairs into `features` / `axis_name`
    axis_name=["Proximity", "Talking", "Attention"]
    convo_index=1  # position of the conversation data set in `features`
    x_axis=0  # column of the data matrix plotted on the x axis
    y_axis=1  # column of the data matrix plotted on the y axis
    # Derive the labels from num_groups so they cannot drift from total_participants.
    name_labels=get_names(num_groups=num_groups)

    # Doing 2 Feature Kmeans for 3 combos of graphs
    for index, axis in enumerate(axises):
        for column, feature_index in enumerate(axis):
            # Fetch the sums once per column instead of once per participant.
            # NOTE(review): assumes get_sum_all_nodes() is a pure getter - confirm.
            node_sums = features[feature_index].get_sum_all_nodes()
            for i in range(total_participants):
                if feature_index == convo_index:
                    # Conversation values are divided by 3 (scaling kept from the
                    # original script; the rationale is not visible here).
                    data_features[i][column]=node_sums[i]/3
                else:
                    data_features[i][column]=node_sums[i]

        ###### Make K Means Model and Extract Features ##########
        data=data_features

        # Determine the number of clusters.
        # Leave this commented out to show the final graph; you can view the
        # silhouette scores OR the final graph.
        # this_graph_name=str(graph_name + "_" + axis_name[axis[0]] + "_" + axis_name[axis[1]])
        # finder = OptimalClusterFinder(data=data, max_clusters=10, graph_name=this_graph_name,directory=directory)
        # finder.find_optimal_clusters()
        # optimal_clusters = finder.get_optimal_clusters()
        # print(f"")
        # finder.plot_combined_metrics()

        # Create KMeans model and fit the data (fixed seed for reproducible clusters).
        kmeans = KMeans(n_clusters=num_clusters, random_state=21)
        kmeans.fit(data)

        # After the model is made, get the cluster centroids and labels.
        centroids = kmeans.cluster_centers_  # center point of each cluster
        labels = kmeans.labels_  # cluster index assigned to each participant

        # Silhouette score for each participant.
        silhouette_scores = silhouette_samples(data, labels)

        # Group "<participant label><score>" strings by assigned cluster.
        roles=[[] for _ in range(num_clusters)]
        for i, (label, score) in enumerate(zip(labels, silhouette_scores)):
            roles[label].append(f'{(i//4)+1}{chr(97+(i%4))}{score:.2f}')

        # Print the roles defined by the clusters, one column per cluster.
        print_name=graph_name + ":\n"+ axis_name[axis[0]] +" & "+axis_name[axis[1]]
        print(print_name)
        # Build the header and rows from num_clusters rather than assuming three.
        print("\t".join(f'Role {k + 1}' for k in range(num_clusters)))
        print(22 * "_")
        # zip_longest pads shorter clusters with None, which prints as "None".
        for element in itertools.zip_longest(*roles):
            print("\t".join(str(member) for member in element))
        print("\n")

        ######## Plotting 2 feature of 3 feature Graph ########
        # One row of subplots, one column per feature pair.
        ax1 = fig.add_subplot(1, len(axises), index + 1)

        # Scatter plot, one colour per cluster.
        for i in range(num_clusters):
            ax1.scatter(data[labels == i, x_axis], data[labels == i, y_axis], label=f'Cluster {i + 1}')

        # Label each point with its participant name.
        for i in range(total_participants):
            ax1.text(data[i, x_axis], data[i, y_axis], name_labels[i])

        # Plot centroids - center dots for clusters.
        ax1.scatter(centroids[:, x_axis], centroids[:, y_axis], s=350, c='red', marker='X', label='Centroids')

        ax1.set_title(f'Kmeans for 2 Feature - {axis_name[axis[0]]} and {axis_name[axis[1]]}')
        ax1.set_xlabel(f'{axis_name[axis[0]]}')
        ax1.set_ylabel(f'{axis_name[axis[1]]}')
        ax1.legend()

    fig.suptitle("Unnormalized 2 Feature Kmeans Cluster")

    # `directory` starts with "/", so plain concatenation (not os.path.join)
    # is needed to keep the output below the current working directory.
    output_dir=os.getcwd() + directory
    # Create the output folder if needed so savefig does not fail after all the work.
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_dir + graph_name + ".png")
    plt.show()
# Run the clustering script only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()