-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkumeleme.py
78 lines (65 loc) · 2.59 KB
/
kumeleme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 22 15:42:13 2024
@author: Doç. Dr. Selçuk KIRAN
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
# NOTE(review): presumably limits OpenMP to a single thread to avoid the
# memory-leak warning scikit-learn's KMeans emits on Windows with MKL.
# It is set before sklearn is imported further down - confirm it is still needed.
os.environ["OMP_NUM_THREADS"]="1"
# Read the 'Sheet4' worksheet. The context manager closes the workbook even
# if read_excel raises, which a manual close() call does not guarantee.
with pd.ExcelFile('netflix.xlsx') as xls:
    df1 = pd.read_excel(xls, 'Sheet4')
# Keep columns 0, 1, 2 and 7 as a plain NumPy array. The outlier report that
# follows labels column 1 "Dizi" (TV series) and column 2 "Film" (movies).
data = df1.iloc[:, [0, 1, 2, 7]].values
# Print the 1.5 * IQR outlier boundaries (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for the
# TV-series column (index 1) and then for the movie column (index 2).
# A blank line separates the second report from the first.
for section_no, (heading, column) in enumerate((
    ("Dizi aykırı değer sınırları", 1),
    ("Film aykırı değer sınırları", 2),
)):
    # q holds min, Q1, median, Q3, max; only Q1 (q[1]) and Q3 (q[3]) are used.
    q = np.quantile(data[:, column], [0, 0.25, 0.5, 0.75, 1])
    iqr = q[3] - q[1]
    lower_boundary = q[1] - 1.5 * iqr
    upper_boundary = q[3] + 1.5 * iqr
    if section_no > 0:
        print("")
    print(heading)
    print("---------------------------")
    print("Alt sınır: ", lower_boundary)
    print("Üst sınır: ", upper_boundary)
#cluster_data = data[data[:,3]==0]
#cluster_data=cluster_data[:,[1,2]]
from sklearn.cluster import KMeans
# Read the 'Sheet5' worksheet used for clustering. The context manager closes
# the workbook even if read_excel raises, which a manual close() call does
# not guarantee.
with pd.ExcelFile('netflix.xlsx') as xls:
    df1 = pd.read_excel(xls, 'Sheet5')
# Columns 1 and 2 are the two features fed to K-Means; the scatter plot at the
# end of the script labels them 'TV Dizileri' and 'Filmler' respectively.
cluster_data = df1.iloc[:, [1, 2]].values
# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squares) so a cluster count can be read off the plot.
wcss = []
kume_sayisi_listesi = range(1, 11)
for i in kume_sayisi_listesi:
    # The three statements below form the loop body and must be indented;
    # at column 0 the script fails with an IndentationError.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(cluster_data)
    wcss.append(kmeans.inertia_)
plt.plot(kume_sayisi_listesi, wcss)
plt.title('Küme Sayısı Belirlemek için Dirsek Yöntemi')
plt.xlabel('Küme Sayısı')
plt.ylabel('WCSS')
plt.show()
# Final model: cluster the two feature columns into 5 groups.
kmeans = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(cluster_data)
# Append the feature columns and the cluster labels to the sheet's own columns
# in a single stack (same layout as stacking them one after the other).
temp = np.column_stack((df1, cluster_data, kmeans.labels_))
# The labels are always the LAST stacked column. Selecting it with -1 instead
# of a hard-coded index of 8 keeps the export correct whatever the number of
# columns in Sheet5 (8 is only right when the sheet has exactly six).
result = pd.DataFrame(temp[:, [0, -1]])
# Column 0 of the sheet is exported under the header "Ülke" (country),
# the cluster label under "Küme" (cluster).
result.to_excel(excel_writer="result.xlsx", header=["Ülke", "Küme"])
# Draw one scatter series per cluster label (0..4), each with its own colour
# and legend entry; x is feature column 0, y is feature column 1.
cluster_styles = (
    ('red', 'Küme 1'),
    ('blue', 'Küme 2'),
    ('green', 'Küme 3'),
    ('cyan', 'Küme 4'),
    ('yellow', 'Küme 5'),
)
for cluster_no, (colour, caption) in enumerate(cluster_styles):
    # Rows of cluster_data assigned to this cluster by the fitted model.
    members = cluster_data[y_kmeans == cluster_no]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=caption)
plt.title('Netflix Filmler vs TV Dizileri')
plt.xlabel('TV Dizileri')
plt.ylabel('Filmler')
plt.legend()
plt.show()