forked from jasonkli/cs231nProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_data.py
More file actions
executable file
·120 lines (108 loc) · 3.18 KB
/
extract_data.py
File metadata and controls
executable file
·120 lines (108 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from os import listdir
from os.path import isdir, join
from PIL import Image
import numpy as np
import os
import csv
# Sub-paths appended (by plain string concatenation) to a dataset directory
# name to locate its frame images and its atypia score CSV files.
FRAMES_PATH = "/frames/x40/"
ATYPIA_PATH = "/atypia/x40/"

# Bounding box (width, height) in pixels handed to PIL's Image.thumbnail().
SIZE = (36, 36)
def _majority_score(votes):
    """Return the most frequent entry of *votes* as an int.

    Iterates the list itself (not a set), so a tie is always resolved in
    favour of the value that appears first instead of depending on string
    hash ordering.
    """
    return int(max(votes, key=votes.count))


def imageToVector(directory):
    """Load the frames of one dataset directory together with their scores.

    Frame images are listed from ``directory + FRAMES_PATH`` and score CSV
    files from ``directory + ATYPIA_PATH``. Both listings are sorted and then
    paired by position, so the i-th frame is matched with the i-th score file.
    NOTE(review): this presumes matching frame/score files sort into the same
    order (e.g. share a file-name stem) - confirm against the dataset layout.

    For each pair the score is the most frequent value among columns 1-3 of
    the CSV's second row. A sample is kept only when the first CSV row has
    more than one column, the second row exists and has at least one vote,
    and the resulting score is below 3. Kept images are shrunk to fit within
    ``SIZE`` and then cropped on the width axis to ``height`` columns.

    Args:
        directory: Name/path of the dataset directory, as a string.

    Returns:
        A ``(data, scores)`` tuple of NumPy arrays with one entry per kept
        sample. ``data`` stacks the image arrays (the caller in this file
        unpacks it as a 4-D array); ``scores`` holds the integer scores.

    Raises:
        IndexError: If there are fewer score files than frame files.
        ValueError: If a vote cannot be parsed as an integer.
    """
    # Plain concatenation is deliberate: both constants start with "/", and
    # os.path.join() would discard `directory` in front of an absolute part.
    frames = directory + FRAMES_PATH
    atypia = directory + ATYPIA_PATH

    # os.listdir() returns entries in arbitrary order; sort both listings so
    # that the positional pairing below is deterministic.
    frames_files = sorted(join(frames, f) for f in listdir(frames))
    atypia_files = sorted(join(atypia, f) for f in listdir(atypia))

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    # Fall back to the old name only on Pillow versions that lack LANCZOS.
    if hasattr(Image, "LANCZOS"):
        resample = Image.LANCZOS
    else:
        resample = Image.ANTIALIAS

    data = []
    scores = []
    for i, frame_path in enumerate(frames_files):
        with open(atypia_files[i], 'r', newline='') as f:
            lines = list(csv.reader(f, delimiter=','))

        # Skip files that do not have the expected two-row layout instead of
        # failing with an IndexError on lines[0] / lines[1].
        if len(lines) < 2 or len(lines[0]) <= 1:
            continue
        votes = lines[1][1:4]
        if not votes:
            continue

        score = _majority_score(votes)
        if score >= 3:
            continue

        # The context manager closes the underlying image file once the
        # pixels have been copied into the array.
        with Image.open(frame_path) as img:
            img.thumbnail(SIZE, resample)  # in place, keeps aspect ratio
            arr = np.array(img)
        # Keep only the first `height` columns so width never exceeds height.
        arr = arr[:, 0:arr.shape[0], :]

        scores.append(score)
        data.append(arr)

    return np.array(data), np.array(scores)
def _split_train_val(X, y, order, cutoff):
    """Reorder *X* and *y* by *order* and split both at *cutoff*.

    Returns ``(X_train, y_train, X_val, y_val)`` where the train parts are the
    first *cutoff* reordered samples and the val parts are the remainder.
    """
    X = X[order]
    y = y[order]
    return X[:cutoff], y[:cutoff], X[cutoff:], y[cutoff:]


def main():
    """Build shuffled train/val ``.npy`` files from the directories in the cwd.

    Every sub-directory of the current working directory whose name does not
    contain ``'git'`` is loaded with ``imageToVector``. Samples are collected
    into three sets: directories whose name contains ``"A"`` (``*_a``), all
    other directories (``*_h``), and everything combined. Images are moved to
    channel-first layout and divided by 255; labels are the scores minus 1.
    Each set is shuffled, split 70/30 into train and val parts, and written
    to the current directory with ``np.save``.

    Raises:
        ValueError: If one of the three sets ends up with no samples
            (``np.concatenate`` refuses an empty list).
    """
    directories = [f for f in listdir(os.getcwd()) if isdir(f) and 'git' not in f]

    X_a, y_a = [], []
    X_h, y_h = [], []
    X, y = [], []

    # Extract data from files into X and y
    for directory in directories:
        data, scores = imageToVector(directory)
        if len(scores) == 0:
            # Nothing was kept for this directory; `data` is then not 4-D and
            # cannot be transposed, so there is nothing to add.
            continue

        # (N, H, W, C) -> (N, C, H, W). This has to be a transpose: a reshape
        # to the same target shape reinterprets the buffer and scrambles the
        # pixel values across channels and rows.
        data = data.transpose(0, 3, 1, 2)

        X.append(data)
        y.append(scores)
        if "A" in directory:
            X_a.append(data)
            y_a.append(scores)
        else:
            X_h.append(data)
            y_h.append(scores)

    # Concatenate the per-directory arrays directly. Wrapping the list in
    # np.array() first fails on recent NumPy when the directories contribute
    # different numbers of samples (ragged input).
    X = np.concatenate(X, axis=0) / 255.0
    y = np.concatenate(y, axis=0) - 1
    X_a = np.concatenate(X_a, axis=0) / 255.0
    y_a = np.concatenate(y_a, axis=0) - 1
    X_h = np.concatenate(X_h, axis=0) / 255.0
    y_h = np.concatenate(y_h, axis=0) - 1

    print(np.count_nonzero(y_a == 0))
    print(np.count_nonzero(y_a == 1))
    length = X_a.shape[0]
    length_h = X_h.shape[0]
    print(X_a[0].shape)
    print(len(X_a), len(X_h))

    # Randomize
    index = np.random.permutation(length)
    if length_h == length:
        # Same permutation for both sets, so sample i of the "a" set and
        # sample i of the "h" set stay aligned after shuffling.
        index_h = index
    else:
        # Sizes differ: indexing X_h with `index` would either raise or
        # silently drop samples, so shuffle it on its own length.
        index_h = np.random.permutation(length_h)
    index_full = np.random.permutation(X.shape[0])

    # Split into train and val sets (70% / 30%)
    cutoff = int(length * .7)
    cutoff_h = int(length_h * .7)
    X_a_train, y_a_train, X_a_val, y_a_val = _split_train_val(
        X_a, y_a, index, cutoff)
    X_h_train, y_h_train, X_h_val, y_h_val = _split_train_val(
        X_h, y_h, index_h, cutoff_h)
    # Equals 2 * cutoff when both sets have the same size.
    X_train, y_train, X_val, y_val = _split_train_val(
        X, y, index_full, cutoff + cutoff_h)
    print(len(X_train), len(X_val))

    # Save files
    outputs = (
        ('X_a_train.npy', X_a_train),
        ('y_a_train.npy', y_a_train),
        ('X_a_val.npy', X_a_val),
        ('y_a_val.npy', y_a_val),
        ('X_h_train.npy', X_h_train),
        ('y_h_train.npy', y_h_train),
        ('X_h_val.npy', X_h_val),
        ('y_h_val.npy', y_h_val),
        ('X_train.npy', X_train),
        ('y_train.npy', y_train),
        ('X_val.npy', X_val),
        ('y_val.npy', y_val),
    )
    for filename, array in outputs:
        np.save(filename, array)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()