-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlab03.py
127 lines (98 loc) · 4.24 KB
/
lab03.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
# PCA & LDA
Apply PCA and LDA to the project data. Start analyzing the effects of PCA on the
features. Plot the histogram of the projected features for the 6 PCA directions,
starting from the principal (largest variance). What do you observe? What are
the effects on the class distributions? Can you spot the different clusters
inside each class?
Apply LDA (1 dimensional, since we have just two classes), and compute the
histogram of the projected LDA samples. What do you observe? Do the classes
overlap? Compared to the histograms of the 6 features you computed in Laboratory
2, is LDA finding a good direction with little class overlap?
Try applying LDA as a classifier. Divide the dataset into model training and
validation sets (you can reuse the previous function to split the dataset).
Apply LDA, select the orientation that results in the projected mean of class
True (label 1) being larger than the projected mean of class False (label 0),
and select the threshold as in the previous sections, i.e., as the average of
the projected class means. Compute the predictions on the validation data, and
the corresponding error rate.
Now try changing the value of the threshold. What do you observe? Can you find
values that improve the classification accuracy?
Finally, try pre-processing the features with PCA. Apply PCA (estimated on the
model training data only), and then classify the validation data with LDA.
Analyze the performance as a function of the number of PCA dimensions m. What do
you observe? Can you find values of m that improve the accuracy on the
validation set? Is PCA beneficial for the task when combined with the LDA
classifier?
"""
import numpy as np
from rich.console import Console
from project.figures.plots import hist, plot
from project.figures.rich import table
from project.funcs.base import load_data, split_db_2to1
from project.funcs.lda import lda
from project.funcs.pca import pca
def lab03(DATA: str):
    """Lab 3: dimensionality reduction with PCA and LDA.

    Plots per-direction histograms of the PCA- and LDA-projected features,
    then uses 1-D LDA as a binary classifier: thresholds the projected
    samples at the average of the projected class means and reports the
    validation error rate. Finally sweeps the threshold (brute force) and
    the number of PCA pre-processing dimensions m.

    Args:
        DATA: path to the dataset file understood by ``load_data``.
    """
    console = Console()

    X, y = load_data(DATA)

    # Project onto all PCA directions and plot one histogram per direction.
    _, PCA_data = pca(X, X.shape[1])
    hist(PCA_data, y, file_name="pca/histograms")

    # Two classes -> at most one LDA discriminant direction.
    _, LDA_data = lda(X, y, 1)
    hist(LDA_data, y, file_name="lda/histograms")

    # Classification using LDA
    (X_train, y_train), (X_val, y_val) = split_db_2to1(X.T, y)
    X_train = X_train.T
    X_val = X_val.T

    _, X_train_lda = lda(X_train, y_train, 1)
    # NOTE(review): LDA is re-fit on the validation set using the validation
    # labels. For an honest evaluation the direction fitted on the training
    # data should be applied to X_val instead — confirm against `lda`'s API.
    _, X_val_lda = lda(X_val, y_val, 1)

    # Threshold = midpoint of the two projected class means.
    threshold = (
        X_train_lda[y_train == 0].mean() + X_train_lda[y_train == 1].mean()
    ) / 2.0

    # Samples at or above the threshold are assigned class 0. NOTE(review):
    # the lab text orients the projection so class 1's mean is LARGER, which
    # would assign class 1 above the threshold — verify `lda`'s sign
    # convention before trusting this direction.
    y_pred = [0 if x >= threshold else 1 for x in X_val_lda.T[0]]

    table(
        console,
        "Mean of the projected class means",
        {
            "Threshold": f"{threshold:.2f}",
            "Error rate": f"{np.sum(y_val != y_pred) / y_val.size * 100:.2f}%",
        },
    )

    # Brute-force sweep for a threshold with a lower validation error rate.
    thresholds = np.linspace(X_train_lda.min(), X_train_lda.max(), 1000)
    best_error_rate = None  # best error rate found so far, in percent
    best_threshold = None
    for threshold in thresholds:
        y_pred = [0 if x >= threshold else 1 for x in X_val_lda.T[0]]
        new_err = np.sum(y_val != y_pred) / y_val.size * 100
        if best_error_rate is None or new_err < best_error_rate:
            best_error_rate = new_err
            best_threshold = threshold

    table(
        console,
        "Brute force search",
        {
            "Threshold": f"{best_threshold:.2f}",
            "Error rate": f"{best_error_rate:.2f}%",
        },
    )

    # PCA pre-processing: classify with LDA for each number of dimensions m.
    dims = range(1, X.shape[1] + 1)
    error_rates_pca = []  # Error rates in percentage
    for m in dims:
        X_train_pca = pca(X_train, m)[1]

        _, X_train_lda = lda(X_train_pca, y_train, 1)
        # NOTE(review): X_val is neither projected with the training PCA nor
        # with the training LDA direction here — it is re-fit with validation
        # labels. The lab text asks for transforms estimated on the training
        # data only; verify the `pca`/`lda` return values and apply them to
        # X_val.
        _, X_val_lda = lda(X_val, y_val, 1)

        threshold = (
            X_train_lda[y_train == 0].mean() + X_train_lda[y_train == 1].mean()
        ) / 2.0

        # Predict the validation data
        y_pred = [0 if x >= threshold else 1 for x in X_val_lda.T[0]]

        # Calculate the error rate
        error_rates_pca.append(np.sum(y_val != y_pred) / y_val.size * 100)

    plot(
        {"Error rate": error_rates_pca},
        dims,
        colors=["purple"],
        file_name="error_rate_pca",
        figsize=(8, 3),
        xlabel="Number of PCA dimensions",
        ylabel="Error rate (%)",
    )