-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathplot_varying_epsilon.py
136 lines (112 loc) · 4.67 KB
/
plot_varying_epsilon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import openml
import pandas as pd
import seaborn as sns
from diffprivlib.models import DecisionTreeClassifier as DiffPrivLibTree
from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state
from tqdm import tqdm
from privatree.bdpt import BDPTClassifier
from privatree.dpgdf import DPGDTClassifier
from privatree.privatree import PrivaTreeClassifier
def plot_figure(results_df, data_id, baseline):
results_df["method"] = results_df["method"].replace("DPGDT", "DPGDF")
# Make sure to not plot anything too far below the baseline (majority class prediction)
# because it will skew the axes by a lot
max_score = results_df.groupby("method").mean()["test accuracy"].max()
min_score = baseline
margin = 0.1 * (max_score - min_score)
_, ax = plt.subplots(figsize=(5.5, 3.5))
sns.lineplot(
x="epsilon", y="test accuracy", hue="method", marker="o", data=results_df
)
ax.axhline(y=baseline, color="gray", ls="dashed", label="majority class")
plt.legend(loc="lower center", bbox_to_anchor=(0.43, 1.0), ncols=3)
plt.xscale("log")
plt.xlabel("$\\epsilon$")
plt.ylim(min_score - margin, max_score + margin)
plt.tight_layout()
plt.savefig(f"out/varying_epsilon_{data_id}.png", bbox_inches="tight")
plt.savefig(f"out/varying_epsilon_{data_id}.pdf", bbox_inches="tight")
plt.close()
max_depth = 4
test_size = 0.2
n_splits = 50
sns.set_theme(style="whitegrid", palette="colorblind")
assert len(sys.argv) == 2
data_id = int(sys.argv[1])
dataset = openml.datasets.get_dataset(data_id)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="array", target=dataset.default_target_attribute
)
# Drop rows with NaNs
keep_rows = ~np.any(np.isnan(X), axis=1)
X = X[keep_rows]
y = y[keep_rows]
counts = np.bincount(y, minlength=2)
baseline = max(counts[0] / counts.sum(), counts[1] / counts.sum())
if not os.path.isfile(f"out/varying_epsilon_{data_id}.csv"):
results = []
# for epsilon in np.logspace(-3, 1, 9, base=10):
random_state = check_random_state(0)
for epsilon in np.logspace(-3, 1, 5, base=10):
splitter = ShuffleSplit(
n_splits=n_splits, test_size=test_size, random_state=random_state
)
for train_i, test_i in tqdm(splitter.split(X, y), total=n_splits):
X_train = X[train_i]
y_train = y[train_i]
X_test = X[test_i]
y_test = y[test_i]
# Separate categorical features for DPGDF
X_train_cat = X_train[:, categorical_indicator]
X_test_cat = X_test[:, categorical_indicator]
tree = DecisionTreeClassifier(
max_depth=max_depth, random_state=random_state
)
tree.fit(X_train, y_train)
accuracy = tree.score(X_test, y_test)
results.append((epsilon, accuracy, "Decision tree"))
tree = DiffPrivLibTree(
max_depth=max_depth, epsilon=epsilon, random_state=random_state
)
tree.fit(X_train, y_train)
accuracy = tree.score(X_test, y_test)
results.append((epsilon, accuracy, "DiffPrivLib"))
tree = PrivaTreeClassifier(
max_depth=max_depth,
epsilon=epsilon,
categorical_features=categorical_indicator,
random_state=random_state,
)
tree.fit(X_train, y_train)
accuracy = tree.score(X_test, y_test)
results.append((epsilon, accuracy, "PrivaTree (ours)"))
tree = BDPTClassifier(
max_depth=max_depth, epsilon=epsilon, random_state=random_state
)
tree.fit(X_train, y_train)
accuracy = tree.score(X_test, y_test)
results.append((epsilon, accuracy, "BDPT"))
if X_train_cat.shape[1] > 0:
tree = DPGDTClassifier(
max_depth=max_depth,
epsilon=epsilon,
categorical_features=True,
random_state=random_state,
)
tree.fit(X_train_cat, y_train)
accuracy = tree.score(X_test_cat, y_test)
results.append((epsilon, accuracy, "DPGDT"))
results_df = pd.DataFrame(
results, columns=["epsilon", "test accuracy", "method"]
)
results_df.to_csv(f"out/varying_epsilon_{data_id}.csv")
plot_figure(results_df, data_id, baseline)
else:
results_df = pd.read_csv(f"out/varying_epsilon_{data_id}.csv")
plot_figure(results_df, data_id, baseline)