stochastic_counterfactuals.py
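"""
Counterfactual generators based on random sampling and stochastic optimization.

- apply_counterfactual: applies a counterfactual delta (value replacements for discrete
  attributes, offsets for continuous ones) to an instance.
- get_random_counterfactual: samples random instances and keeps the closest one that the
  black box classifies with the desired outcome.
- get_stochastic_counterfactual: searches for a counterfactual by minimizing a
  prediction-plus-distance objective with scipy's least_squares from random restarts.
"""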
import copy
from sys import maxsize

import numpy as np
import pandas as pd
from scipy.optimize import least_squares

# util and distance_functions are expected to provide label_decode, mad_distance, mixed_distance,
# simple_match_distance and normalized_euclidean_distance used below
from util import *
from distance_functions import *
from neighbor_generator import generate_random_data

def apply_counterfactual(x, delta, continuous, discrete):
    """Apply a counterfactual rule `delta` to the instance `x` and return the modified copy."""
    xcf = copy.deepcopy(x)  # work on a copy so the original instance is left untouched
    for att, val in delta.items():
        new_val = None
        if att in continuous:
            # for continuous attributes, delta stores the offset to subtract from the current value
            new_val = xcf[att] - val
        if att in discrete:
            # for discrete attributes, delta stores the replacement value
            new_val = val
        xcf[att] = new_val
    return xcf
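
# Example with illustrative values: for x = {'age': 30, 'job': 'clerk'},
# delta = {'age': 5, 'job': 'manager'}, continuous = ['age'], discrete = ['job'],
# apply_counterfactual returns {'age': 25, 'job': 'manager'}.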

def get_random_counterfactual(dfx, blackbox, diff_outcome, X2E, class_name, columns, discrete, continuous,
                              features_type, label_encoder, mad=None, max_iter=1000, tot_max_iter=100000):
    """Random-sampling baseline: draw random instances, keep the one predicted as diff_outcome
    that is closest to dfx, and express it as a counterfactual delta with respect to dfx."""
    columns_no_class = list(columns)
    columns_no_class.remove(class_name)
    y1 = diff_outcome

    best_x1 = None
    min_dist = np.inf
    count = 0
    count_tot = 0

    # distance used to rank candidates: MAD-weighted if mad is given, mixed distance otherwise
    if mad:
        def distance_function(x0, x1):
            return mad_distance(x0, x1, mad)
    else:
        def distance_function(x0, x1):
            return mixed_distance(x0, x1, discrete, continuous, class_name,
                                  ddist=simple_match_distance,
                                  cdist=normalized_euclidean_distance)

    while True:
        count_tot += 1
        x1 = np.array(generate_random_data(X2E, class_name, columns, discrete, continuous, features_type,
                                           size=1, uniform=True)[-1])
        df_x1 = pd.DataFrame(data=x1.reshape(1, -1), columns=columns_no_class).to_dict('records')[0]
        fwx1 = blackbox.predict(x1.reshape(1, -1))[0]
        if fwx1 == y1:
            count += 1
            if mad:
                dist = distance_function(list(dfx.values()), list(df_x1.values()))
            else:
                dist = distance_function(dfx, df_x1)
            if dist < min_dist:
                min_dist = dist
                best_x1 = x1
        if count >= max_iter or count_tot >= tot_max_iter:
            break

    if best_x1 is None:  # no random sample obtained the desired outcome within tot_max_iter draws
        return list()

    medoid = pd.DataFrame(data=best_x1.reshape(1, -1), columns=columns_no_class)
    discrete_no_class = list(discrete)
    discrete_no_class.remove(class_name)
    medoid = label_decode(medoid, discrete_no_class, label_encoder)
    medoid = medoid.to_dict('records')[0]

    # turn the selected instance into a delta: replacement values for discrete attributes,
    # offsets (dfx[att] - val) for continuous ones
    counterfactuals = list()
    counterfactual = dict()
    for att, val in medoid.items():
        if att == class_name:
            continue
        if att in discrete:
            if dfx[att] != val:
                counterfactual[att] = val
        elif att in continuous:
            if dfx[att] - val != 0.0:
                counterfactual[att] = dfx[att] - val
    counterfactuals.append(counterfactual)
    return counterfactuals
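
# The two residual functions below share the same scheme: the optimization vector x1_lambda packs
# a candidate counterfactual x1 together with a trade-off weight lambda, and the value returned is
#     lambda * (f(x1) - y1)^2 + d(x, x1)
# i.e. a prediction-loss term that pushes the black-box output towards the desired outcome y1,
# plus a distance term (mixed or MAD-weighted) that keeps x1 close to the original instance,
# in the spirit of the counterfactual objective of Wachter et al.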

def fun_mixed(x1_lambda, y1, df_x, blackbox, discrete, continuous, class_name, columns_no_class):
    """Residual for the mixed-distance objective: lambda * (f(x1) - y1)^2 + mixed_distance(x, x1)."""
    x1, lambdav = x1_lambda[:-1], x1_lambda[-1]
    df_x1 = pd.DataFrame(data=x1.reshape(1, -1), columns=columns_no_class).to_dict('records')[0]
    d = mixed_distance(df_x, df_x1, discrete, continuous, class_name,
                       ddist=simple_match_distance, cdist=normalized_euclidean_distance)
    fwx1 = blackbox.predict(x1.reshape(1, -1))[0]
    return lambdav * (fwx1 - y1) ** 2 + d


def fun_mad(x1_lambda, y1, df_x, blackbox, columns_no_class, mad):
    """Residual for the MAD-weighted objective: lambda * (f(x1) - y1)^2 + mad_distance(x, x1)."""
    x1, lambdav = x1_lambda[:-1], x1_lambda[-1]
    df_x1 = pd.DataFrame(data=x1.reshape(1, -1), columns=columns_no_class).to_dict('records')[0]
    d = mad_distance(list(df_x.values()), list(df_x1.values()), mad)
    fwx1 = blackbox.predict(x1.reshape(1, -1))[0]
    return lambdav * (fwx1 - y1) ** 2 + d
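
# get_stochastic_counterfactual repeatedly draws a random starting point (plus a random lambda),
# runs bounded least_squares on one of the residuals above, and accepts the first solution that
# both flips the black-box prediction to diff_outcome and improves on the starting objective value.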

def get_stochastic_counterfactual(dfx, blackbox, X2E, diff_outcome, class_name, columns, discrete, continuous,
                                  features_type, label_encoder, mad=None, max_iter=1000):
    columns_no_class = list(columns)
    columns_no_class.remove(class_name)
    y1 = diff_outcome

    # box constraints: feature-wise min/max of X2E, plus [0, maxsize] for the lambda weight
    min_bounds = np.append(np.min(X2E, axis=0), 0)
    max_bounds = np.append(np.max(X2E, axis=0), maxsize)
    bounds = (min_bounds, max_bounds)

    niter = 0
    while True:
        x1 = np.array(generate_random_data(X2E, class_name, columns, discrete, continuous, features_type,
                                           size=1, uniform=True)[-1])
        x1_lambdav = np.append(x1, [np.random.randint(min_bounds[-1], max_bounds[-1])])
        fun = fun_mad if mad else fun_mixed
        d0 = fun(x1_lambdav, y1, dfx, blackbox, columns_no_class, mad) if mad else fun(
            x1_lambdav, y1, dfx, blackbox, discrete, continuous, class_name, columns_no_class)
        try:
            if mad:
                res = least_squares(fun, x1_lambdav, bounds=bounds,
                                    args=(y1, dfx, blackbox, columns_no_class, mad),
                                    xtol=1e-4, max_nfev=1000)
            else:
                res = least_squares(fun, x1_lambdav, bounds=bounds,
                                    args=(y1, dfx, blackbox, discrete, continuous, class_name, columns_no_class),
                                    xtol=1e-4, max_nfev=1000)
        except ValueError:
            niter += 1
            if niter >= max_iter:  # give up: no valid optimization run within the iteration budget
                return list()
            continue
        d = fun(res.x, y1, dfx, blackbox, columns_no_class, mad) if mad else fun(
            res.x, y1, dfx, blackbox, discrete, continuous, class_name, columns_no_class)
        niter += 1
        # accept the solution if it reaches the desired outcome and improves on the starting objective
        if blackbox.predict(res.x[:-1].reshape(1, -1))[0] == y1 and d < d0:
            break
        if niter >= max_iter:
            break

    medoid = pd.DataFrame(data=res.x[:-1].astype(int).reshape(1, -1), columns=columns_no_class)
    discrete_no_class = list(discrete)
    discrete_no_class.remove(class_name)
    medoid = label_decode(medoid, discrete_no_class, label_encoder)
    medoid = medoid.to_dict('records')[0]

    counterfactuals = list()
    counterfactual = dict()
    for att, val in medoid.items():
        if att == class_name:
            continue
        if att in discrete:
            if dfx[att] != val:
                counterfactual[att] = val
        elif att in continuous:
            if dfx[att] - val != 0.0:
                counterfactual[att] = dfx[att] - val
    counterfactuals.append(counterfactual)
    return counterfactuals
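
# Minimal usage sketch (assumptions, not part of this module: `blackbox` is any fitted classifier
# with a scikit-learn-style .predict; `x`, `X2E`, `class_name`, `columns`, `discrete`, `continuous`,
# `features_type`, `label_encoder` come from the dataset preparation utilities of this repository;
# the binary 0/1 target is only an example):
#
#   columns_no_class = [c for c in columns if c != class_name]
#   dfx = pd.DataFrame(data=x.reshape(1, -1), columns=columns_no_class).to_dict('records')[0]
#   diff_outcome = 1 - blackbox.predict(x.reshape(1, -1))[0]
#   cfs = get_stochastic_counterfactual(dfx, blackbox, X2E, diff_outcome, class_name, columns,
#                                       discrete, continuous, features_type, label_encoder, mad=None)
#   xcf = apply_counterfactual(dfx, cfs[0], continuous, discrete)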