
Commit 71310e7

Fixes
1 parent 50c1a6e commit 71310e7

21 files changed, +3344 -525 lines

Untitled0.ipynb

+8

@@ -0,0 +1,8 @@
+{
+ "metadata": {
+  "name": "Untitled0"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": []
+}

discrete_choice.ipynb

+844 -26
Large diffs are not rendered by default.

discrete_choice.py

+200 -6
@@ -5,9 +5,9 @@
 
 # Discrete Choice Models - Fair's Affair data
 
-# <rawcell>
+# <markdowncell>
 
-# A survey of women only was conducted in 1974 by Redbook asking about extramarital affairs.
+# A survey of women only was conducted in 1974 by *Redbook* asking about extramarital affairs.
 
 # <codecell>

@@ -68,7 +68,10 @@
 
 # <codecell>
 
-resp = dict(zip(range(1,9), respondent1000[["occupation", "educ", "occupation_husb", "rate_marriage", "age", "yrs_married", "children", "religious"]].tolist()))
+resp = dict(zip(range(1,9), respondent1000[["occupation", "educ",
+                                            "occupation_husb", "rate_marriage",
+                                            "age", "yrs_married", "children",
+                                            "religious"]].tolist()))
 resp.update({0 : 1})
 print resp

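Note (editor's aside, not part of the commit): margeff(atexog=resp) evaluates the marginal effects at the covariate values collected in resp rather than averaging them over the sample, which is why the point of evaluation matters. For a Logit model the marginal effect of a regressor $x_k$ at a point $x$ is the standard result

$$\frac{\partial \Pr(y = 1 \mid x)}{\partial x_k} = \Lambda(x'\beta)\bigl(1 - \Lambda(x'\beta)\bigr)\beta_k$$

where $\Lambda$ is the logistic CDF, so the effect shrinks as the predicted probability approaches 0 or 1.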
@@ -77,9 +80,21 @@
 mfx = affair_mod.margeff(atexog=resp)
 print pandas.Series(mfx, index=affair_mod.params.index[1:])
 
+# <codecell>
+
+affair_mod.predict(respondent1000)
+
+# <codecell>
+
+affair_mod.fittedvalues[1000]
+
+# <codecell>
+
+affair_mod.model.cdf(affair_mod.fittedvalues[1000])
+
 # <rawcell>
 
-# We do have a problem in that we have used an inefficient estimator, and our coefficients are biased. This can lead to large differences in the estimated marginal effects. The "correct" model here is likely the Tobit model. We have a work-in-progress branch "tobit-model" on GitHub, if anyone is interested in censored regression models.
+# The "correct" model here is likely the Tobit model. We have a work-in-progress branch "tobit-model" on GitHub, if anyone is interested in censored regression models.
 
 # <headingcell level=3>

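The three cells added above can be read as one identity: for this Logit model, fittedvalues holds the linear predictor and predict() applies the logistic CDF to it. A minimal sketch of that relationship (illustrative, not code from the commit; assumes the discrete-model convention that fittedvalues is the linear predictor X*beta):

import numpy as np

def logistic_cdf(xb):
    # Logit link: predicted probability is the logistic CDF of X*beta
    return 1.0 / (1.0 + np.exp(-xb))

# affair_mod.fittedvalues[1000] is x'beta for row 1000, so
# affair_mod.model.cdf(affair_mod.fittedvalues[1000]) should reproduce the
# matching entry of affair_mod.predict(respondent1000).
print(logistic_cdf(0.0))  # 0.5: a zero linear predictor means even odds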
@@ -105,9 +120,188 @@
 
 # <rawcell>
 
-# Compare the estimates of the Logit Fair model above to a Probit model.
+# Compare the estimates of the Logit Fair model above to a Probit model. Does the prediction table look better? Much difference in marginal effects?
 
 # <headingcell level=3>
 
-# GLM Example
+# Generalized Linear Model Example
+
+# <codecell>
+
+print sm.datasets.star98.SOURCE
+
+# <codecell>
+
+print sm.datasets.star98.DESCRLONG
+
+# <codecell>
+
+print sm.datasets.star98.NOTE
+
+# <codecell>
+
+dta = sm.datasets.star98.load_pandas().data
+print dta.columns
+
+# <codecell>
+
+print dta[['NABOVE', 'NBELOW', 'LOWINC', 'PERASIAN', 'PERBLACK', 'PERHISP', 'PERMINTE']].head(10)
+
+# <codecell>
+
+print dta[['AVYRSEXP', 'AVSALK', 'PERSPENK', 'PTRATIO', 'PCTAF', 'PCTCHRT', 'PCTYRRND']].head(10)
+
+# <codecell>
+
+formula = 'NABOVE + NBELOW ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT '
+formula += '+ PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF'
+
+# <headingcell level=4>
+
+# Aside: Binomial distribution
+
+# <rawcell>
+
+# Toss a six-sided die 5 times. What's the probability of exactly 2 fours?
+
+# <codecell>
+
+stats.binom(5, 1./6).pmf(2)
+
+# <codecell>
+
+from scipy.misc import comb
+comb(5,2) * (1/6.)**2 * (5/6.)**3
+
+# <codecell>
+
+from statsmodels.formula.api import glm
+glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit()
+
+# <codecell>
+
+print glm_mod.summary()
+
+# <rawcell>
+
+# The number of trials
+
+# <codecell>
+
+glm_mod.model._data._orig_endog.sum(1)
+
+# <codecell>
+
+glm_mod.fittedvalues * glm_mod.model._data._orig_endog.sum(1)
+
+# <rawcell>
+
+# First differences: We hold all explanatory variables constant at their means and manipulate the percentage of low income households to assess its impact
+# on the response variables:
+
+# <codecell>
+
+exog = glm_mod.model._data._orig_exog  # get the dataframe
+
+# <codecell>
+
+means25 = exog.mean()
+print means25
+
+# <codecell>
+
+means25['LOWINC'] = exog['LOWINC'].quantile(.25)
+print means25
+
+# <codecell>
+
+means75 = exog.mean()
+means75['LOWINC'] = exog['LOWINC'].quantile(.75)
+print means75
+
+# <codecell>
+
+resp25 = glm_mod.predict(means25)
+resp75 = glm_mod.predict(means75)
+diff = resp75 - resp25
+
+# <rawcell>
+
+# The interquartile first difference for the percentage of low income households in a school district is:
+
+# <codecell>
+
+print "%2.4f%%" % (diff[0]*100)
+
+# <codecell>
+
+nobs = glm_mod.nobs
+y = glm_mod.model.endog
+yhat = glm_mod.mu
+
+# <codecell>
+
+from statsmodels.graphics.api import abline_plot
+fig = plt.figure(figsize=(12,8))
+ax = fig.add_subplot(111, ylabel='Observed Values', xlabel='Fitted Values')
+ax.scatter(yhat, y)
+y_vs_yhat = sm.OLS(y, sm.add_constant(yhat, prepend=True)).fit()
+fig = abline_plot(model_results=y_vs_yhat, ax=ax)
+
+# <headingcell level=4>
+
+# Plot fitted values vs Pearson residuals
+
+# <markdowncell>
+
+# Pearson residuals are defined to be
+#
+# $$\frac{y - \mu}{\sqrt{\text{var}(\mu)}}$$
+#
+# where var is typically determined by the family. E.g., binomial variance is $np(1 - p)$.
+
+# <codecell>
+
+fig = plt.figure(figsize=(12,8))
+ax = fig.add_subplot(111, title='Residual Dependence Plot', xlabel='Fitted Values',
+                     ylabel='Pearson Residuals')
+ax.scatter(yhat, stats.zscore(glm_mod.resid_pearson))
+ax.axis('tight')
+ax.plot([0.0, 1.0],[0.0, 0.0], 'k-');
+
+# <headingcell level=4>
+
+# Histogram of standardized deviance residuals with Kernel Density Estimate overlaid
+
+# <markdowncell>
+
+# The definition of the deviance residuals depends on the family. For the Binomial distribution this is
+#
+# $$r_{dev} = \text{sign}(Y - \mu)\sqrt{2n\left(Y\log\frac{Y}{\mu} + (1 - Y)\log\frac{1 - Y}{1 - \mu}\right)}$$
+#
+# They can be used to detect ill-fitting covariates.
+
+# <codecell>
+
+resid = glm_mod.resid_deviance
+resid_std = stats.zscore(resid)
+kde_resid = sm.nonparametric.KDE(resid_std)
+kde_resid.fit()
+
+# <codecell>
+
+fig = plt.figure(figsize=(12,8))
+ax = fig.add_subplot(111, title="Standardized Deviance Residuals")
+ax.hist(resid_std, bins=25, normed=True);
+ax.plot(kde_resid.support, kde_resid.density, 'r');
+
+# <headingcell level=4>
+
+# QQ-plot of deviance residuals
+
+# <codecell>
+
+fig = plt.figure(figsize=(12,8))
+ax = fig.add_subplot(111)
+fig = sm.graphics.qqplot(resid, line='r', ax=ax)

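Editor's check of the binomial aside in the diff above, arithmetic only: both cells, stats.binom(5, 1./6).pmf(2) and the comb() expression, compute

$$P(X = 2) = \binom{5}{2}\left(\frac{1}{6}\right)^2\left(\frac{5}{6}\right)^3 = 10 \cdot \frac{1}{36} \cdot \frac{125}{216} = \frac{1250}{7776} \approx 0.1608$$

so they should print the same value.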
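A minimal sketch of the Pearson residuals plotted above, computed by hand for the binomial family. It assumes the proportion form in which var(mu) = mu(1 - mu)/n with n trials per observation; statsmodels may scale these differently, and the notebook z-scores them before plotting in any case (illustrative, not the commit's code):

import numpy as np

def pearson_resid(y, mu, n):
    # y and mu are observed and fitted proportions; n is trials per observation
    return (y - mu) / np.sqrt(mu * (1.0 - mu) / n)

# e.g. a district where 75% of 400 students score above the median,
# against a fitted probability of 70%:
print(pearson_resid(0.75, 0.70, 400))  # about 2.18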
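Likewise, a hand transcription of the deviance-residual formula from the markdown cell above, under the same proportion-form assumptions (y exactly 0 or 1 would need the usual 0 log 0 = 0 convention):

import numpy as np

def binom_deviance_resid(y, mu, n):
    # direct transcription of the r_dev formula for the Binomial family
    dev = 2.0 * n * (y * np.log(y / mu) +
                     (1.0 - y) * np.log((1.0 - y) / (1.0 - mu)))
    return np.sign(y - mu) * np.sqrt(dev)

print(binom_deviance_resid(0.75, 0.70, 400))  # about 2.22, close to the Pearson value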