|
5 | 5 |
|
6 | 6 | # Discrete Choice Models - Fair's Affair data
|
7 | 7 |
|
8 |
| -# <rawcell> |
| 8 | +# <markdowncell> |
9 | 9 |
|
10 |
| -# A survey of women only was conducted in 1974 by Redbook asking about extramarital affairs. |
| 10 | +# In 1974, *Redbook* conducted a survey of women only, asking about extramarital affairs. |
11 | 11 |
|
12 | 12 | # <codecell>
|
13 | 13 |
|
|
68 | 68 |
|
69 | 69 | # <codecell>
|
70 | 70 |
|
71 |
| -resp = dict(zip(range(1,9), respondent1000[["occupation", "educ", "occupation_husb", "rate_marriage", "age", "yrs_married", "children", "religious"]].tolist())) |
| 71 | +resp = dict(zip(range(1,9), respondent1000[["occupation", "educ", |
| 72 | + "occupation_husb", "rate_marriage", |
| 73 | + "age", "yrs_married", "children", |
| 74 | + "religious"]].tolist())) |
72 | 75 | resp.update({0 : 1})  # exog column 0 is the constant (intercept) term
|
73 | 76 | print resp
|
74 | 77 |
|
|
77 | 80 | mfx = affair_mod.margeff(atexog=resp)  # marginal effects evaluated at this respondent's values
|
78 | 81 | print pandas.Series(mfx, index=affair_mod.params.index[1:])
|
79 | 82 |
|
| 83 | +# <codecell> |
| 84 | + |
| 85 | +affair_mod.predict(respondent1000)  # predicted probability of an affair for this respondent |
| 86 | + |
| 87 | +# <codecell> |
| 88 | + |
| 89 | +affair_mod.fittedvalues[1000]  # linear predictor (X*beta) for observation 1000 |
| 90 | + |
| 91 | +# <codecell> |
| 92 | + |
| 93 | +affair_mod.model.cdf(affair_mod.fittedvalues[1000])  # logistic CDF maps the linear predictor to a probability |
| 94 | + |
80 | 95 | # <rawcell>
|
81 | 96 |
|
82 |
| -# We do have a problem in that we have used an inefficient estimator, and our coefficients are biased. This can lead to large differences in the estimated marginal effects. The "correct" model here is likely the Tobit model. We have an work in progress branch "tobit-model" on github, if anyone is interested in censored regression models. |
| 97 | +# The "correct" model here is likely the Tobit model. We have an work in progress branch "tobit-model" on github, if anyone is interested in censored regression models. |
83 | 98 |
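# <rawcell>

# Purely as an illustration of censored regression (a rough sketch, not the "tobit-model" branch itself), a left-censored-at-zero Tobit log-likelihood can be written with GenericLikelihoodModel. `y_affairs` (the uncensored-scale response) and `X` are hypothetical names here, not variables defined above.

# <codecell>

import numpy as np
from scipy import stats
from statsmodels.base.model import GenericLikelihoodModel

class Tobit(GenericLikelihoodModel):
    # params = (beta, log_sigma); sigma is log-parameterized so it stays positive
    def loglike(self, params):
        beta, sigma = params[:-1], np.exp(params[-1])
        xb = np.dot(self.exog, beta)
        censored = self.endog <= 0
        ll = np.where(censored,
                      stats.norm.logcdf(-xb / sigma),  # P(latent y* <= 0)
                      stats.norm.logpdf((self.endog - xb) / sigma) - np.log(sigma))
        return ll.sum()

# tobit_res = Tobit(y_affairs, X).fit(start_params=np.append(np.zeros(X.shape[1]), 0.))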
|
84 | 99 | # <headingcell level=3>
|
85 | 100 |
|
|
105 | 120 |
|
106 | 121 | # <rawcell>
|
107 | 122 |
|
108 |
| -# Compare the estimates of the Logit Fair model above to a Probit model. |
| 123 | +# Compare the estimates of the Logit Fair model above to a Probit model. Does the prediction table look better? Is there much difference in the marginal effects? One possible starting point is sketched below. |
109 | 124 |
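# <rawcell>

# A sketch of one way to start the exercise, reusing the arrays already attached to affair_mod rather than refitting from the formula (pred_table and margeff are assumed to be available on the results, as margeff is above):

# <codecell>

probit_mod = sm.Probit(affair_mod.model.endog, affair_mod.model.exog).fit()
print probit_mod.summary()

# <codecell>

print probit_mod.pred_table()
print probit_mod.margeff()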
|
110 | 125 | # <headingcell level=3>
|
111 | 126 |
|
112 |
| -# GLM Example |
| 127 | +# Generalized Linear Model Example |
| 128 | + |
| 129 | +# <codecell> |
| 130 | + |
| 131 | +print sm.datasets.star98.SOURCE |
| 132 | + |
| 133 | +# <codecell> |
| 134 | + |
| 135 | +print sm.datasets.star98.DESCRLONG |
| 136 | + |
| 137 | +# <codecell> |
| 138 | + |
| 139 | +print sm.datasets.star98.NOTE |
| 140 | + |
| 141 | +# <codecell> |
| 142 | + |
| 143 | +dta = sm.datasets.star98.load_pandas().data |
| 144 | +print dta.columns |
| 145 | + |
| 146 | +# <codecell> |
| 147 | + |
| 148 | +print dta[['NABOVE', 'NBELOW', 'LOWINC', 'PERASIAN', 'PERBLACK', 'PERHISP', 'PERMINTE']].head(10) |
| 149 | + |
| 150 | +# <codecell> |
| 151 | + |
| 152 | +print dta[['AVYRSEXP', 'AVSALK', 'PERSPENK', 'PTRATIO', 'PCTAF', 'PCTCHRT', 'PCTYRRND']].head(10) |
| 153 | + |
| 154 | +# <codecell> |
| 155 | + |
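# A two-term left-hand side, "NABOVE + NBELOW", passes (successes, failures) counts to the Binomial family used below.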
| 156 | +formula = 'NABOVE + NBELOW ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT ' |
| 157 | +formula += '+ PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF' |
| 158 | + |
| 159 | +# <headingcell level=4> |
| 160 | + |
| 161 | +# Aside: Binomial distribution |
| 162 | + |
| 163 | +# <rawcell> |
| 164 | + |
| 165 | +# If you toss a six-sided die 5 times, what's the probability of exactly 2 fours? |
| 166 | + |
| 167 | +# <codecell> |
| 168 | + |
| 169 | +stats.binom(5, 1./6).pmf(2) |
| 170 | + |
| 171 | +# <codecell> |
| 172 | + |
| 173 | +from scipy.misc import comb |
| 174 | +comb(5,2) * (1/6.)**2 * (5/6.)**3  # n-choose-k * p**k * (1 - p)**(n - k), the same pmf computed directly |
| 175 | + |
| 176 | +# <codecell> |
| 177 | + |
| 178 | +from statsmodels.formula.api import glm |
| 179 | +glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit() |
| 180 | + |
| 181 | +# <codecell> |
| 182 | + |
| 183 | +print glm_mod.summary() |
| 184 | + |
| 185 | +# <rawcell> |
| 186 | + |
| 187 | +# The number of trials for each observation (NABOVE + NBELOW): |
| 188 | + |
| 189 | +# <codecell> |
| 190 | + |
| 191 | +glm_mod.model._data._orig_endog.sum(1) |
| 192 | + |
| 193 | +# <codecell> |
| 194 | + |
| 195 | +glm_mod.fittedvalues * glm_mod.model._data._orig_endog.sum(1)  # fitted proportion times trials gives the expected counts |
| 196 | + |
| 197 | +# <rawcell> |
| 198 | + |
| 199 | +# First differences: we hold all explanatory variables constant at their means and vary the percentage of low-income households to assess its impact |
| 200 | +# on the response: |
| 201 | + |
| 202 | +# <codecell> |
| 203 | + |
| 204 | +exog = glm_mod.model._data._orig_exog # get the dataframe |
| 205 | + |
| 206 | +# <codecell> |
| 207 | + |
| 208 | +means25 = exog.mean() |
| 209 | +print means25 |
| 210 | + |
| 211 | +# <codecell> |
| 212 | + |
| 213 | +means25['LOWINC'] = exog['LOWINC'].quantile(.25) |
| 214 | +print means25 |
| 215 | + |
| 216 | +# <codecell> |
| 217 | + |
| 218 | +means75 = exog.mean() |
| 219 | +means75['LOWINC'] = exog['LOWINC'].quantile(.75) |
| 220 | +print means75 |
| 221 | + |
| 222 | +# <codecell> |
| 223 | + |
| 224 | +resp25 = glm_mod.predict(means25) |
| 225 | +resp75 = glm_mod.predict(means75) |
| 226 | +diff = resp75 - resp25 |
| 227 | + |
| 228 | +# <rawcell> |
| 229 | + |
| 230 | +# The interquartile first difference for the percentage of low-income households in a school district is: |
| 231 | + |
| 232 | +# <codecell> |
| 233 | + |
| 234 | +print "%2.4f%%" % (diff[0]*100) |
| 235 | + |
| 236 | +# <codecell> |
| 237 | + |
| 238 | +nobs = glm_mod.nobs |
| 239 | +y = glm_mod.model.endog     # observed proportion of students scoring above the national median |
| 240 | +yhat = glm_mod.mu           # fitted mean, i.e. the predicted proportion |
| 241 | + |
| 242 | +# <codecell> |
| 243 | + |
| 244 | +from statsmodels.graphics.api import abline_plot |
| 245 | +fig = plt.figure(figsize=(12,8)) |
| 246 | +ax = fig.add_subplot(111, ylabel='Observed Values', xlabel='Fitted Values') |
| 247 | +ax.scatter(yhat, y) |
| 248 | +y_vs_yhat = sm.OLS(y, sm.add_constant(yhat, prepend=True)).fit() |
| 249 | +fig = abline_plot(model_results=y_vs_yhat, ax=ax) |
| 250 | + |
| 251 | +# <headingcell level=4> |
| 252 | + |
| 253 | +# Plot fitted values vs Pearson residuals |
| 254 | + |
| 255 | +# <markdowncell> |
| 256 | + |
| 257 | +# Pearson residuals are defined to be |
| 258 | +# |
| 259 | +# $$\frac{y - \mu}{\sqrt{\operatorname{var}(\mu)}}$$ |
| 260 | +# |
| 261 | +# where the variance function $\operatorname{var}(\mu)$ is determined by the family; e.g., the binomial variance is $np(1 - p)$. |
| 262 | + |
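# <rawcell>

# As a rough check, the formula above can be applied by hand for the grouped binomial case, reusing y, yhat, and the trial counts from earlier cells (statsmodels' resid_pearson may differ slightly in how the family weighting is applied):

# <codecell>

import numpy as np
trials = np.asarray(glm_mod.model._data._orig_endog.sum(1))  # number of trials per district, as above
# Pearson residual for grouped binomial data: (y - p) / sqrt(p*(1 - p)/n)
pearson_by_hand = (y - yhat) * np.sqrt(trials) / np.sqrt(yhat * (1 - yhat))
print pearson_by_hand[:5]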
| 263 | +# <codecell> |
| 264 | + |
| 265 | +fig = plt.figure(figsize=(12,8)) |
| 266 | +ax = fig.add_subplot(111, title='Residual Dependence Plot', xlabel='Fitted Values', |
| 267 | + ylabel='Pearson Residuals') |
| 268 | +ax.scatter(yhat, stats.zscore(glm_mod.resid_pearson)) |
| 269 | +ax.axis('tight') |
| 270 | +ax.plot([0.0, 1.0],[0.0, 0.0], 'k-'); |
| 271 | + |
| 272 | +# <headingcell level=4> |
| 273 | + |
| 274 | +# Histogram of standardized deviance residuals with a kernel density estimate overlaid |
| 275 | + |
| 276 | +# <markdowncell> |
| 277 | + |
| 278 | +# The definition of the deviance residuals depends on the family. For the Binomial distribution this is |
| 279 | +# |
| 280 | +# $$r_{dev} = \operatorname{sign}(Y-\mu)\,\sqrt{2n\left(Y\log\frac{Y}{\mu}+(1-Y)\log\frac{1-Y}{1-\mu}\right)}$$ |
| 281 | +# |
| 282 | +# They can be used to detect ill-fitting covariates. |
| 283 | + |
| 284 | +# <codecell> |
| 285 | + |
| 286 | +resid = glm_mod.resid_deviance |
| 287 | +resid_std = stats.zscore(resid) |
| 288 | +kde_resid = sm.nonparametric.KDE(resid_std) |
| 289 | +kde_resid.fit() |
| 290 | + |
| 291 | +# <codecell> |
| 292 | + |
| 293 | +fig = plt.figure(figsize=(12,8)) |
| 294 | +ax = fig.add_subplot(111, title="Standardized Deviance Residuals") |
| 295 | +ax.hist(resid_std, bins=25, normed=True); |
| 296 | +ax.plot(kde_resid.support, kde_resid.density, 'r'); |
| 297 | + |
| 298 | +# <headingcell level=4> |
| 299 | + |
| 300 | +# QQ-plot of deviance residuals |
| 301 | + |
| 302 | +# <codecell> |
| 303 | + |
| 304 | +fig = plt.figure(figsize=(12,8)) |
| 305 | +ax = fig.add_subplot(111) |
| 306 | +fig = sm.graphics.qqplot(resid, line='r', ax=ax) |
113 | 307 |
|