
Commit 35c8f52

Authored and committed by Soledad Galli

renumbered chapters, expanded reqs.txt, improved ch4 var transf

Parent: 5830965

File tree: 77 files changed (+505 -129 lines)


ch4-tranforming-numerical-vars/Recipe-1-logarithmic-transformation.ipynb renamed to ch04-tranforming-numerical-vars/Recipe-1-logarithmic-transformation.ipynb (+34 -12)
@@ -180,6 +180,7 @@
 "data = pd.DataFrame(boston_dataset.data,\n",
 " columns=boston_dataset.feature_names)\n",
 "\n",
+"# display top 5 rows (not in book recipe)\n",
 "data.head()"
 ]
 },
@@ -202,7 +203,7 @@
 }
 ],
 "source": [
-"# plot the histogram to inspect variable distributions\n",
+"# plot the histogram to inspect variable distributions (not in book)\n",
 "\n",
 "data.hist(bins=30, figsize=(12,12))\n",
 "plt.show()"
@@ -268,6 +269,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# make a copy of the dataframe where we will store the modified\n",
+"# variables\n",
+"\n",
 "data_tf = data.copy()"
 ]
 },
@@ -277,6 +281,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# apply log to a set of variables\n",
 "data_tf[['LSTAT', 'NOX', 'DIS', 'RM']] = np.log(\n",
 " data[['LSTAT', 'NOX', 'DIS', 'RM']])"
 ]
@@ -300,6 +305,8 @@
 }
 ],
 "source": [
+"# use diagnostic plot function to address variable transformation\n",
+"\n",
 "diagnostic_plots(data_tf, 'LSTAT')"
 ]
 },
@@ -316,6 +323,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# start the transformer with NumPy log as argument\n",
 "transformer = FunctionTransformer(np.log, validate=True)"
 ]
 },
@@ -325,8 +333,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# make a list of columns to transform\n",
 "cols = ['LSTAT', 'NOX', 'DIS', 'RM']\n",
 "\n",
+"# transform subset of dataframe\n",
 "data_t = transformer.transform(data[cols])"
 ]
 },
@@ -336,6 +346,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# capture returned NumPy array in pandas dataframe\n",
 "data_tf = pd.DataFrame(data_t, columns=cols)"
 ]
 },
@@ -358,14 +369,16 @@
 }
 ],
 "source": [
+"# use diagnostic plot function to evaluate transformation\n",
+"# (not in book)\n",
 "diagnostic_plots(data_tf, 'LSTAT')"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Logarithmic transformation with Feature Engine"
+"## Logarithmic transformation with Feature-engine"
 ]
 },
 {
@@ -385,8 +398,10 @@
 }
 ],
 "source": [
+"# initialize the transformer\n",
 "lt = LogTransformer(variables=['LSTAT', 'NOX', 'DIS', 'RM'])\n",
 "\n",
+"# fit transformer to data set\n",
 "lt.fit(data)"
 ]
 },
@@ -396,6 +411,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# transform variables within our data set\n",
 "data_tf = lt.transform(data)"
 ]
 },
@@ -418,14 +434,19 @@
 }
 ],
 "source": [
+"# use diagnostic plot function to evaluate transformation\n",
+"# (not in book)\n",
+"\n",
 "diagnostic_plots(data_tf, 'LSTAT')"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Logarithm with Scikit-learn selecting a group of variables"
+"## Logarithm with Scikit-learn selecting a group of variables\n",
+"\n",
+"### Not in book"
 ]
 },
 {
@@ -450,11 +471,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# load data set\n",
 "boston_dataset = load_boston()\n",
 "\n",
 "# create a dataframe with the independent variables\n",
 "data = pd.DataFrame(boston_dataset.data,\n",
-" columns=boston_dataset.feature_names)"
+" columns=boston_dataset.feature_names)"
 ]
 },
 {
@@ -463,10 +485,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# list of variables to transform\n",
 "cols = ['LSTAT', 'NOX', 'DIS', 'RM']\n",
 "\n",
-"log_transformer = Pipeline(steps=[('log_transformer', FunctionTransformer(np.log, validate=True))])\n",
+"# set transformer within a pipeline\n",
+"log_transformer = Pipeline(steps=[('log_transformer',\n",
+" FunctionTransformer(np.log, validate=True))])\n",
 "\n",
+"# set pipeline within ColumnTransformer to select features\n",
 "preprocessor = ColumnTransformer(transformers=[\n",
 " ('log_transformer', log_transformer, cols)], remainder='passthrough')"
 ]
@@ -477,8 +503,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# fit to the data (the entire dataframe this time)\n",
 "preprocessor.fit(data)\n",
 "\n",
+"# transform the data\n",
 "data_tf = preprocessor.transform(data)"
 ]
 },
@@ -1641,15 +1669,9 @@
 }
 ],
 "source": [
+"# visualize the data as a pandas dataframe\n",
 "pd.DataFrame(data_tf)"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
 }
 ],
 "metadata": {

ch4-tranforming-numerical-vars/Recipe-2-reciprocal-transformation.ipynb renamed to ch04-tranforming-numerical-vars/Recipe-2-reciprocal-transformation.ipynb (+27 -10)
@@ -180,6 +180,7 @@
 "data = pd.DataFrame(boston_dataset.data,\n",
 " columns=boston_dataset.feature_names)\n",
 "\n",
+"# display top 5 rows (not in book recipe)\n",
 "data.head()"
 ]
 },
@@ -202,7 +203,7 @@
 }
 ],
 "source": [
-"# plot the histogram to inspect variable distributions\n",
+"# plot the histogram to inspect variable distributions (not in book)\n",
 "\n",
 "data.hist(bins=30, figsize=(12,12))\n",
 "plt.show()"
@@ -250,7 +251,7 @@
 }
 ],
 "source": [
-"# check original distribution\n",
+"# check original distribution of the variable DIS\n",
 "\n",
 "diagnostic_plots(data, 'DIS')"
 ]
@@ -268,6 +269,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# make a copy of the dataframe where we will store the modified\n",
+"# variables\n",
+"\n",
 "data_tf = data.copy()"
 ]
 },
@@ -277,6 +281,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# apply the reciprocal to a set of variables\n",
+"\n",
 "data_tf[['LSTAT', 'NOX', 'DIS', 'RM']] = np.reciprocal(data[['LSTAT', 'NOX', 'DIS', 'RM']])"
 ]
 },
@@ -299,6 +305,8 @@
 }
 ],
 "source": [
+"# use diagnostic plot function to address variable transformation\n",
+"\n",
 "diagnostic_plots(data_tf, 'DIS')"
 ]
 },
@@ -315,6 +323,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# start the transformer with NumPy reciprocal as argument\n",
+"\n",
 "transformer = FunctionTransformer(np.reciprocal, validate=True)"
 ]
 },
@@ -324,8 +334,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# make a list of columns to transform\n",
 "cols = ['LSTAT', 'NOX', 'DIS', 'RM']\n",
 "\n",
+"# transform subset of dataframe\n",
 "data_t = transformer.transform(data[cols])"
 ]
 },
@@ -335,6 +347,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# capture returned NumPy array in pandas dataframe\n",
+"\n",
 "data_tf = pd.DataFrame(data_t, columns=cols)"
 ]
 },
@@ -357,14 +371,17 @@
 }
 ],
 "source": [
+"# use diagnostic plot function to evaluate transformation\n",
+"# (not in book)\n",
+"\n",
 "diagnostic_plots(data_tf, 'DIS')"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Reciprocal transformation with Feature Engine"
+"## Reciprocal transformation with Feature-engine"
 ]
 },
 {
@@ -384,7 +401,10 @@
 }
 ],
 "source": [
+"# initialize the transformer\n",
 "rt = ReciprocalTransformer(variables = ['LSTAT', 'NOX', 'DIS', 'RM'])\n",
+"\n",
+"# fit transformer to the entire dataframe\n",
 "rt.fit(data)"
 ]
 },
@@ -394,6 +414,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# transform the indicated variables within our data set\n",
 "data_tf = rt.transform(data)"
 ]
 },
@@ -416,15 +437,11 @@
 }
 ],
 "source": [
+"# use diagnostic plot function to evaluate transformation\n",
+"# (not in book)\n",
+"\n",
 "diagnostic_plots(data_tf, 'DIS')"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
 }
 ],
 "metadata": {
