diff --git a/01_materials/labs/01_setup.ipynb b/01_materials/labs/01_setup.ipynb index 92feacca8..d1fea6a68 100644 --- a/01_materials/labs/01_setup.ipynb +++ b/01_materials/labs/01_setup.ipynb @@ -247,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -257,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -273,9 +273,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-01-14 21:07:16,806, 492669213.py, 3, INFO, Hello world!\n" + ] + } + ], "source": [ "from utils.logger import get_logger\n", "_logs = get_logger(__name__)\n", @@ -307,7 +315,7 @@ ], "metadata": { "kernelspec": { - "display_name": "production-env (3.11.13)", + "display_name": "production-env (3.11.14)", "language": "python", "name": "python3" }, @@ -321,7 +329,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.13" + "version": "3.11.14" } }, "nbformat": 4, diff --git a/01_materials/labs/04_transforms.ipynb b/01_materials/labs/04_transforms.ipynb index 614061fa2..cb45613ee 100644 --- a/01_materials/labs/04_transforms.ipynb +++ b/01_materials/labs/04_transforms.ipynb @@ -190,9 +190,1330 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocess',\n",
+ " ColumnTransformer(remainder='passthrough',\n",
+ " transformers=[('numeric_simple',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='median')),\n",
+ " ('standardizer',\n",
+ " StandardScaler())]),\n",
+ " ['revolving_unsecured_line_utilization',\n",
+ " 'age', 'num_30_59_days_late',\n",
+ " 'debt_ratio',\n",
+ " 'monthly_income',\n",
+ " 'num_open_credit_loans',\n",
+ " 'num_90_days_late',\n",
+ " 'num_real_estate_loans',\n",
+ " 'num_60_89_days_late',\n",
+ " 'num_dependents'])])),\n",
+ " ('model', LogisticRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. ['revolving_unsecured_line_utilization', 'age', 'num_30_59_days_late', 'debt_ratio', 'monthly_income', 'num_open_credit_loans', 'num_90_days_late', 'num_real_estate_loans', 'num_60_89_days_late', 'num_dependents']
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
passthrough
| \n", + " | coord_x | \n", + "coord_y | \n", + "month | \n", + "day | \n", + "ffmc | \n", + "dmc | \n", + "dc | \n", + "isi | \n", + "temp | \n", + "rh | \n", + "wind | \n", + "rain | \n", + "area | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "7 | \n", + "5 | \n", + "mar | \n", + "fri | \n", + "86.2 | \n", + "26.2 | \n", + "94.3 | \n", + "5.1 | \n", + "8.2 | \n", + "51 | \n", + "6.7 | \n", + "0.0 | \n", + "0.0 | \n", + "
| 1 | \n", + "7 | \n", + "4 | \n", + "oct | \n", + "tue | \n", + "90.6 | \n", + "35.4 | \n", + "669.1 | \n", + "6.7 | \n", + "18.0 | \n", + "33 | \n", + "0.9 | \n", + "0.0 | \n", + "0.0 | \n", + "
| 2 | \n", + "7 | \n", + "4 | \n", + "oct | \n", + "sat | \n", + "90.6 | \n", + "43.7 | \n", + "686.9 | \n", + "6.7 | \n", + "14.6 | \n", + "33 | \n", + "1.3 | \n", + "0.0 | \n", + "0.0 | \n", + "
| 3 | \n", + "8 | \n", + "6 | \n", + "mar | \n", + "fri | \n", + "91.7 | \n", + "33.3 | \n", + "77.5 | \n", + "9.0 | \n", + "8.3 | \n", + "97 | \n", + "4.0 | \n", + "0.2 | \n", + "0.0 | \n", + "
| 4 | \n", + "8 | \n", + "6 | \n", + "mar | \n", + "sun | \n", + "89.3 | \n", + "51.3 | \n", + "102.2 | \n", + "9.6 | \n", + "11.4 | \n", + "99 | \n", + "1.8 | \n", + "0.0 | \n", + "0.0 | \n", + "
| \n", + " | coord_x | \n", + "coord_y | \n", + "month | \n", + "day | \n", + "ffmc | \n", + "dmc | \n", + "dc | \n", + "isi | \n", + "temp | \n", + "rh | \n", + "wind | \n", + "rain | \n", + "area | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | \n", + "517.000000 | \n", + "517.000000 | \n", + "517 | \n", + "517 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "517.000000 | \n", + "
| unique | \n", + "NaN | \n", + "NaN | \n", + "12 | \n", + "7 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| top | \n", + "NaN | \n", + "NaN | \n", + "aug | \n", + "sun | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| freq | \n", + "NaN | \n", + "NaN | \n", + "184 | \n", + "95 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| mean | \n", + "4.669246 | \n", + "4.299807 | \n", + "NaN | \n", + "NaN | \n", + "90.644681 | \n", + "110.872340 | \n", + "547.940039 | \n", + "9.021663 | \n", + "18.889168 | \n", + "44.288201 | \n", + "4.017602 | \n", + "0.021663 | \n", + "12.847292 | \n", + "
| std | \n", + "2.313778 | \n", + "1.229900 | \n", + "NaN | \n", + "NaN | \n", + "5.520111 | \n", + "64.046482 | \n", + "248.066192 | \n", + "4.559477 | \n", + "5.806625 | \n", + "16.317469 | \n", + "1.791653 | \n", + "0.295959 | \n", + "63.655818 | \n", + "
| min | \n", + "1.000000 | \n", + "2.000000 | \n", + "NaN | \n", + "NaN | \n", + "18.700000 | \n", + "1.100000 | \n", + "7.900000 | \n", + "0.000000 | \n", + "2.200000 | \n", + "15.000000 | \n", + "0.400000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| 25% | \n", + "3.000000 | \n", + "4.000000 | \n", + "NaN | \n", + "NaN | \n", + "90.200000 | \n", + "68.600000 | \n", + "437.700000 | \n", + "6.500000 | \n", + "15.500000 | \n", + "33.000000 | \n", + "2.700000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| 50% | \n", + "4.000000 | \n", + "4.000000 | \n", + "NaN | \n", + "NaN | \n", + "91.600000 | \n", + "108.300000 | \n", + "664.200000 | \n", + "8.400000 | \n", + "19.300000 | \n", + "42.000000 | \n", + "4.000000 | \n", + "0.000000 | \n", + "0.520000 | \n", + "
| 75% | \n", + "7.000000 | \n", + "5.000000 | \n", + "NaN | \n", + "NaN | \n", + "92.900000 | \n", + "142.400000 | \n", + "713.900000 | \n", + "10.800000 | \n", + "22.800000 | \n", + "53.000000 | \n", + "4.900000 | \n", + "0.000000 | \n", + "6.570000 | \n", + "
| max | \n", + "9.000000 | \n", + "9.000000 | \n", + "NaN | \n", + "NaN | \n", + "96.200000 | \n", + "291.300000 | \n", + "860.600000 | \n", + "56.100000 | \n", + "33.300000 | \n", + "100.000000 | \n", + "9.400000 | \n", + "6.400000 | \n", + "1090.840000 | \n", + "
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
OneHotEncoder(drop='if_binary')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
OneHotEncoder(drop='if_binary')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('preprocess',\n",
+ " ColumnTransformer(remainder='passthrough',\n",
+ " transformers=[('numeric_simple',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='median')),\n",
+ " ('standardizer',\n",
+ " StandardScaler())]),\n",
+ " Index(['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc', 'isi', 'temp', 'rh', 'wind',\n",
+ " 'rain'],\n",
+ " dtype='object'))])),\n",
+ " ('model', Lasso())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Index(['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc', 'isi', 'temp', 'rh', 'wind',\n", + " 'rain'],\n", + " dtype='object')
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
passthrough
Pipeline(steps=[('preprocess',\n",
+ " ColumnTransformer(remainder='passthrough',\n",
+ " transformers=[('numeric_std',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='median')),\n",
+ " ('standardizer',\n",
+ " StandardScaler())]),\n",
+ " Index(['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc', 'isi', 'temp', 'rh', 'wind',\n",
+ " 'rain'],\n",
+ " dtype='object')),\n",
+ " ('numeric_yj',\n",
+ " Pipeline(steps=[('imputer',\n",
+ " SimpleImputer(strategy='median')),\n",
+ " ('standardizer',\n",
+ " StandardScaler()),\n",
+ " ('transform',\n",
+ " PowerTransformer())]),\n",
+ " ['temp', 'wind', 'ffmc'])])),\n",
+ " ('clf', Lasso())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Index(['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc', 'isi', 'temp', 'rh', 'wind',\n", + " 'rain'],\n", + " dtype='object')
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
['temp', 'wind', 'ffmc']
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_mean\n",
+ " with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " with_std\n",
+ " with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " method\n",
+ " method: {'yeo-johnson', 'box-cox'}, default='yeo-johnson' The power transform method. Available methods are: - 'yeo-johnson' [1]_, works with positive and negative values - 'box-cox' [2]_, only works with strictly positive values\n", + " \n", + " | \n",
+ " 'yeo-johnson' | \n", + "
| \n", + " | \n",
+ " \n",
+ " standardize\n",
+ " standardize: bool, default=True Set to True to apply zero-mean, unit-variance normalization to the transformed output.\n", + " \n", + " | \n",
+ " True | \n", + "
| \n", + " | \n",
+ " \n",
+ " copy\n",
+ " copy: bool, default=True Set to False to perform inplace computation during transformation.\n", + " \n", + " | \n",
+ " True | \n", + "
passthrough