diff --git a/02_activities/assignments/Pipeline_a b/02_activities/assignments/Pipeline_a new file mode 100644 index 000000000..e29676842 Binary files /dev/null and b/02_activities/assignments/Pipeline_a differ diff --git a/02_activities/assignments/assignment_2.ipynb b/02_activities/assignments/assignment_2.ipynb index 29d661c57..008c0181a 100644 --- a/02_activities/assignments/assignment_2.ipynb +++ b/02_activities/assignments/assignment_2.ipynb @@ -97,18 +97,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Load the libraries as required." + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, FunctionTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import GridSearchCV, cross_val_score\n", + "import pickle\n", + "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 517 entries, 0 to 516\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 coord_x 517 non-null int64 \n", + " 1 coord_y 517 non-null int64 \n", + " 2 month 517 non-null object \n", + " 3 day 517 non-null object \n", + " 4 ffmc 517 non-null float64\n", + " 5 dmc 517 non-null float64\n", + " 6 dc 517 non-null float64\n", + " 7 isi 517 non-null float64\n", + " 8 temp 517 non-null float64\n", + " 9 rh 517 non-null int64 \n", + " 10 wind 517 non-null float64\n", + " 11 rain 517 non-null float64\n", + " 12 area 517 non-null float64\n", + "dtypes: float64(8), int64(3), 
object(2)\n", + "memory usage: 52.6+ KB\n" + ] + } + ], "source": [ "# Load data\n", "columns = [\n", @@ -129,17 +165,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " coord_x coord_y month day ffmc dmc dc isi temp rh wind rain\n", + "0 7 5 mar fri 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0\n", + "1 7 4 oct tue 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0\n", + "2 7 4 oct sat 90.6 43.7 686.9 6.7 14.6 33 1.3 0.0\n", + "3 8 6 mar fri 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2\n", + "4 8 6 mar sun 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0\n" + ] + } + ], + "source": [ + "X = fires_dt.drop(columns = ['area'])\n", + "print(X.head())" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 0.0\n", + "1 0.0\n", + "2 0.0\n", + "3 0.0\n", + "4 0.0\n", + "Name: area, dtype: float64\n", + "X_train shape: (413, 12) X_test shape: (104, 12) y_train shape: (413,) y_test shape: (104,)\n" + ] + } + ], + "source": [ + "y = fires_dt['area']\n", + "print(y.head())\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "print(\"X_train shape:\", X_train.shape, \"X_test shape:\", X_test.shape, \"y_train shape:\", y_train.shape, \"y_test shape:\", y_test.shape)" + ] }, { "cell_type": "markdown", @@ -180,10 +251,801 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
ColumnTransformer(transformers=[('num', RobustScaler(),\n",
+       "                                 ['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc',\n",
+       "                                  'isi', 'temp', 'rh', 'wind', 'rain']),\n",
+       "                                ('cat', OneHotEncoder(handle_unknown='ignore'),\n",
+       "                                 ['month', 'day'])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "ColumnTransformer(transformers=[('num', RobustScaler(),\n", + " ['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc',\n", + " 'isi', 'temp', 'rh', 'wind', 'rain']),\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'),\n", + " ['month', 'day'])])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "categorical_columns = ['month', 'day']\n", + "numerical_columns = ['coord_x', 'coord_y', 'ffmc', 'dmc', 'dc', 'isi', 'temp', 'rh', 'wind', 'rain']\n", + "preproc1 = ColumnTransformer(\n", + " transformers=[\n", + " ('num', RobustScaler(), numerical_columns),\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)\n", + " ]\n", + ")\n", + "preproc1" + ] }, { "cell_type": "markdown", @@ -199,10 +1061,966 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
ColumnTransformer(transformers=[('num_nonlinear',\n",
+       "                                 Pipeline(steps=[('log',\n",
+       "                                                  FunctionTransformer(func=<ufunc 'log1p'>,\n",
+       "                                                                      validate=True)),\n",
+       "                                                 ('scale', RobustScaler())]),\n",
+       "                                 ['isi', 'rain', 'ffmc']),\n",
+       "                                ('num_standard', RobustScaler(),\n",
+       "                                 ['coord_x', 'coord_y', 'dmc', 'dc', 'temp',\n",
+       "                                  'rh', 'wind']),\n",
+       "                                ('cat', OneHotEncoder(handle_unknown='ignore'),\n",
+       "                                 ['month', 'day'])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "ColumnTransformer(transformers=[('num_nonlinear',\n", + " Pipeline(steps=[('log',\n", + " FunctionTransformer(func=,\n", + " validate=True)),\n", + " ('scale', RobustScaler())]),\n", + " ['isi', 'rain', 'ffmc']),\n", + " ('num_standard', RobustScaler(),\n", + " ['coord_x', 'coord_y', 'dmc', 'dc', 'temp',\n", + " 'rh', 'wind']),\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'),\n", + " ['month', 'day'])])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "log_transformer = FunctionTransformer(np.log1p, validate=True)\n", + "preproc2 = ColumnTransformer(\n", + " transformers=[\n", + " ('num_nonlinear', Pipeline([\n", + " ('log', log_transformer),\n", + " ('scale', RobustScaler())\n", + " ]), ['isi', 'rain', 'ffmc']),\n", + " ('num_standard', RobustScaler(), ['coord_x', 'coord_y', 'dmc', 'dc', 'temp', 'rh', 'wind']),\n", + " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)\n", + " ]\n", + ")\n", + "preproc2" + ] }, { "cell_type": "markdown", @@ -227,40 +2045,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# Pipeline A = preproc1 + baseline\n" + "baseline_model = Ridge(random_state=135)\n", + "advanced_model = RandomForestRegressor(random_state=12)\n", + "pipe_a = Pipeline([\n", + " ('preprocessing', preproc1),\n", + " ('regressor', baseline_model)\n", + "])\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "# Pipeline B = preproc2 + baseline\n" + "pipe_b = Pipeline([\n", + " ('preprocessing', preproc2),\n", + " ('regressor', baseline_model)\n", + "])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Pipeline C = preproc1 + advanced model\n" + "pipe_c = Pipeline([\n", + " ('preprocessing', preproc1),\n", + " ('regressor', 
advanced_model)\n", + "])\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "# Pipeline D = preproc2 + advanced model\n", - "\n", - " " + "pipe_d = Pipeline([\n", + " ('preprocessing', preproc2),\n", + " ('regressor', advanced_model)\n", + "])" ] }, { @@ -276,46 +2106,100 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "param_grid_ridge = {\n", + " 'regressor__alpha': [0.01, 0.1, 1.0, 10.0]\n", + "}\n", + "rf_params = {\n", + " 'regressor__max_depth': [None, 5, 10, 20]\n", + "}" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PipeA Best Param--> {'regressor__alpha': 10.0} Best MSE 4275.346523695291\n" + ] + } + ], + "source": [ + "grid_a = GridSearchCV(pipe_a, param_grid_ridge, cv=5, scoring='neg_mean_squared_error')\n", + "grid_a.fit(X_train, y_train) # training split only; X_test stays held out\n", + "print(\"PipeA Best Param-->\",grid_a.best_params_,\"Best MSE\",-grid_a.best_score_)" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PipeB Best Param--> {'regressor__alpha': 10.0} Best MSE 4281.378807307956\n" + ] + } + ], + "source": [ + "grid_b = GridSearchCV(pipe_b, param_grid_ridge, cv=5, scoring='neg_mean_squared_error')\n", + "grid_b.fit(X_train, y_train) # training split only; X_test stays held out\n", + "print(\"PipeB Best Param-->\",grid_b.best_params_,\"Best MSE\",-grid_b.best_score_)" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PipeC Best Param--> {'regressor__max_depth': 5} Best MSE 5205.356028243962\n" + ] + } +
], + "source": [ + "grid_c = GridSearchCV(pipe_c, rf_params, cv=5, scoring='neg_mean_squared_error')\n", + "grid_c.fit(X_train, y_train) # training split only; X_test stays held out\n", + "print(\"PipeC Best Param-->\",grid_c.best_params_,\"Best MSE\",-grid_c.best_score_)" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PipeD Best Param--> {'regressor__max_depth': 5} Best MSE 5211.078267470271\n" + ] + } + ], + "source": [ + "grid_d = GridSearchCV(pipe_d, rf_params, cv=5, scoring='neg_mean_squared_error')\n", + "grid_d.fit(X_train, y_train) # training split only; X_test stays held out\n", + "print(\"PipeD Best Param-->\",grid_d.best_params_,\"Best MSE\",-grid_d.best_score_)" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Evaluate\n", - "\n", - "+ Which model has the best performance?" + "+ Which model has the best performance?\n", + "Model A is the best" ] }, { @@ -329,17 +2213,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "grids = [grid_a, grid_b, grid_c, grid_d]\n", + "grid_names = ['PipelineA', 'PipelineB', 'PipelineC', 'PipelineD']\n", + "best_idx = np.argmax([g.best_score_ for g in grids])\n", + "best_pipeline = grids[best_idx].best_estimator_" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pipeline_a\n" + ] + } + ], + "source": [ + "filename = 'Pipeline_a'\n", + "with open(filename, 'wb') as file:\n", + " pickle.dump(best_pipeline, file)\n", + "print(filename)" + ] }, { "cell_type": "markdown", @@ -358,10 +2260,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'shap'", +
"output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[19]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mshap\u001b[39;00m\n\u001b[32m 2\u001b[39m data_transform = best_pipeline.named_steps[\u001b[33m'\u001b[39m\u001b[33mpreprocessing\u001b[39m\u001b[33m'\u001b[39m].transform(X_test)\n\u001b[32m 3\u001b[39m explainer = shap.explainers.TreeExplainer(\n\u001b[32m 4\u001b[39m best_pipeline.named_steps[\u001b[33m'\u001b[39m\u001b[33mregressor\u001b[39m\u001b[33m'\u001b[39m],\n\u001b[32m 5\u001b[39m data_transform,\n\u001b[32m 6\u001b[39m feature_names = best_pipeline.named_steps[\u001b[33m'\u001b[39m\u001b[33mpreprocessing\u001b[39m\u001b[33m'\u001b[39m].get_feature_names_out()\n\u001b[32m 7\u001b[39m )\n", + "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'shap'" + ] + } + ], + "source": [ + "import shap\n", + "data_transform = best_pipeline.named_steps['preprocessing'].transform(X_test)\n", + "explainer = shap.Explainer( # picks a linear or tree explainer to match the selected regressor\n", + " best_pipeline.named_steps['regressor'],\n", + " data_transform,\n", + " feature_names = best_pipeline.named_steps['preprocessing'].get_feature_names_out()\n", + " )\n", + "shap_values = explainer(data_transform)" + ] }, { "cell_type": "code", @@ -423,7 +2346,7 @@ ], "metadata": { "kernelspec": { - "display_name": "env", + "display_name": "production-env", "language": "python", "name": "python3" }, @@ -437,7 +2360,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.14" } }, "nbformat": 4,