From 59c5c66752465a8cafef2c0396dedf7a7e7895ab Mon Sep 17 00:00:00 2001 From: Baruni Prabaharan <239645217+barunip@users.noreply.github.com> Date: Sat, 13 Dec 2025 13:37:43 -0500 Subject: [PATCH] Committing Assignment 1 responses --- 02_activities/assignments/assignment_1.ipynb | 546 +++++++++++++++++-- 1 file changed, 513 insertions(+), 33 deletions(-) diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 28d4df017..49a94963c 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 178 rows.\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "\n", + "from sklearn.datasets import load_wine\n", + "\n", + "# Load Wine dataset\n", + "wine = load_wine()\n", + "\n", + "# Get number of rows\n", + "num_rows = wine.data.shape[0]\n", + "print(f\"The dataset contains {num_rows} rows.\")\n" ] }, { @@ -109,12 +405,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 13 columns.\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "\n", + "\n", + "from sklearn.datasets import load_wine\n", + "\n", + "# Load Wine dataset\n", + "wine = load_wine()\n", + "\n", + "# Get number of columns\n", + "num_columns = wine.data.shape[1]\n", + "print(f\"The dataset contains {num_columns} columns.\")" ] }, { @@ -127,12 +441,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "47989426", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pandas dtype of 'class': int64\n", + "NumPy dtype of 'class': int64\n", + "Python type of elements: \n", + "Levels (unique values) of 'class': [np.int64(0), np.int64(1), np.int64(2)]\n" + ] + } + ], "source": [ - "# Your answer here" + "# 
Your answer here\n", + "\n", + "from sklearn.datasets import load_wine\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Load Wine dataset\n", + "wine = load_wine()\n", + "\n", + "# Create a pandas Series for the response variable 'class'\n", + "y = pd.Series(wine.target, name='class')\n", + "\n", + "# Query the variable type (dtype)\n", + "print(f\"pandas dtype of 'class': {y.dtype}\") # e.g., int64\n", + "\n", + "# Also show the underlying NumPy dtype and Python type of elements, should be the same\n", + "print(f\"NumPy dtype of 'class': {y.to_numpy().dtype}\") \n", + "print(f\"Python type of elements: {type(y.iloc[0])}\") \n", + "\n", + "# (b) Query the levels (unique values)\n", + "levels = sorted(y.unique())\n", + "print(f\"Levels (unique values) of 'class': {levels}\")\n", + "\n", + "\n" ] }, { @@ -146,12 +494,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset has 13 predictor variables.\n" + ] + } + ], "source": [ - "# Your answer here" + "# Your answer here\n", + "from sklearn.datasets import load_wine\n", + "\n", + "# Load Wine dataset\n", + "wine = load_wine()\n", + "\n", + "# Number of predictor variables = number of columns in data\n", + "num_predictors = wine.data.shape[1]\n", + "print(f\"The dataset has {num_predictors} predictor variables.\")\n" ] }, { @@ -175,10 +539,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 
\n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -196,7 +587,7 @@ "id": "9981ca48", "metadata": {}, "source": [ - "(i) Why is it important to standardize the predictor variables?" + "(i) Why is it important to standardize the predictor variables?\n" ] }, { @@ -204,7 +595,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "Standardization is important because KNN relies on distances, and features with larger scales would otherwise dominate the calculation." ] }, { @@ -220,7 +611,8 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "\n", + "Standardization is only meaningful for continuous numeric predictors, but 'class' contains discrete categories. If we standardized 'class', it would distort its categorical meaning and break classification. KNN uses 'class' only for labeling, not for distance calculations." ] }, { @@ -236,7 +628,10 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "\n", + "random.seed(42) \n", + "np.random.seed(42) \n", + " It makes results reproducible across runs and machines (e.g., same train/test split), which is important for debugging, comparison, and auditability." 
] }, { @@ -254,14 +649,42 @@ "execution_count": null, "id": "72c101f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set size: 133 samples\n", + "Testing set size: 45 samples\n" + ] + } + ], "source": [ "# set a seed for reproducibility\n", "np.random.seed(123)\n", "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." + "# Your code here ...\n", + "\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Set a seed for reproducibility\n", + "np.random.seed(123)\n", + "RANDOM_STATE = 123 # use the same random_state in sklearn\n", + "\n", + "\n", + "# 75% train / 25% test split (stratified to preserve class proportions)\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " predictors_standardized,\n", + " wine_df['class'],\n", + " test_size=0.25,\n", + " stratify=wine_df['class'],\n", + " random_state=RANDOM_STATE\n", + ")\n", + "print(f\"Training set size: {X_train.shape[0]} samples\")\n", + "print(f\"Testing set size: {X_test.shape[0]} samples\")\n" ] }, { @@ -284,12 +707,74 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KNeighborsClassifier()\n", + "{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]}\n", + "Best n_neighbors: 13\n", + "Best 10-fold CV accuracy: 0.9851648351648352\n", + "Test accuracy with best model: 0.9111111111111111\n" + ] + } + ], "source": [ - "# Your code here..." 
+ "# Your code here...\n", + "\n", + "\n", + "# Step 1: Initialize the KNN classifier\n", + "knn = KNeighborsClassifier() # Leave default values \n", + "print(knn)\n", + "\n", + "\n", + "# Step 2: Define the parameter grid for n_neighbors\n", + "param_grid = {\n", + " 'n_neighbors': list(range(1, 51)) # Values from 1 to 50\n", + "}\n", + "\n", + "print(param_grid)\n", + "\n", + "\n", + "# 3) Build a pipeline: Standardize features -> KNN classifier\n", + "from sklearn.pipeline import Pipeline\n", + "pipe = Pipeline([\n", + " (\"scaler\", StandardScaler()),\n", + " (\"knn\", KNeighborsClassifier())\n", + "])\n", + "\n", + "# 4) Parameter grid for n_neighbors from 1 to 50\n", + "param_grid = {\n", + " \"knn__n_neighbors\": list(range(1, 51))\n", + "}\n", + "\n", + "# 5) Grid search with 10-fold CV, optimize accuracy\n", + "grid = GridSearchCV(\n", + " estimator=pipe,\n", + " param_grid=param_grid,\n", + " cv=10,\n", + " scoring=\"accuracy\",\n", + " n_jobs=-1,\n", + " return_train_score=False\n", + ")\n", + "\n", + "# 6) Fit on training data\n", + "grid.fit(X_train, y_train)\n", + "\n", + "best_k = grid.best_params_[\"knn__n_neighbors\"]\n", + "best_cv_score = grid.best_score_\n", + "\n", + "# 7) Evaluate best model on the held-out test set\n", + "best_model = grid.best_estimator_\n", + "test_accuracy = accuracy_score(y_test, best_model.predict(X_test))\n", + "\n", + "print(\"Best n_neighbors:\", best_k)\n", + "print(\"Best 10-fold CV accuracy:\", best_cv_score)\n", + "print(\"Test accuracy with best model:\", test_accuracy)\n" ] }, { @@ -365,7 +850,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env", "language": "python", "name": "python3" }, @@ -379,12 +864,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.14" } }, "nbformat": 
4,