diff --git a/assignment_1.ipynb b/assignment_1.ipynb new file mode 100644 index 000000000..d964fcc52 --- /dev/null +++ b/assignment_1.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Assignment 1\n", + "Classification using KNN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import standard libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as mcolors\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.metrics import recall_score, precision_score\n", + "from sklearn.model_selection import cross_validate\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.datasets import load_wine\n", + "\n", + "# Fetch the Wine dataset via scikit-learn's load_wine\n", + "wine_bunch = load_wine()\n", + "\n", + "# Put the feature values into a DataFrame with named columns\n", + "wine_df = pd.DataFrame(data=wine_bunch.data, columns=wine_bunch.feature_names)\n", + "\n", + "# Append the wine target as the 'class' column\n", + "wine_df['class'] = wine_bunch.target\n", + "\n", + "# Show the resulting DataFrame\n", + "wine_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question 1: Data Inspection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# (i) How many observations (rows) does the dataset contain?\n", + "print(len(wine_df))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# (ii) How many variables (columns) does the dataset contain?\n", + "print(len(wine_df.columns))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, +
"outputs": [], + "source": [ + "# (iii) Variable type and unique values of 'class'\n", + "print(wine_df['class'].dtype)\n", + "print(wine_df['class'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# (iv) Number of predictor variables (all columns except 'class')\n", + "print(len(wine_df.columns) - 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question 2: Standardization and Data Splitting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select predictors by name (every column except the 'class' label) so the choice does not depend on column order\n", + "predictors = wine_df.drop(columns='class')\n", + "\n", + "# Standardize the predictors. NOTE(review): the scaler is fit on all rows, i.e. before the train/test split, so test rows contribute to the scaling statistics.\n", + "scaler = StandardScaler()\n", + "predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns, index=predictors.index)\n", + "\n", + "# Show the first rows of the standardized predictors (rich display as the cell's last expression)\n", + "predictors_standardized.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**(i)** Standardization is important for KNN because the algorithm calculates distances between data points. If features are on very different scales, the feature with the largest scale will dominate the distance calculation, biasing the model. Standardizing puts all features on the same scale so each contributes equally.\n", + "\n", + "**(ii)** We do not standardize the response variable `class` because it is a categorical label we are trying to predict, not a numeric input feature. Transforming it would distort its meaning.\n", + "\n", + "**(iii)** Setting a random seed ensures reproducibility — anyone running the code will get the same train/test split and results. The specific value of the seed does not matter; what matters is that one is set consistently."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# (iii) Fix one seed value and reuse it so the split is reproducible\n", + "SEED = 123\n", + "np.random.seed(SEED)\n", + "\n", + "# (iv) Hold out 25% of the rows for testing; the remaining 75% are used for training\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + "    predictors_standardized, wine_df['class'],\n", + "    test_size=0.25, random_state=SEED,\n", + ")\n", + "\n", + "# Report the shape of each partition\n", + "print(\"Training set size:\", X_train.shape)\n", + "print(\"Test set size:\", X_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question 3: Model Initialization and Cross-Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Base KNN classifier with default settings\n", + "base_knn = KNeighborsClassifier()\n", + "\n", + "# Candidate values for n_neighbors: every integer from 1 to 50\n", + "k_grid = {'n_neighbors': list(range(1, 51))}\n", + "\n", + "# Grid search scored by accuracy with 10-fold cross-validation\n", + "grid_search = GridSearchCV(estimator=base_knn, param_grid=k_grid, cv=10, scoring='accuracy')\n", + "\n", + "# Run the search on the training data only\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "# Report the n_neighbors value selected by the search\n", + "print(\"Best n_neighbors:\", grid_search.best_params_['n_neighbors'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question 4: Model Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build a fresh KNN with the n_neighbors chosen by the grid search and fit it on the training set\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "best_knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)\n", + "\n", + "# Predict on the test set\n", + "y_pred = best_knn.predict(X_test)\n", + "\n", + "# Evaluate with accuracy\n", + "test_accuracy = accuracy_score(y_test, y_pred)\n", + "print(\"Best n_neighbors:\", best_k)\n", + "print(\"Test Accuracy:\", test_accuracy)" +
] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}