diff --git a/01_materials/notebooks/Classification-1.ipynb b/01_materials/notebooks/Classification-1.ipynb index 7b6959a7a..27fa3d629 100644 --- a/01_materials/notebooks/Classification-1.ipynb +++ b/01_materials/notebooks/Classification-1.ipynb @@ -23,7 +23,15 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Matplotlib is building the font cache; this may take a moment.\n" + ] + } + ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", @@ -468,13 +476,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\n", + "\n", "RangeIndex: 569 entries, 0 to 568\n", - "Data columns (total 33 columns):\n", + "Data columns (total 32 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 569 non-null int64 \n", - " 1 diagnosis 569 non-null object \n", + " 1 diagnosis 569 non-null str \n", " 2 radius_mean 569 non-null float64\n", " 3 texture_mean 569 non-null float64\n", " 4 perimeter_mean 569 non-null float64\n", @@ -505,9 +513,8 @@ " 29 concave points_worst 569 non-null float64\n", " 30 symmetry_worst 569 non-null float64\n", " 31 fractal_dimension_worst 569 non-null float64\n", - " 32 Unnamed: 32 0 non-null float64\n", - "dtypes: float64(31), int64(1), object(1)\n", - "memory usage: 146.8+ KB\n" + "dtypes: float64(30), int64(1), str(1)\n", + "memory usage: 142.4 KB\n" ] } ], @@ -531,7 +538,9 @@ { "data": { "text/plain": [ - "array(['M', 'B'], dtype=object)" + "\n", + "['M', 'B']\n", + "Length: 2, dtype: str" ] }, "execution_count": 4, @@ -558,7 +567,9 @@ { "data": { "text/plain": [ - "array(['Malignant', 'Benign'], dtype=object)" + "\n", + "['Malignant', 'Benign']\n", + "Length: 2, dtype: str" ] }, "execution_count": 5, @@ -596,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -608,7 +619,7 @@ "dtype: float64" ] }, - 
"execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -626,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -638,7 +649,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 21, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -2326,7 +2337,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "lcr-env (3.11.14)", "language": "python", "name": "python3" }, @@ -2340,7 +2351,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.11.14" } }, "nbformat": 4, diff --git a/01_materials/notebooks/Classification-2.ipynb b/01_materials/notebooks/Classification-2.ipynb index 96db650b8..50f16a16d 100644 --- a/01_materials/notebooks/Classification-2.ipynb +++ b/01_materials/notebooks/Classification-2.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -2789,7 +2789,7 @@ ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "lcr-env (3.11.14)", "language": "python", "name": "python3" }, @@ -2803,7 +2803,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.14" } }, "nbformat": 4, diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb index 1d25bbcb3..157eb6e12 100644 --- a/02_activities/assignments/assignment_1.ipynb +++ b/02_activities/assignments/assignment_1.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4a3485d6-ba58-4660-a983-5680821c5719", "metadata": {}, "outputs": [], @@ -56,10 +56,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
alcoholmalic_acidashalcalinity_of_ashmagnesiumtotal_phenolsflavanoidsnonflavanoid_phenolsproanthocyaninscolor_intensityhueod280/od315_of_diluted_winesprolineclass
014.231.712.4315.6127.02.803.060.282.295.641.043.921065.00
113.201.782.1411.2100.02.652.760.261.284.381.053.401050.00
213.162.362.6718.6101.02.803.240.302.815.681.033.171185.00
314.371.952.5016.8113.03.853.490.242.187.800.863.451480.00
413.242.592.8721.0118.02.802.690.391.824.321.042.93735.00
.............................................
17313.715.652.4520.595.01.680.610.521.067.700.641.74740.02
17413.403.912.4823.0102.01.800.750.431.417.300.701.56750.02
17513.274.282.2620.0120.01.590.690.431.3510.200.591.56835.02
17613.172.592.3720.0120.01.650.680.531.469.300.601.62840.02
17714.134.102.7424.596.02.050.760.561.359.200.611.60560.02
\n", + "

178 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n", + "0 14.23 1.71 2.43 15.6 127.0 2.80 \n", + "1 13.20 1.78 2.14 11.2 100.0 2.65 \n", + "2 13.16 2.36 2.67 18.6 101.0 2.80 \n", + "3 14.37 1.95 2.50 16.8 113.0 3.85 \n", + "4 13.24 2.59 2.87 21.0 118.0 2.80 \n", + ".. ... ... ... ... ... ... \n", + "173 13.71 5.65 2.45 20.5 95.0 1.68 \n", + "174 13.40 3.91 2.48 23.0 102.0 1.80 \n", + "175 13.27 4.28 2.26 20.0 120.0 1.59 \n", + "176 13.17 2.59 2.37 20.0 120.0 1.65 \n", + "177 14.13 4.10 2.74 24.5 96.0 2.05 \n", + "\n", + " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n", + "0 3.06 0.28 2.29 5.64 1.04 \n", + "1 2.76 0.26 1.28 4.38 1.05 \n", + "2 3.24 0.30 2.81 5.68 1.03 \n", + "3 3.49 0.24 2.18 7.80 0.86 \n", + "4 2.69 0.39 1.82 4.32 1.04 \n", + ".. ... ... ... ... ... \n", + "173 0.61 0.52 1.06 7.70 0.64 \n", + "174 0.75 0.43 1.41 7.30 0.70 \n", + "175 0.69 0.43 1.35 10.20 0.59 \n", + "176 0.68 0.53 1.46 9.30 0.60 \n", + "177 0.76 0.56 1.35 9.20 0.61 \n", + "\n", + " od280/od315_of_diluted_wines proline class \n", + "0 3.92 1065.0 0 \n", + "1 3.40 1050.0 0 \n", + "2 3.17 1185.0 0 \n", + "3 3.45 1480.0 0 \n", + "4 2.93 735.0 0 \n", + ".. ... ... ... 
\n", + "173 1.74 740.0 2 \n", + "174 1.56 750.0 2 \n", + "175 1.56 835.0 2 \n", + "176 1.62 840.0 2 \n", + "177 1.60 560.0 2 \n", + "\n", + "[178 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.datasets import load_wine\n", "\n", @@ -91,12 +369,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "56916892", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "178\n" + ] + } + ], "source": [ - "# Your answer here" + "row_count = wine_df.shape[0]\n", + "print(row_count)" ] }, { @@ -109,12 +396,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "df0ef103", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14\n" + ] + } + ], "source": [ - "# Your answer here" + "print(wine_df.shape[1])" ] }, { @@ -128,11 +423,31 @@ { "cell_type": "code", "execution_count": null, - "id": "47989426", + "id": "6ab6270d", "metadata": {}, "outputs": [], "source": [ - "# Your answer here" + "# Variable type is categorical" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "47989426", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2]\n", + "Categories (3, int64): [0, 1, 2]\n" + ] + } + ], + "source": [ + "# The levels or values are 0, 1, 2\n", + "print(wine_df['class'].unique())" ] }, { @@ -146,12 +461,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "bd7b0910", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13\n" + ] + } + ], "source": [ - "# Your answer here" + "num_predictors = wine_df.shape[1] - 1\n", + "print(num_predictors)" ] }, { @@ -175,10 +499,37 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 10, "id": "cc899b59", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n", + "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n", + "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n", + "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n", + "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n", + "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n", + "\n", + " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n", + "0 0.808997 1.034819 -0.659563 1.224884 \n", + "1 0.568648 0.733629 -0.820719 -0.544721 \n", + "2 0.808997 1.215533 -0.498407 2.135968 \n", + "3 2.491446 1.466525 -0.981875 1.032155 \n", + "4 0.808997 0.663351 0.226796 0.401404 \n", + "\n", + " color_intensity hue od280/od315_of_diluted_wines proline \n", + "0 0.251717 0.362177 1.847920 1.013009 \n", + "1 -0.293321 0.406051 1.113449 0.965242 \n", + "2 0.269020 0.318304 0.788587 1.395148 \n", + "3 1.186068 -0.427544 1.184071 2.334574 \n", + "4 -0.319276 0.362177 0.449601 -0.037874 \n" + ] + } + ], "source": [ "# Select predictors (excluding the last column)\n", "predictors = wine_df.iloc[:, :-1]\n", @@ -204,7 +555,7 @@ "id": "403ef0bb", "metadata": {}, "source": [ - "> Your answer here..." + "# It is important to standardize the predictor variables to increase accuracy and make sure that the variables are being compared on the same scale. " ] }, { @@ -220,7 +571,7 @@ "id": "fdee5a15", "metadata": {}, "source": [ - "> Your answer here..." + "# We did not standardize the response variable because it is not a measurement, but a category that we gave the wine. " ] }, { @@ -236,7 +587,18 @@ "id": "f0676c21", "metadata": {}, "source": [ - "> Your answer here..." + "# Setting a random seed is important for reliability, to make sure the data is reproducible. 
The particular seed value is not important because it does not take away all the randomness but instead sets the starting point - read this article: http://misailo.cs.illinois.edu/papers/icst22.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "59668ab8", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "np.random.seed(42)" ] }, { @@ -261,7 +623,12 @@ "\n", "# split the data into a training and testing set. hint: use train_test_split !\n", "\n", - "# Your code here ..." + "X = predictors_standardized # features: standardized predictors only; 'class' must stay out of X to avoid label leakage\n", + "Y = wine_df['class']\n", + "\n", + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y,\n", + " test_size=0.2,\n", + " random_state=123)" ] }, { @@ -284,12 +651,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "08818c64", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + } + ], "source": [ - "# Your code here..." + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "knn = KNeighborsClassifier()\n", + "\n", + "param_grid = {'n_neighbors': list(range(1, 51))}\n", + "grid_search = GridSearchCV(\n", + " estimator=knn,\n", + " param_grid=param_grid,\n", + " cv=10,\n", + " scoring='accuracy')\n", + "\n", + "grid_search.fit(X_train, Y_train)\n", + "\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "print(best_k)" ] }, { @@ -308,9 +697,60 @@ "execution_count": null, "id": "ffefa9f2", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best n_neighbors: 1\n" + ] + } + ], + "source": [ + "#test\n", + "best_k = grid_search.best_params_['n_neighbors']\n", + "print(\"Best n_neighbors:\", best_k)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "be050743", + "metadata": {}, "outputs": [], "source": [ - "# Your code here..." 
+ "#train model\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "\n", + "final_knn = KNeighborsClassifier(n_neighbors=best_k)\n", + "final_knn.fit(X_train, Y_train)\n", + "\n", + "#predict\n", + "Y_pred = final_knn.predict(X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e80c8d46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test accuracy: 1.0\n" + ] + } + ], + "source": [ + "#test accuracyy\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "test_accuracy = accuracy_score(Y_test, Y_pred)\n", + "print(\"Test accuracy:\", test_accuracy)\n", + "\n", + "#very accurate!" ] }, { @@ -365,7 +805,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.4", + "display_name": "lcr-env (3.11.14)", "language": "python", "name": "python3" }, @@ -379,12 +819,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" - }, - "vscode": { - "interpreter": { - "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e" - } + "version": "3.11.14" } }, "nbformat": 4,