diff --git a/DataCleaning/suzzzal_data_imputation/data_imputation.ipynb b/DataCleaning/suzzzal_data_imputation/data_imputation.ipynb new file mode 100644 index 0000000..29a79a4 --- /dev/null +++ b/DataCleaning/suzzzal_data_imputation/data_imputation.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1240844f", + "metadata": {}, + "source": [ + " ", + "\n", + " ", + " " + ] + }, + { + "cell_type": "markdown", + "id": "cb43cadf", + "metadata": {}, + "source": [ + " ", + " ", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ef0adb0e", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.impute import SimpleImputer, KNNImputer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2d2ead02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
P_NAMEP_STATUSP_MASSP_MASS_ERROR_MINP_MASS_ERROR_MAXP_RADIUSP_RADIUS_ERROR_MINP_RADIUS_ERROR_MAXP_YEARP_UPDATED...P_HABZONE_CONP_TYPE_TEMPP_HABITABLEP_ESIS_CONSTELLATIONS_CONSTELLATION_ABRS_CONSTELLATION_ENGP_RADIUS_ESTP_MASS_ESTP_SEMI_MAJOR_AXIS_EST
011 Com b3.06165.86330-476.74200476.74200NaNNaNNaN20072014-05-14...0Hot00.083813Coma BerenicesComBerenice's Hair12.0827096165.863301.29
111 UMi b3.04684.78480-794.57001794.57001NaNNaNNaN20092018-09-06...0Hot00.082414Ursa MinorUMiLittle Bear12.2296414684.784801.53
214 And b3.01525.57440NaNNaNNaNNaNNaN20082014-05-14...0Hot00.081917AndromedaAndAndromeda12.8485161525.574400.83
314 Her b3.01481.07850-47.6742047.67420NaNNaNNaN20022018-09-06...0Cold00.145241HerculesHerHercules12.8652611481.078502.93
416 Cyg B b3.0565.73385-25.4262425.42624NaNNaNNaN19962018-09-06...1Warm00.368627CygnusCygSwan13.421749565.733851.66
\n", + "

5 rows × 112 columns

\n", + "
" + ], + "text/plain": [ + " P_NAME P_STATUS P_MASS P_MASS_ERROR_MIN P_MASS_ERROR_MAX \\\n", + "0 11 Com b 3.0 6165.86330 -476.74200 476.74200 \n", + "1 11 UMi b 3.0 4684.78480 -794.57001 794.57001 \n", + "2 14 And b 3.0 1525.57440 NaN NaN \n", + "3 14 Her b 3.0 1481.07850 -47.67420 47.67420 \n", + "4 16 Cyg B b 3.0 565.73385 -25.42624 25.42624 \n", + "\n", + " P_RADIUS P_RADIUS_ERROR_MIN P_RADIUS_ERROR_MAX P_YEAR P_UPDATED ... \\\n", + "0 NaN NaN NaN 2007 2014-05-14 ... \n", + "1 NaN NaN NaN 2009 2018-09-06 ... \n", + "2 NaN NaN NaN 2008 2014-05-14 ... \n", + "3 NaN NaN NaN 2002 2018-09-06 ... \n", + "4 NaN NaN NaN 1996 2018-09-06 ... \n", + "\n", + " P_HABZONE_CON P_TYPE_TEMP P_HABITABLE P_ESI S_CONSTELLATION \\\n", + "0 0 Hot 0 0.083813 Coma Berenices \n", + "1 0 Hot 0 0.082414 Ursa Minor \n", + "2 0 Hot 0 0.081917 Andromeda \n", + "3 0 Cold 0 0.145241 Hercules \n", + "4 1 Warm 0 0.368627 Cygnus \n", + "\n", + " S_CONSTELLATION_ABR S_CONSTELLATION_ENG P_RADIUS_EST P_MASS_EST \\\n", + "0 Com Berenice's Hair 12.082709 6165.86330 \n", + "1 UMi Little Bear 12.229641 4684.78480 \n", + "2 And Andromeda 12.848516 1525.57440 \n", + "3 Her Hercules 12.865261 1481.07850 \n", + "4 Cyg Swan 13.421749 565.73385 \n", + "\n", + " P_SEMI_MAJOR_AXIS_EST \n", + "0 1.29 \n", + "1 1.53 \n", + "2 0.83 \n", + "3 2.93 \n", + "4 1.66 \n", + "\n", + "[5 rows x 112 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('../../datasets/full_data.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e7d0eb63", + "metadata": {}, + "source": [ + " ", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bdeeb104", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "P_ATMOSPHERE 100.000000\n", + "P_ALT_NAMES 100.000000\n", + "P_DETECTION_RADIUS 100.000000\n", + "P_GEO_ALBEDO 100.000000\n", + "P_DETECTION_MASS 100.000000\n", + "S_MAGNETIC_FIELD 
100.000000\n", + "S_DISC 100.000000\n", + "P_TEMP_MEASURED 99.876482\n", + "P_GEO_ALBEDO_ERROR_MIN 99.876482\n", + "P_GEO_ALBEDO_ERROR_MAX 99.876482\n", + "P_TPERI_ERROR_MAX 88.339921\n", + "P_TPERI_ERROR_MIN 88.339921\n", + "P_TPERI 88.117589\n", + "P_OMEGA_ERROR_MIN 82.880435\n", + "P_OMEGA_ERROR_MAX 82.880435\n", + "P_ESCAPE 82.559289\n", + "P_POTENTIAL 82.559289\n", + "P_DENSITY 82.559289\n", + "P_GRAVITY 82.559289\n", + "P_OMEGA 81.571146\n", + "P_INCLINATION_ERROR_MAX 79.990119\n", + "P_INCLINATION_ERROR_MIN 79.940711\n", + "P_INCLINATION 79.150198\n", + "P_ECCENTRICITY_ERROR_MIN 76.012846\n", + "P_ECCENTRICITY_ERROR_MAX 76.012846\n", + "S_TYPE 66.156126\n", + "P_ECCENTRICITY 65.909091\n", + "P_IMPACT_PARAMETER_ERROR_MIN 65.242095\n", + "P_IMPACT_PARAMETER_ERROR_MAX 65.242095\n", + "P_IMPACT_PARAMETER 65.192688\n", + "dtype: float64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "missing_pct = df.isnull().mean() * 100\n", + "missing_pct.sort_values(ascending=False).head(30)" + ] + }, + { + "cell_type": "markdown", + "id": "775d2fe8", + "metadata": {}, + "source": [ + " ", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "277c6c2a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4048, 92)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "threshold = 80\n", + "cols_to_keep = missing_pct[missing_pct <= threshold].index\n", + "df_filtered = df[cols_to_keep]\n", + "df_filtered.shape" + ] + }, + { + "cell_type": "markdown", + "id": "0d0f1145", + "metadata": {}, + "source": [ + " ", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7b9ce26e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(78, 14)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_cols = 
df_filtered.select_dtypes(include=['int64', 'float64']).columns\n", + "cat_cols = df_filtered.select_dtypes(include=['object', 'category']).columns\n", + "len(num_cols), len(cat_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "a6ed4d11", + "metadata": {}, + "source": [ + " ", + " ", + " ", + " ", + "-", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a70a4bcf", + "metadata": {}, + "outputs": [], + "source": [ + "def make_preprocessor(num_imputer, num_features=None, cat_features=None):\n", + "    \"\"\"Build the unfitted preprocessing ColumnTransformer.\n", + "\n", + "    Numeric columns go through ``num_imputer`` and then ``StandardScaler``;\n", + "    categorical columns are imputed with the most frequent value and then\n", + "    one-hot encoded (categories unseen during fit are ignored at transform time).\n", + "\n", + "    Parameters\n", + "    ----------\n", + "    num_imputer : estimator\n", + "        Imputer used as the first step of the numeric pipeline\n", + "        (e.g. ``SimpleImputer`` or ``KNNImputer``).\n", + "    num_features, cat_features : list-like of column labels, optional\n", + "        Columns routed to the numeric / categorical pipeline. When omitted,\n", + "        the notebook-level ``num_cols`` / ``cat_cols`` are used.\n", + "\n", + "    Returns\n", + "    -------\n", + "    ColumnTransformer\n", + "        Unfitted transformer with a 'num' and a 'cat' branch.\n", + "    \"\"\"\n", + "    # Fall back to the notebook globals only when no explicit column lists\n", + "    # are passed, so existing one-argument calls keep working unchanged.\n", + "    if num_features is None:\n", + "        num_features = num_cols\n", + "    if cat_features is None:\n", + "        cat_features = cat_cols\n", + "\n", + "    # NOTE(review): imputation runs before scaling, so a distance-based\n", + "    # imputer such as KNNImputer compares rows on unscaled features;\n", + "    # confirm this ordering is intended.\n", + "    num_pipeline = Pipeline([\n", + "        ('imputer', num_imputer),\n", + "        ('scaler', StandardScaler())\n", + "    ])\n", + "\n", + "    # NOTE(review): every categorical column is one-hot encoded, presumably\n", + "    # including identifier-like text columns, which inflates the output width;\n", + "    # verify the column list before modelling.\n", + "    cat_pipeline = Pipeline([\n", + "        ('imputer', SimpleImputer(strategy='most_frequent')),\n", + "        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n", + "    ])\n", + "\n", + "    return ColumnTransformer([\n", + "        ('num', num_pipeline, num_features),\n", + "        ('cat', cat_pipeline, cat_features)\n", + "    ])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4a511168", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mean', 'median', 'knn']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "imputation_strategies = {\n", + " 'mean': SimpleImputer(strategy='mean', add_indicator=True),\n", + " 'median': SimpleImputer(strategy='median', add_indicator=True),\n", + " 'knn': KNNImputer(n_neighbors=5, weights='distance')\n", + "}\n", + "\n", + "list(imputation_strategies.keys())" + ] + }, + { + "cell_type": "markdown", + "id": "f7ded51c", + "metadata": {}, + "source": [ + " ", + " ", + " ", + " ", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "bd122941", + "metadata": {}, + "source": [ + " ", + " ", + "\n", + " ", + " ", + " ", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "373c3d60", + "metadata": {}, + "source": [ + " ", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "01fc282c", + "metadata":
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4048, 14424)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Build preprocessor using KNN imputation for numerical features.\n", + "# Reuse the KNN imputer already configured in imputation_strategies so its\n", + "# settings (n_neighbors=5, weights='distance') are defined in one place.\n", + "knn_preprocessor = make_preprocessor(imputation_strategies['knn'])\n", + "\n", + "# Apply preprocessing (fit on full filtered dataset for demonstration)\n", + "X_clean_knn = knn_preprocessor.fit_transform(df_filtered)\n", + "\n", + "X_clean_knn.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b873869a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...14414144151441614417144181441914420144211442214423
00.05.377444-0.8081520.5620881.703192-1.4131071.028977-1.947139-0.0227880.025305...0.00.00.00.00.00.00.00.00.00.0
10.03.949894-1.4708071.0446330.041843-0.5620060.367639-1.407238-0.0212020.025257...0.00.00.00.00.00.00.00.00.00.0
20.00.904860-0.1460140.0799201.5213680.094414-0.116902-1.677188-0.0239560.025306...0.00.00.00.00.00.00.00.00.00.0
30.00.8619720.086431-0.0893460.9528790.192663-0.204938-3.296892-0.0107240.025269...0.00.00.00.00.00.00.00.00.00.0
40.0-0.0202910.132817-0.1231240.4896720.194241-0.206154-4.916596-0.0188490.025294...0.00.00.00.00.00.00.00.00.00.0
\n", + "

5 rows × 14424 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 0.0 5.377444 -0.808152 0.562088 1.703192 -1.413107 1.028977 \n", + "1 0.0 3.949894 -1.470807 1.044633 0.041843 -0.562006 0.367639 \n", + "2 0.0 0.904860 -0.146014 0.079920 1.521368 0.094414 -0.116902 \n", + "3 0.0 0.861972 0.086431 -0.089346 0.952879 0.192663 -0.204938 \n", + "4 0.0 -0.020291 0.132817 -0.123124 0.489672 0.194241 -0.206154 \n", + "\n", + " 7 8 9 ... 14414 14415 14416 14417 14418 \\\n", + "0 -1.947139 -0.022788 0.025305 ... 0.0 0.0 0.0 0.0 0.0 \n", + "1 -1.407238 -0.021202 0.025257 ... 0.0 0.0 0.0 0.0 0.0 \n", + "2 -1.677188 -0.023956 0.025306 ... 0.0 0.0 0.0 0.0 0.0 \n", + "3 -3.296892 -0.010724 0.025269 ... 0.0 0.0 0.0 0.0 0.0 \n", + "4 -4.916596 -0.018849 0.025294 ... 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + " 14419 14420 14421 14422 14423 \n", + "0 0.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 14424 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Convert the transformed array back to a DataFrame for inspection\n", + "# (Feature names are omitted for simplicity)\n", + "df_clean_knn = pd.DataFrame(X_clean_knn)\n", + "\n", + "df_clean_knn.head()" + ] + }, + { + "cell_type": "markdown", + "id": "791cac62", + "metadata": {}, + "source": [ + "## After KNN Imputation\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "59bf9f01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.0\n", + "1 0.0\n", + "2 0.0\n", + "3 0.0\n", + "4 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_clean_knn.isnull().mean().sort_values(ascending=False).head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + },
+ "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}