From a6329e853ded8b89b9ea04bd9d51391e49a5e1aa Mon Sep 17 00:00:00 2001
From: Victor Varaschin <vvaraschin@gmail.com>
Date: Sun, 3 May 2020 17:19:22 -0400
Subject: [PATCH] Use SimpleImputer for categorical values with
 strategy=most_frequent

---
 03_classification.ipynb | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/03_classification.ipynb b/03_classification.ipynb
index 9d1a2f509..b92ab10d8 100644
--- a/03_classification.ipynb
+++ b/03_classification.ipynb
@@ -2957,23 +2957,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We will also need an imputer for the string categorical columns (the regular `SimpleImputer` does not work on those):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 115,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Inspired from stackoverflow.com/questions/25239958\n",
-    "class MostFrequentImputer(BaseEstimator, TransformerMixin):\n",
-    "    def fit(self, X, y=None):\n",
-    "        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],\n",
-    "                                        index=X.columns)\n",
-    "        return self\n",
-    "    def transform(self, X, y=None):\n",
-    "        return X.fillna(self.most_frequent_)"
+    "We will also need an imputer for the string categorical columns. We can again use the `SimpleImputer`, this time with `strategy=\"most_frequent\"`:"
    ]
   },
   {
@@ -3011,7 +2995,7 @@
    "source": [
     "cat_pipeline = Pipeline([\n",
     "        (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n",
-    "        (\"imputer\", MostFrequentImputer()),\n",
+    "        (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
     "        (\"cat_encoder\", OneHotEncoder(sparse=False)),\n",
     "    ])"
    ]
@@ -4477,4 +4461,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}
\ No newline at end of file