From a6329e853ded8b89b9ea04bd9d51391e49a5e1aa Mon Sep 17 00:00:00 2001 From: Victor Varaschin Date: Sun, 3 May 2020 17:19:22 -0400 Subject: [PATCH] Use SimpleImputer for categorical values with strategy=most_frequent --- 03_classification.ipynb | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index 9d1a2f509..b92ab10d8 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -2957,23 +2957,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will also need an imputer for the string categorical columns (the regular `SimpleImputer` does not work on those):" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "# Inspired from stackoverflow.com/questions/25239958\n", - "class MostFrequentImputer(BaseEstimator, TransformerMixin):\n", - " def fit(self, X, y=None):\n", - " self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],\n", - " index=X.columns)\n", - " return self\n", - " def transform(self, X, y=None):\n", - " return X.fillna(self.most_frequent_)" + "We will also need an imputer for the string categorical columns - we can again use the `SimpleImputer`, but with `strategy=\"most_frequent\"`:" ] }, { @@ -3011,7 +2995,7 @@ "source": [ "cat_pipeline = Pipeline([\n", " (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n", - " (\"imputer\", MostFrequentImputer()),\n", + " (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n", " (\"cat_encoder\", OneHotEncoder(sparse=False)),\n", " ])" ] @@ -4477,4 +4461,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file