Skip to content

Commit a6329e8

Browse files
committed
Use SimpleImputer for categorical values with strategy=most_frequent
1 parent 0322dce commit a6329e8

File tree

1 file changed

+3
-19
lines changed

1 file changed

+3
-19
lines changed

03_classification.ipynb

+3-19
Original file line numberDiff line numberDiff line change
@@ -2957,23 +2957,7 @@
29572957
"cell_type": "markdown",
29582958
"metadata": {},
29592959
"source": [
2960-
"We will also need an imputer for the string categorical columns (the regular `SimpleImputer` does not work on those):"
2961-
]
2962-
},
2963-
{
2964-
"cell_type": "code",
2965-
"execution_count": 115,
2966-
"metadata": {},
2967-
"outputs": [],
2968-
"source": [
2969-
"# Inspired from stackoverflow.com/questions/25239958\n",
2970-
"class MostFrequentImputer(BaseEstimator, TransformerMixin):\n",
2971-
" def fit(self, X, y=None):\n",
2972-
" self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],\n",
2973-
" index=X.columns)\n",
2974-
" return self\n",
2975-
" def transform(self, X, y=None):\n",
2976-
" return X.fillna(self.most_frequent_)"
2960+
"We will also need an imputer for the string categorical columns. We can again use the `SimpleImputer`, this time with `strategy=\"most_frequent\"`:"
29772961
]
29782962
},
29792963
{
@@ -3011,7 +2995,7 @@
30112995
"source": [
30122996
"cat_pipeline = Pipeline([\n",
30132997
" (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n",
3014-
" (\"imputer\", MostFrequentImputer()),\n",
2998+
" (\"imputer\", SimpleImputer(strategy=\"most_frequent\")),\n",
30152999
" (\"cat_encoder\", OneHotEncoder(sparse=False)),\n",
30163000
" ])"
30173001
]
@@ -4477,4 +4461,4 @@
44774461
},
44784462
"nbformat": 4,
44794463
"nbformat_minor": 1
4480-
}
4464+
}

0 commit comments

Comments
 (0)