PERF: Fixed cut regression, improve Categorical (pandas-dev#34952)

smithto1 · Jun 24, 2020 · 314ac9a · 314ac9a
1 parent 3f4f564
commit 314ac9a
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 0 deletions.
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
@@ -34,6 +34,7 @@ def setup(self):
         self.values_all_int8 = np.ones(N, "int8")
         self.categorical = pd.Categorical(self.values, self.categories)
         self.series = pd.Series(self.categorical)
+        self.intervals = pd.interval_range(0, 1, periods=N // 10)
 
     def time_regular(self):
         pd.Categorical(self.values, self.categories)
@@ -44,6 +45,9 @@ def time_fastpath(self):
     def time_datetimes(self):
         pd.Categorical(self.datetimes)
 
+    def time_interval(self):  # uses the IntervalIndex built in setup()
+        pd.Categorical(self.intervals, categories=self.intervals)
+
     def time_datetimes_with_nat(self):
         pd.Categorical(self.datetimes_with_nat)
 

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -828,6 +828,8 @@ Performance improvements
 - Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first`
   and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`)
 - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
+- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`)
+- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`)
 - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
 - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
 - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2611,6 +2611,11 @@ def _get_codes_for_values(values, categories):
         values = ensure_object(values)
         categories = ensure_object(categories)
 
+    if isinstance(categories, ABCIndexClass):
+        return coerce_indexer_dtype(categories.get_indexer_for(values), categories)
+
+    # Only hit here when we've already coerced to object dtype.
+
     hash_klass, vals = _get_data_algo(values)
     _, cats = _get_data_algo(categories)
     t = hash_klass(len(cats))

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -643,3 +643,45 @@ def test_constructor_string_and_tuples(self):
         c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
         expected_index = pd.Index([("a", "b"), ("b", "a"), "c"])
         assert c.categories.equals(expected_index)
+
+    def test_interval(self):
+        idx = pd.interval_range(0, 10, periods=10)
+        cat = pd.Categorical(idx, categories=idx)
+        expected_codes = np.arange(10, dtype="int8")
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # no categories passed: they are inferred from the values
+        cat = pd.Categorical(idx)
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # values given as a plain list, categories inferred
+        cat = pd.Categorical(list(idx))
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # both values and categories given as plain lists
+        cat = pd.Categorical(list(idx), categories=list(idx))
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # values in a different order than the categories
+        values = idx.take([1, 2, 0])
+        cat = pd.Categorical(values, categories=idx)
+        tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
+        tm.assert_index_equal(cat.categories, idx)
+
+        # a value absent from the categories is expected to get code -1
+        values = pd.interval_range(8, 11, periods=3)
+        cat = pd.Categorical(values, categories=idx)
+        expected_codes = np.array([8, 9, -1], dtype="int8")
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)
+
+        # categories whose intervals overlap each other
+        idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)])
+        cat = pd.Categorical(idx, categories=idx)
+        expected_codes = np.array([0, 1], dtype="int8")
+        tm.assert_numpy_array_equal(cat.codes, expected_codes)
+        tm.assert_index_equal(cat.categories, idx)