Skip to content

Commit

Permalink
PERF: Fixed cut regression, improve Categorical (pandas-dev#34952)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored Jun 24, 2020
1 parent 3f4f564 commit 314ac9a
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 0 deletions.
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def setup(self):
self.values_all_int8 = np.ones(N, "int8")
self.categorical = pd.Categorical(self.values, self.categories)
self.series = pd.Series(self.categorical)
self.intervals = pd.interval_range(0, 1, periods=N // 10)

def time_regular(self):
pd.Categorical(self.values, self.categories)
Expand All @@ -44,6 +45,9 @@ def time_fastpath(self):
def time_datetimes(self):
    """Benchmark constructing a Categorical from ``self.datetimes``."""
    stamps = self.datetimes
    pd.Categorical(stamps)

def time_interval(self):
    """Benchmark constructing a Categorical from interval data.

    Both the values and the explicit ``categories`` are ``self.intervals``,
    the interval range built in ``setup``, so that the interval-specific
    construction path is what gets timed rather than the datetime path
    already covered by ``time_datetimes``.
    """
    pd.Categorical(self.intervals, categories=self.intervals)

def time_datetimes_with_nat(self):
    """Benchmark constructing a Categorical from ``self.datetimes_with_nat``."""
    stamps = self.datetimes_with_nat
    pd.Categorical(stamps)

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,8 @@ Performance improvements
- Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first`
and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`)
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`)
- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`)
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`)
- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`)
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2611,6 +2611,11 @@ def _get_codes_for_values(values, categories):
values = ensure_object(values)
categories = ensure_object(categories)

if isinstance(categories, ABCIndexClass):
return coerce_indexer_dtype(categories.get_indexer_for(values), categories)

# Only hit here when we've already coerced to object dtype.

hash_klass, vals = _get_data_algo(values)
_, cats = _get_data_algo(categories)
t = hash_klass(len(cats))
Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,3 +643,45 @@ def test_constructor_string_and_tuples(self):
c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
expected_index = pd.Index([("a", "b"), ("b", "a"), "c"])
assert c.categories.equals(expected_index)

def test_interval(self):
    """Check the codes and categories of Categoricals built from intervals.

    Each scenario constructs a Categorical and verifies both ``codes`` and
    ``categories``: explicit categories, inferred categories, list inputs,
    shuffled values, values not present in the categories (coded as -1),
    and overlapping intervals.
    """

    def check(values, want_codes, want_categories, **kwargs):
        # Build the Categorical, then verify codes first and categories second.
        result = pd.Categorical(values, **kwargs)
        tm.assert_numpy_array_equal(result.codes, want_codes)
        tm.assert_index_equal(result.categories, want_categories)

    intervals = pd.interval_range(0, 10, periods=10)
    all_codes = np.arange(10, dtype="int8")

    # explicit categories
    check(intervals, all_codes, intervals, categories=intervals)

    # infer categories
    check(intervals, all_codes, intervals)

    # list values
    check(list(intervals), all_codes, intervals)

    # list values, categories
    check(list(intervals), all_codes, intervals, categories=list(intervals))

    # shuffled
    shuffled = intervals.take([1, 2, 0])
    shuffled_codes = np.array([1, 2, 0], dtype="int8")
    check(shuffled, shuffled_codes, intervals, categories=intervals)

    # extra
    extra = pd.interval_range(8, 11, periods=3)
    extra_codes = np.array([8, 9, -1], dtype="int8")
    check(extra, extra_codes, intervals, categories=intervals)

    # overlapping
    overlapping = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)])
    overlapping_codes = np.array([0, 1], dtype="int8")
    check(overlapping, overlapping_codes, overlapping, categories=overlapping)

0 comments on commit 314ac9a

Please sign in to comment.