Skip to content

Commit 01c3e8b

Browse files
authored
PERF: skip libjoin fastpath for MultiIndex (#54765)
* PERF: skip libjoin fastpath for MultiIndex
* fix levels sort
1 parent beb719b commit 01c3e8b

File tree

1 file changed

+15
-27
lines changed

1 file changed

+15
-27
lines changed

pandas/core/indexes/base.py

+15-27
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@
124124
from pandas.core.dtypes.generic import (
125125
ABCDataFrame,
126126
ABCDatetimeIndex,
127+
ABCIntervalIndex,
127128
ABCMultiIndex,
128129
ABCPeriodIndex,
129130
ABCSeries,
@@ -3492,8 +3493,6 @@ def _intersection(self, other: Index, sort: bool = False):
34923493
and other.is_monotonic_increasing
34933494
and self._can_use_libjoin
34943495
and other._can_use_libjoin
3495-
and not isinstance(self, ABCMultiIndex)
3496-
and not isinstance(other, ABCMultiIndex)
34973496
):
34983497
try:
34993498
res_indexer, indexer, _ = self._inner_indexer(other)
@@ -4632,28 +4631,13 @@ def join(
46324631

46334632
_validate_join_method(how)
46344633

4635-
if not self.is_unique and not other.is_unique:
4636-
return self._join_non_unique(other, how=how, sort=sort)
4637-
elif not self.is_unique or not other.is_unique:
4638-
if self.is_monotonic_increasing and other.is_monotonic_increasing:
4639-
# Note: 2023-08-15 we *do* have tests that get here with
4640-
# Categorical, string[python] (can use libjoin)
4641-
# and Interval (cannot)
4642-
if self._can_use_libjoin and other._can_use_libjoin:
4643-
# otherwise we will fall through to _join_via_get_indexer
4644-
# GH#39133
4645-
# go through object dtype for ea till engine is supported properly
4646-
return self._join_monotonic(other, how=how)
4647-
else:
4648-
return self._join_non_unique(other, how=how, sort=sort)
4649-
elif (
4650-
# GH48504: exclude MultiIndex to avoid going through MultiIndex._values
4651-
self.is_monotonic_increasing
4634+
if (
4635+
not isinstance(self.dtype, CategoricalDtype)
4636+
and self.is_monotonic_increasing
46524637
and other.is_monotonic_increasing
46534638
and self._can_use_libjoin
46544639
and other._can_use_libjoin
4655-
and not isinstance(self, ABCMultiIndex)
4656-
and not isinstance(self.dtype, CategoricalDtype)
4640+
and (self.is_unique or other.is_unique)
46574641
):
46584642
# Categorical is monotonic if data are ordered as categories, but join cannot
46594643
# handle this when the data are not lexicographically monotonic (GH#38502)
@@ -4662,6 +4646,8 @@ def join(
46624646
except TypeError:
46634647
# object dtype; non-comparable objects
46644648
pass
4649+
elif not self.is_unique or not other.is_unique:
4650+
return self._join_non_unique(other, how=how, sort=sort)
46654651

46664652
return self._join_via_get_indexer(other, how, sort)
46674653

@@ -4797,6 +4783,9 @@ def _join_non_unique(
47974783
join_idx = self.take(left_idx)
47984784
right = other.take(right_idx)
47994785
join_index = join_idx.putmask(mask, right)
4786+
if isinstance(join_index, ABCMultiIndex) and how == "outer":
4787+
# test_join_index_levels
4788+
join_index = join_index._sort_levels_monotonic()
48004789
return join_index, left_idx, right_idx
48014790

48024791
@final
@@ -5042,10 +5031,10 @@ def _can_use_libjoin(self) -> bool:
50425031
or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray))
50435032
or self.dtype == "string[python]"
50445033
)
5045-
# For IntervalIndex, the conversion to numpy converts
5046-
# to object dtype, which negates the performance benefit of libjoin
5047-
# TODO: exclude RangeIndex and MultiIndex as these also make copies?
5048-
return not isinstance(self.dtype, IntervalDtype)
5034+
# Exclude index types where the conversion to numpy converts to object dtype,
5035+
# which negates the performance benefit of libjoin
5036+
# TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone
5037+
return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex))
50495038

50505039
# --------------------------------------------------------------------
50515040
# Uncategorized Methods
@@ -5180,8 +5169,7 @@ def _get_join_target(self) -> np.ndarray:
51805169
# present
51815170
return self._values.to_numpy()
51825171

5183-
# TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create
5184-
# copies.
5172+
# TODO: exclude ABCRangeIndex case here as it copies
51855173
target = self._get_engine_target()
51865174
if not isinstance(target, np.ndarray):
51875175
raise ValueError("_can_use_libjoin should return False.")

0 commit comments

Comments
 (0)