diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2d74be6f503a2..87dba1bfa3a1f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -807,6 +807,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) +- Bug in :meth:`DataFrame.unstack` raising an error or returning columns in the wrong order when ``sort=False`` and the unstacked level contains NA values (:issue:`61221`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) Sparse diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d2a838b616426..c63e3d28dad2d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -134,6 +134,10 @@ def __init__( self.removed_level_full = index.levels[self.level] if not self.sort: unique_codes = unique(self.index.codes[self.level]) + # Bug Fix GH 61221 + # The -1 (NA) in the unsorted unique codes causes errors, so drop it + # here; the NA position is recomputed from the codes in the repeater + unique_codes = unique_codes[unique_codes != -1] self.removed_level = self.removed_level.take(unique_codes) self.removed_level_full = self.removed_level_full.take(unique_codes) @@ -170,7 +174,14 @@ def _indexer_and_to_sort( codes = list(self.index.codes) if not self.sort: # Create new codes considering that labels are already sorted - codes = [factorize(code)[0] for code in codes] + # setting nans back to nan to maintain the -1 values + if self.lift: + codes = [ + factorize(np.where(code == -1, np.nan, code))[0] for code in codes + ] + else: + codes =
[factorize(code)[0] for code in codes] + levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) @@ -189,9 +200,15 @@ def sorted_labels(self) -> list[np.ndarray]: return to_sort def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: - indexer, _ = self._indexer_and_to_sort - sorted_values = algos.take_nd(values, indexer, axis=0) - return sorted_values + if self.sort: + indexer, _ = self._indexer_and_to_sort + sorted_values = algos.take_nd(values, indexer, axis=0) + return sorted_values + level_sizes = tuple(len(level) for level in self.new_index_levels) + group_ids = get_group_index( + self.sorted_labels[:-1], level_sizes, sort=False, xnull=False + ) + return values[np.argsort(group_ids, kind="mergesort")] def _make_selectors(self) -> None: new_levels = self.new_index_levels @@ -381,11 +398,22 @@ def _repeater(self) -> np.ndarray: # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: - repeater = np.insert(repeater, 0, -1) + if not self.sort: + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] + repeater = np.insert(repeater, na_index, -1) + else: + repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift - repeater = np.arange(stride) - self.lift + if self.sort or not self.lift: + repeater = np.arange(stride) - self.lift + else: + na_index = (self.index.codes[self.level] == -1).nonzero()[0][0] + repeater = np.arange(stride) - self.lift + if na_index: + repeater[na_index] = -1 + repeater[:na_index] += 1 return repeater @@ -565,7 +593,6 @@ def _unstack_frame( unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor, sort=sort ) - if not obj._can_fast_transpose: mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) return obj._constructor_from_mgr(mgr, 
axes=mgr.axes) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 22fdfd3a01408..6037acbc6dcfb 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1605,6 +1605,106 @@ def test_stack_sort_false(future_stack): tm.assert_frame_equal(result, expected) +def test_unstack_sort_false_na1(): + # GH 61221 + # Test unstacking with NA as the last value + + levels1 = ["b", "a"] + levels2 = Index([1, 2, 3, None]) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 1.0): [0, 4], + ("value", 2.0): [1, 5], + ("value", 3.0): [2, 6], + ("value", pd.NA): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [("value", 1.0), ("value", 2.0), ("value", 3.0), ("value", pd.NA)], + names=[None, "level2"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_sort_false_na2(): + # GH 61221 + # Test unstacking with NA as the first value + + levels1 = ["b", "a"] + levels2 = Index([None, 1, 2, 3]) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", pd.NA): [0, 4], + ("value", 1.0): [1, 5], + ("value", 2.0): [2, 6], + ("value", 3.0): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [("value", pd.NA), ("value", 1.0), ("value", 2.0), ("value", 3.0)], + names=[None, "level2"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_sort_false_na3(): + # GH 61221 + # Test unstacking with NA in the middle + + levels1 = ["b", "a"] + levels2 = Index([1, None, 2, 3]) + index = MultiIndex.from_product([levels1, levels2], 
names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 1.0): [0, 4], + ("value", pd.NA): [1, 5], + ("value", 2.0): [2, 6], + ("value", 3.0): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [("value", 1.0), ("value", pd.NA), ("value", 2.0), ("value", 3.0)], + names=[None, "level2"], + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_unstack_sort_false_na_mixed(): + # GH 61221 + # Test unstacking to see if order is maintained. + + levels1 = ["b", "a"] + levels2 = Index([3, None, 1, 2]) + index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"]) + df = DataFrame({"value": range(len(index))}, index=index) + result = df.unstack(level="level2", sort=False) + expected = DataFrame( + { + ("value", 3.0): [0, 4], + ("value", pd.NA): [1, 5], + ("value", 1.0): [2, 6], + ("value", 2.0): [3, 7], + }, + index=Index(["b", "a"], name="level1"), + columns=MultiIndex.from_tuples( + [("value", 3.0), ("value", pd.NA), ("value", 1.0), ("value", 2.0)], + names=[None, "level2"], + ), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated") def test_stack_sort_false_multi_level(future_stack): # GH 15105 diff --git a/web/pandas/static/img/books/pandas_cookbook_3.jpeg b/web/pandas/static/img/books/pandas_cookbook_3.jpeg new file mode 100644 index 0000000000000..cf1c27037de68 Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.jpeg differ