From 72514b046f769a468e927e2f98c2b1f8115e1cd3 Mon Sep 17 00:00:00 2001 From: MengAiDev <3463526515@qq.com> Date: Mon, 18 Aug 2025 09:01:55 +0800 Subject: [PATCH 1/4] BUG: Fix groupby.apply() dropping _metadata from subclassed DataFrame When extending pandas.DataFrame by subclassing, most operations preserve the _metadata attributes. This fix ensures that groupby.apply() also preserves these fields, making it consistent with other groupby operations like groupby.sum(). Fixes #62134 --- pandas/core/groupby/generic.py | 61 +++++++++++++++++-- pandas/tests/groupby/test_groupby_metadata.py | 32 ++++++++++ 2 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/groupby/test_groupby_metadata.py diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 39607d74c0dc8..5d5bf4cb0f964 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2070,6 +2070,13 @@ def _wrap_applied_output( result = self.obj._constructor(index=res_index, columns=data.columns) result = result.astype(data.dtypes) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + return result # GH12824 @@ -2081,13 +2088,28 @@ def _wrap_applied_output( # GH57775 - Ensure that columns and dtypes from original frame are kept. 
result = self.obj._constructor(columns=data.columns) result = result.astype(data.dtypes) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + return result elif isinstance(first_not_none, DataFrame): - return self._concat_objects( + result = self._concat_objects( values, not_indexed_same=not_indexed_same, is_transform=is_transform, ) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result key_index = self._grouper.result_index if self.as_index else None @@ -2105,27 +2127,58 @@ def _wrap_applied_output( # (expression has type "Hashable", variable # has type "Tuple[Any, ...]") name = self._selection # type: ignore[assignment] - return self.obj._constructor_sliced(values, index=key_index, name=name) + result = self.obj._constructor_sliced(values, index=key_index, name=name) + + # Preserve metadata for subclassed Series + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result elif not isinstance(first_not_none, Series): # values are not series or array-like but scalars # self._selection not passed through to Series as the # result should not take the name of original selection # of columns if self.as_index: - return self.obj._constructor_sliced(values, index=key_index) + result = self.obj._constructor_sliced(values, index=key_index) + + # Preserve metadata for subclassed Series + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result else: result = self.obj._constructor(values, columns=[self._selection]) result = 
self._insert_inaxis_grouper(result) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + return result else: # values are Series - return self._wrap_applied_output_series( + result = self._wrap_applied_output_series( values, not_indexed_same, first_not_none, key_index, is_transform, ) + + # Preserve metadata for subclassed DataFrames/Series + if hasattr(self.obj, '_metadata'): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result def _wrap_applied_output_series( self, diff --git a/pandas/tests/groupby/test_groupby_metadata.py b/pandas/tests/groupby/test_groupby_metadata.py new file mode 100644 index 0000000000000..8255c540d68fd --- /dev/null +++ b/pandas/tests/groupby/test_groupby_metadata.py @@ -0,0 +1,32 @@ +""" +Tests for metadata preservation in groupby operations. 
+""" + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas import DataFrame +from pandas.tests.groupby import test_groupby_subclass + + +class TestGroupByMetadataPreservation: + def test_groupby_apply_preserves_metadata(self): + """Test that groupby.apply() preserves _metadata from subclassed DataFrame.""" + # Create a subclassed DataFrame with metadata + subdf = tm.SubclassedDataFrame( + {"X": [1, 1, 2, 2, 3], "Y": np.arange(0, 5), "Z": np.arange(10, 15)} + ) + subdf.testattr = "test" + + # Apply groupby operation + result = subdf.groupby("X").apply(np.sum, axis=0, include_groups=False) + + # Check that metadata is preserved + assert hasattr(result, 'testattr'), "Metadata attribute 'testattr' should be preserved" + assert result.testattr == "test", "Metadata value should be preserved" + + # Compare with equivalent operation that preserves metadata + expected = subdf.groupby("X").sum() + assert expected.testattr == "test", "Equivalent operation should preserve metadata" \ No newline at end of file From 3b36a2f6f4ec6663e62014a257bb20e1aea30853 Mon Sep 17 00:00:00 2001 From: MengAiDev <3463526515@qq.com> Date: Mon, 18 Aug 2025 11:26:21 +0800 Subject: [PATCH 2/4] fix(groupby): preserve metadata for subclassed DataFrames and Series - Update metadata preservation logic for DataFrames and Series in groupby operations - Add a test for DataFrame.__setitem__ with MultiIndex columns and a scalar tuple key (GH#62135) - Normalize quote style and whitespace in the touched code and tests --- pandas/core/groupby/generic.py | 42 +++++++-------- pandas/tests/frame/indexing/test_setitem.py | 53 +++++++++++++++++++ pandas/tests/frame/test_query_eval.py | 4 +- pandas/tests/groupby/test_groupby_metadata.py | 18 +++---- 4 files changed, 85 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5d5bf4cb0f964..388f7114e6d75 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2070,13
+2070,13 @@ def _wrap_applied_output( result = self.obj._constructor(index=res_index, columns=data.columns) result = result.astype(data.dtypes) - + # Preserve metadata for subclassed DataFrames - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result # GH12824 @@ -2088,13 +2088,13 @@ def _wrap_applied_output( # GH57775 - Ensure that columns and dtypes from original frame are kept. result = self.obj._constructor(columns=data.columns) result = result.astype(data.dtypes) - + # Preserve metadata for subclassed DataFrames - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result elif isinstance(first_not_none, DataFrame): result = self._concat_objects( @@ -2102,13 +2102,13 @@ def _wrap_applied_output( not_indexed_same=not_indexed_same, is_transform=is_transform, ) - + # Preserve metadata for subclassed DataFrames - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result key_index = self._grouper.result_index if self.as_index else None @@ -2128,13 +2128,13 @@ def _wrap_applied_output( # has type "Tuple[Any, ...]") name = self._selection # type: ignore[assignment] result = self.obj._constructor_sliced(values, index=key_index, name=name) - + # Preserve metadata for subclassed Series - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result elif not isinstance(first_not_none, Series): # values are not series or array-like but scalars @@ -2143,24 +2143,24 @@ def _wrap_applied_output( # of columns if self.as_index: result 
= self.obj._constructor_sliced(values, index=key_index) - + # Preserve metadata for subclassed Series - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result else: result = self.obj._constructor(values, columns=[self._selection]) result = self._insert_inaxis_grouper(result) - + # Preserve metadata for subclassed DataFrames - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result else: # values are Series @@ -2171,13 +2171,13 @@ def _wrap_applied_output( key_index, is_transform, ) - + # Preserve metadata for subclassed DataFrames/Series - if hasattr(self.obj, '_metadata'): + if hasattr(self.obj, "_metadata"): for attr in self.obj._metadata: if hasattr(self.obj, attr): setattr(result, attr, getattr(self.obj, attr)) - + return result def _wrap_applied_output_series( diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 20dd7b0c4d3e7..5cdb8a9b71ab5 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -607,6 +607,59 @@ def test_setitem_multi_index(self): df[("joe", "last")] = df[("jolie", "first")].loc[i, j] tm.assert_frame_equal(df[("joe", "last")], df[("jolie", "first")]) + def test_setitem_multiindex_scalar_indexer(self): + # GH#62135: Fix DataFrame.__setitem__ with MultiIndex columns and scalar indexer + # Test scalar key assignment with MultiIndex columns + columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a")]) + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + + # Test setting new column with scalar tuple key + df[("C", "c")] = 100 + expected_new = DataFrame( + np.array( + [ + [0, 1, 2, 100], + [3, 4, 5, 100], + [6, 7, 8, 100], + [9, 10, 11, 
100], + [12, 13, 14, 100], + ] + ), + columns=MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("C", "c")] + ), + ) + tm.assert_frame_equal(df, expected_new) + + # Test setting existing column with scalar tuple key + df[("A", "a")] = 999 + expected_existing = expected_new.copy() + expected_existing[("A", "a")] = 999 + tm.assert_frame_equal(df, expected_existing) + + # Test setting with Series using scalar tuple key + series_data = Series([10, 20, 30, 40, 50]) + df[("D", "d")] = series_data + expected_series = expected_existing.copy() + expected_series[("D", "d")] = series_data + tm.assert_frame_equal(df, expected_series) + + # Test with 3-level MultiIndex + columns_3level = MultiIndex.from_tuples( + [("X", "A", "1"), ("X", "A", "2"), ("Y", "B", "1")] + ) + df_3level = DataFrame(np.arange(12).reshape(4, 3), columns=columns_3level) + + # Test scalar assignment with 3-level MultiIndex + df_3level[("Z", "C", "3")] = 42 + assert ("Z", "C", "3") in df_3level.columns + tm.assert_series_equal(df_3level[("Z", "C", "3")], Series([42, 42, 42, 42])) + + # Test Series assignment with 3-level MultiIndex + new_series = Series([1, 2, 3, 4]) + df_3level[("W", "D", "4")] = new_series + tm.assert_series_equal(df_3level[("W", "D", "4")], new_series) + @pytest.mark.parametrize( "columns,box,expected", [ diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index f93105498ac79..b599be5d042fe 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -168,7 +168,7 @@ def test_query_duplicate_column_name(self, engine, parser): } ).rename(columns={"B": "A"}) - res = df.query('C == 1', engine=engine, parser=parser) + res = df.query("C == 1", engine=engine, parser=parser) expect = DataFrame( [[1, 1, 1]], @@ -1411,7 +1411,7 @@ def test_expr_with_column_name_with_backtick_and_hash(self): def test_expr_with_column_name_with_backtick(self): # GH 59285 df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) - 
result = df.query("`a``b` < 2") # noqa + result = df.query("`a``b` < 2") # Note: Formatting checks may wrongly consider the above ``inline code``. expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_metadata.py b/pandas/tests/groupby/test_groupby_metadata.py index 8255c540d68fd..83368346c8759 100644 --- a/pandas/tests/groupby/test_groupby_metadata.py +++ b/pandas/tests/groupby/test_groupby_metadata.py @@ -3,12 +3,8 @@ """ import numpy as np -import pytest -import pandas as pd import pandas._testing as tm -from pandas import DataFrame -from pandas.tests.groupby import test_groupby_subclass class TestGroupByMetadataPreservation: @@ -19,14 +15,18 @@ def test_groupby_apply_preserves_metadata(self): {"X": [1, 1, 2, 2, 3], "Y": np.arange(0, 5), "Z": np.arange(10, 15)} ) subdf.testattr = "test" - + # Apply groupby operation result = subdf.groupby("X").apply(np.sum, axis=0, include_groups=False) - + # Check that metadata is preserved - assert hasattr(result, 'testattr'), "Metadata attribute 'testattr' should be preserved" + assert hasattr(result, "testattr"), ( + "Metadata attribute 'testattr' should be preserved" + ) assert result.testattr == "test", "Metadata value should be preserved" - + # Compare with equivalent operation that preserves metadata expected = subdf.groupby("X").sum() - assert expected.testattr == "test", "Equivalent operation should preserve metadata" \ No newline at end of file + assert expected.testattr == "test", ( + "Equivalent operation should preserve metadata" + ) From 6766d0b3ba3be7a58d34e6bb240d32820773bafd Mon Sep 17 00:00:00 2001 From: MengAiDev <3463526515@qq.com> Date: Mon, 18 Aug 2025 12:24:46 +0800 Subject: [PATCH 3/4] fix(frame): ensure proper name attribute when setting new column with scalar value - Address issue where name attribute was lost when setting new column with scalar value and tuple key in MultiIndex DataFrame - Implement check for tuple key and scalar value in 
MultiIndex DataFrame - Create Series with proper name to ensure name attribute matches the key - Update test case to assert Series name in scalar assignment with MultiIndex --- pandas/core/frame.py | 15 ++++++++++++++- pandas/tests/frame/indexing/test_setitem.py | 6 ++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec8c8116e5aee..878653a02c4b0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4517,7 +4517,20 @@ def _set_item(self, key, value) -> None: Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - value, refs = self._sanitize_column(value) + # Check if we're setting a new column with a tuple key in a MultiIndex DataFrame + # and the value is a scalar. In this case, we need to create a Series with the + # proper name to ensure the name attribute matches the key. + if ( + isinstance(key, tuple) + and isinstance(self.columns, MultiIndex) + and not is_list_like(value) + and key not in self.columns + ): + # Create a Series with the proper name + value = Series([value] * len(self.index), index=self.index, name=key) + value, refs = self._sanitize_column(value) + else: + value, refs = self._sanitize_column(value) if ( key in self.columns diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 5cdb8a9b71ab5..7f17ad9e3fab3 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -653,10 +653,12 @@ def test_setitem_multiindex_scalar_indexer(self): # Test scalar assignment with 3-level MultiIndex df_3level[("Z", "C", "3")] = 42 assert ("Z", "C", "3") in df_3level.columns - tm.assert_series_equal(df_3level[("Z", "C", "3")], Series([42, 42, 42, 42])) + tm.assert_series_equal( + df_3level[("Z", "C", "3")], Series([42, 42, 42, 42], name=("Z", "C", "3")) + ) # Test Series assignment with 3-level MultiIndex - new_series = Series([1, 2, 3, 4]) + 
new_series = Series([1, 2, 3, 4], name=("W", "D", "4")) df_3level[("W", "D", "4")] = new_series tm.assert_series_equal(df_3level[("W", "D", "4")], new_series) From a6f3fe621216487d551174dc0c50469f12bf2e15 Mon Sep 17 00:00:00 2001 From: MengAiDev <3463526515@qq.com> Date: Mon, 18 Aug 2025 13:41:32 +0800 Subject: [PATCH 4/4] TST: pin int64 dtype in test_setitem_multiindex_scalar_indexer expectations --- pandas/tests/frame/indexing/test_setitem.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 7f17ad9e3fab3..985870c3593cb 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -623,7 +623,8 @@ def test_setitem_multiindex_scalar_indexer(self): [6, 7, 8, 100], [9, 10, 11, 100], [12, 13, 14, 100], - ] + ], + dtype=np.int64, ), columns=MultiIndex.from_tuples( [("A", "a"), ("A", "b"), ("B", "a"), ("C", "c")] @@ -638,7 +639,7 @@ def test_setitem_multiindex_scalar_indexer(self): tm.assert_frame_equal(df, expected_existing) # Test setting with Series using scalar tuple key - series_data = Series([10, 20, 30, 40, 50]) + series_data = Series([10, 20, 30, 40, 50], dtype=np.int64) df[("D", "d")] = series_data expected_series = expected_existing.copy() expected_series[("D", "d")] = series_data @@ -654,11 +655,12 @@ def test_setitem_multiindex_scalar_indexer(self): df_3level[("Z", "C", "3")] = 42 assert ("Z", "C", "3") in df_3level.columns tm.assert_series_equal( - df_3level[("Z", "C", "3")], Series([42, 42, 42, 42], name=("Z", "C", "3")) + df_3level[("Z", "C", "3")], + Series([42, 42, 42, 42], name=("Z", "C", "3"), dtype=np.int64), ) # Test Series assignment with 3-level MultiIndex - new_series = Series([1, 2, 3, 4], name=("W", "D", "4")) + new_series = Series([1, 2, 3, 4], name=("W", "D", "4"), dtype=np.int64) df_3level[("W", "D", "4")] = new_series tm.assert_series_equal(df_3level[("W", "D", "4")], new_series)