diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index ffe6c7730bcdc..0d4a4ed1b8804 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -83,7 +83,9 @@ class BlockManager: def __init__( self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=... ) -> None: ... - def get_slice(self, slobj: slice, axis: int = ...) -> Self: ... + def get_slice( + self, slobj: slice, axis: int = ..., using_cow: bool = False + ) -> Self: ... def _rebuild_blknos_and_blklocs(self) -> None: ... class BlockValuesRefs: diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index fdfb8e1c99f6e..c33ce7638e1f6 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -845,7 +845,7 @@ cdef class BlockManager: # ------------------------------------------------------------------- # Indexing - cdef BlockManager _slice_mgr_rows(self, slice slobj): + cdef BlockManager _slice_mgr_rows(self, slice slobj, bint using_cow): cdef: Block blk, nb BlockManager mgr @@ -856,7 +856,10 @@ cdef class BlockManager: nb = blk.slice_block_rows(slobj) nbs.append(nb) - new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] + if using_cow: + new_axes = [self.axes[0]._view(), self.axes[1]._getitem_slice(slobj)] + else: + new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos @@ -867,17 +870,21 @@ cdef class BlockManager: mgr._blklocs = blklocs.copy() return mgr - def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: + def get_slice( + self, slobj: slice, axis: int = 0, using_cow: bool = False + ) -> BlockManager: if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) elif axis == 1: - return self._slice_mgr_rows(slobj) + return self._slice_mgr_rows(slobj, using_cow) else: raise IndexError("Requested axis not found in manager") new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) + if using_cow: + new_axes[1 - axis] = self.axes[1 - axis]._view() return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1674960f21b19..81fc6630cc844 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4339,7 +4339,7 @@ def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self: """ assert isinstance(slobj, slice), type(slobj) axis = self._get_block_manager_axis(axis) - new_mgr = self._mgr.get_slice(slobj, axis=axis) + new_mgr = self._mgr.get_slice(slobj, axis=axis, using_cow=using_copy_on_write()) result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) result = result.__finalize__(self) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 9987908f407b3..2747a9bac1701 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -671,7 +671,9 @@ def fast_xs(self, loc: int) -> SingleArrayManager: result = np.array(values, dtype=dtype) return SingleArrayManager([result], [self._axes[1]]) - def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager: + def get_slice( + self, slobj: slice, axis: AxisInt = 0, using_cow: bool = False + ) -> ArrayManager: axis = self._normalize_axis(axis) if axis == 0: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 13c039cef3f91..fdbe3710431fc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -225,6 +225,7 @@ def blklocs(self) -> npt.NDArray[np.intp]: def make_empty(self, axes=None) -> Self: """return an empty BlockManager with the items axis of len 0""" if axes is None: + # TODO shallow copy remaining axis? axes = [Index([])] + self.axes[1:] # preserve dtype if possible @@ -381,6 +382,7 @@ def apply( applied = getattr(b, f)(**kwargs) result_blocks = extend_blocks(applied, result_blocks) + # TODO shallow copy axes (in from_blocks or here?) out = type(self).from_blocks(result_blocks, self.axes) return out @@ -539,6 +541,7 @@ def get_numeric_data(self, copy: bool = False) -> Self: # Avoid somewhat expensive _combine if copy: return self.copy(deep=True) + # TODO(CoW) need to return a shallow copy here? return self return self._combine(numeric_blocks, copy) @@ -570,6 +573,7 @@ def _combine( new_blocks.append(nb) axes = list(self.axes) + # TODO shallow copy of axes? if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) @@ -641,6 +645,7 @@ def consolidate(self) -> Self: if self.is_consolidated(): return self + # TODO shallow copy is not needed here? bm = type(self)(self.blocks, self.axes, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() @@ -685,6 +690,7 @@ def reindex_indexer( if indexer is None: if new_axis is self.axes[axis] and not copy: + # TODO(CoW) need to handle CoW? return self result = self.copy(deep=copy) @@ -723,6 +729,8 @@ def reindex_indexer( new_axes = list(self.axes) new_axes[axis] = new_axis + if self.ndim == 2 and using_copy_on_write(): + new_axes[1 - axis] = self.axes[1 - axis]._view() new_mgr = type(self).from_blocks(new_blocks, new_axes) if axis == 1: @@ -1005,6 +1013,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager: ndim=1, refs=self.blocks[0].refs, ) + # TODO shallow copy columns return SingleBlockManager(block, self.axes[0]) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) @@ -1033,6 +1042,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager: bp = BlockPlacement(slice(0, len(result))) block = new_block(result, placement=bp, ndim=1) + # TODO shallow copy columns return SingleBlockManager(block, self.axes[0]) def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: @@ -1047,6 +1057,7 @@ def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: nb = type(block)( values, placement=bp, ndim=1, refs=block.refs if track_ref else None ) + # TODO shallow copy index? (might already be done where this gets called) return SingleBlockManager(nb, self.axes[1]) def iget_values(self, i: int) -> ArrayLike: @@ -1447,6 +1458,7 @@ def idelete(self, indexer) -> BlockManager: nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True) new_columns = self.items[~is_deleted] + # TODO shallow copy index? axes = [new_columns, self.axes[1]] return type(self)(tuple(nbs), axes, verify_integrity=False) @@ -1484,6 +1496,7 @@ def grouped_reduce(self, func: Callable) -> Self: nrows = result_blocks[0].values.shape[-1] index = Index(range(nrows)) + # TODO shallow copy columns? return type(self).from_blocks(result_blocks, [self.axes[0], index]) def reduce(self, func: Callable) -> Self: @@ -1507,6 +1520,7 @@ def reduce(self, func: Callable) -> Self: res_blocks.extend(nbs) index = Index([None]) # placeholder + # TODO shallow copy self.items new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) return new_mgr @@ -1548,6 +1562,7 @@ def quantile( assert self.ndim >= 2 assert is_list_like(qs) # caller is responsible for this + # TODO shallow copy axes new_axes = list(self.axes) new_axes[1] = Index(qs, dtype=np.float64) @@ -1820,6 +1835,7 @@ def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: offset += len(mgr.items) + # TODO relevant axis already shallow-copied at caller? new_mgr = cls(tuple(blocks), axes) return new_mgr @@ -1889,6 +1905,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs) + # TODO shallow copy index axes = [columns, self.axes[0]] return BlockManager([new_blk], axes=axes, verify_integrity=False) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index cf9466c0bdf0b..e51bcab6dd0c6 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -89,6 +89,7 @@ def operate_blockwise( # assert len(slocs) == nlocs, (len(slocs), nlocs) # assert slocs == set(range(nlocs)), slocs + # TODO shallow copy axes? new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False) return new_mgr diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index c4d5e9dbce72a..032deafeca556 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -59,6 +59,9 @@ def test_subset_column_selection(backend, using_copy_on_write): subset = df[["a", "c"]] + if using_copy_on_write: + assert subset.index is not df.index + if using_copy_on_write: # the subset shares memory ... assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) @@ -111,6 +114,9 @@ def test_subset_row_slice(backend, using_copy_on_write): subset = df[1:3] subset._mgr._verify_integrity() + if using_copy_on_write: + assert subset.columns is not df.columns + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) if using_copy_on_write: @@ -156,6 +162,9 @@ def test_subset_column_slice( subset = df.iloc[:, 1:] subset._mgr._verify_integrity() + if using_copy_on_write: + assert subset.index is not df.index + if using_copy_on_write: assert np.shares_memory(get_array(subset, "b"), get_array(df, "b")) @@ -219,6 +228,10 @@ def test_subset_loc_rows_columns( subset = df.loc[row_indexer, column_indexer] + if using_copy_on_write: + assert subset.index is not df.index + assert subset.columns is not df.columns + # a few corner cases _do_ actually modify the parent (with both row and column # slice, and in case of ArrayManager or BlockManager with single block) mutate_parent = ( @@ -283,6 +296,10 @@ def test_subset_iloc_rows_columns( subset = df.iloc[row_indexer, column_indexer] + if using_copy_on_write: + assert subset.index is not df.index + assert subset.columns is not df.columns + # a few corner cases _do_ actually modify the parent (with both row and column # slice, and in case of ArrayManager or BlockManager with single block) mutate_parent = ( @@ -761,6 +778,10 @@ def test_null_slice(backend, method, using_copy_on_write, warn_copy_on_write): df2 = method(df) + if using_copy_on_write: + assert df2.index is not df.index + assert df2.columns is not df.columns + # we always return new objects (shallow copy), regardless of CoW or not assert df2 is not df @@ -790,6 +811,9 @@ def test_null_slice_series(backend, method, using_copy_on_write, warn_copy_on_wr s2 = method(s) + if using_copy_on_write: + assert s2.index is not s.index + # we always return new objects, regardless of CoW or not assert s2 is not s @@ -947,6 +971,9 @@ def test_column_as_series( s = df["a"] + if using_copy_on_write: + assert s.index is not df.index + assert np.shares_memory(get_array(s, "a"), get_array(df, "a")) if using_copy_on_write or using_array_manager: @@ -1043,6 +1070,10 @@ def test_column_as_series_no_item_cache( s1 = method(df) s2 = method(df) + if using_copy_on_write: + assert s1.index is not df.index + assert s1.index is not s2.index + is_iloc = "iloc" in request.node.name if using_copy_on_write or warn_copy_on_write or is_iloc: assert s1 is not s2 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 73bb9b4a71741..c1e1fa6511948 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -34,6 +34,9 @@ def test_copy(using_copy_on_write): assert not df_copy._mgr.blocks[0].refs.has_reference() assert not df_copy._mgr.blocks[1].refs.has_reference() + assert df_copy.index is not df.index + assert df_copy.columns is not df.columns + # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 1 diff --git a/pandas/tests/series/methods/test_align.py b/pandas/tests/series/methods/test_align.py index cb60cd2e5bcf3..15bd3e9f9a51a 100644 --- a/pandas/tests/series/methods/test_align.py +++ b/pandas/tests/series/methods/test_align.py @@ -131,6 +131,8 @@ def test_align_same_index(datetime_series, using_copy_on_write): assert a.index is datetime_series.index assert b.index is datetime_series.index else: + assert a.index is not datetime_series.index + assert b.index is not datetime_series.index assert a.index.is_(datetime_series.index) assert b.index.is_(datetime_series.index)