diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 3fdd15462b51e..9ff63a909d0ab 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2372,11 +2372,15 @@ integers: df.select_dtypes(include=["number", "bool"], exclude=["unsignedinteger"]) -To select string columns you must use the ``object`` dtype: +To select string columns, include ``str``: .. ipython:: python - df.select_dtypes(include=["object"]) + df.select_dtypes(include=[str]) + +.. note:: + + This is a change in pandas 3.0. Previously, strings were stored in ``object`` dtype columns, so they were selected with ``include=[object]``. See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#hardcoded-use-of-object-dtype. To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you can define a function that returns a tree of child dtypes: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index eb938a7140e29..6ba2612e42757 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -717,6 +717,7 @@ Other Deprecations - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.unstack` and :meth:`DataFrame.unstack` (:issue:`12189`, :issue:`53868`) - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.shift` and :meth:`DataFrame.shift` (:issue:`53802`) +- Deprecated backward-compatibility behavior for :meth:`DataFrame.select_dtypes` matching "str" dtype when ``np.object_`` is specified (:issue:`61916`) - Deprecated option "future.no_silent_downcasting", as it is no longer used. 
In a future version accessing this option will raise (:issue:`59502`) - Deprecated slicing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` using a ``datetime.date`` object, explicitly cast to :class:`Timestamp` instead (:issue:`35830`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c41b82bbbc8e..86b463bfc7b81 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5237,6 +5237,27 @@ def predicate(arr: ArrayLike) -> bool: return True + blk_dtypes = [blk.dtype for blk in self._mgr.blocks] + if ( + np.object_ in include + and str not in include + and str not in exclude + and any( + isinstance(dtype, StringDtype) and dtype.na_value is np.nan + for dtype in blk_dtypes + ) + ): + # GH#61916 + warnings.warn( + "For backward compatibility, 'str' dtypes are included by " + "select_dtypes when 'object' dtype is specified. " + "This behavior is deprecated and will be removed in a future " + "version. Explicitly pass 'str' to `include` to select them, " + "or to `exclude` to remove them and silence this warning.", + Pandas4Warning, + stacklevel=find_stack_level(), + ) + mgr = self._mgr._get_data_subset(predicate).copy(deep=False) return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 1ba6b9c437726..c6aff45582dd7 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import Pandas4Warning + from pandas.core.dtypes.dtypes import ExtensionDtype import pandas as pd @@ -102,7 +104,12 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ri = df.select_dtypes(include=[str]) tm.assert_frame_equal(ri, ei) - ri = df.select_dtypes(include=["object"]) + msg = "For backward compatibility, 'str' dtypes are included" + warn = None + if using_infer_string: 
+ warn = Pandas4Warning + with tm.assert_produces_warning(warn, match=msg): + ri = df.select_dtypes(include=["object"]) ei = df[["a"]] tm.assert_frame_equal(ri, ei) @@ -312,15 +319,18 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin ) df["g"] = df.f.diff() assert not hasattr(np, "u8") - r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - # if using_infer_string: - # TODO warn + + msg = "For backward compatibility, 'str' dtypes are included" + warn = None + if using_infer_string: + warn = Pandas4Warning + with tm.assert_produces_warning(warn, match=msg): + r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) e = df[["a", "b"]] tm.assert_frame_equal(r, e) - r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - # if using_infer_string: - # TODO warn + with tm.assert_produces_warning(warn, match=msg): + r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) @@ -497,7 +507,12 @@ def test_select_dtype_object_and_str(self, using_infer_string): ) # with "object" -> only select the object or default str dtype column - result = df.select_dtypes(include=["object"]) + msg = "For backward compatibility, 'str' dtypes are included" + warn = None + if using_infer_string: + warn = Pandas4Warning + with tm.assert_produces_warning(warn, match=msg): + result = df.select_dtypes(include=["object"]) expected = df[["a"]] tm.assert_frame_equal(result, expected)