From a9937388d3fa7bd7d9f2ca007a4430258c8c272d Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 29 Dec 2024 14:32:14 -0500 Subject: [PATCH 1/5] Backport PR #60615: TST(string dtype): Resolve some HDF5 xfails --- pandas/io/pytables.py | 2 + .../tests/io/pytables/test_file_handling.py | 45 ++++++++++++++----- pandas/tests/io/pytables/test_subclass.py | 3 -- pandas/tests/io/test_common.py | 3 -- 4 files changed, 36 insertions(+), 17 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2e38303caa354..1293c9a0b8499 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5274,6 +5274,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 9daf2a5910a08..1878f2a392e13 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -36,12 +36,11 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -90,10 +89,12 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -103,7 +104,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -162,7 +166,7 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -177,8 +181,13 @@ def test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -193,7 +202,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -206,7 +215,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = 
read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -217,7 +230,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -228,7 +245,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -302,6 +323,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -319,6 +341,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index bbe1cd77e0d9f..03622faa2b5a8 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -19,7 +17,6 @@ class TestHDFStoreSubclass: # GH 33748 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d1e42b297f143..a0dd64f1cb82b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas.util._test_decorators as td @@ -379,7 +377,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll From 4acd264bc8cce7065c6ca07df7314afbb72b5e88 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 14 Jan 2025 09:34:40 -0500 Subject: [PATCH 2/5] Backport PR #60713: TST(string 
dtype): Resolve xfail in test_base.py --- pandas/core/arrays/string_.py | 5 +++++ pandas/tests/indexes/test_base.py | 9 +++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e163a9df8ee10..3efb48c86e92c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -531,6 +531,11 @@ def _str_map_nan_semantics( else: return self._str_map_str_or_object(dtype, na_value, arr, f, mask) + def view(self, dtype: Dtype | None = None) -> ArrayLike: + if dtype is not None: + raise TypeError("Cannot change data-type for string array.") + return super().view(dtype=dtype) + # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is # incompatible with definition in base class "ExtensionArray" diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e3b8a60354b61..a94e4728a9751 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -354,14 +354,11 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "str" and not index.dtype.storage == "python": - # TODO(infer_string): Make the errors consistent - with pytest.raises(NotImplementedError, match="i8"): - index.view("i8") else: msg = ( - "Cannot change data-type for array of references|" - "Cannot change data-type for object array|" + r"Cannot change data-type for array of references\.|" + r"Cannot change data-type for object array\.|" + r"Cannot change data-type for array of strings\.|" ) with pytest.raises(TypeError, match=msg): index.view("i8") From fa7224fa8888bf106ffca923001f602b92f6ada2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:07:04 -0500 Subject: [PATCH 3/5] Backport PR #60709: ENH(string dtype): Make str.decode return str dtype --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/strings/accessor.py | 10 +++++++--- pandas/io/pytables.py | 4 +++- pandas/io/sas/sas7bdat.py | 6 ++++++ pandas/tests/io/sas/test_sas7bdat.py | 16 ++++++---------- pandas/tests/strings/test_strings.py | 9 +++++---- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 3ee54bcc1c6e2..bff5a3edfb71d 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -34,7 +34,8 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been - updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) + updated to work correctly with NumPy >= 2 (:issue:`57739`) +- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 563dce3008480..3c4cf60ab262a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,6 +13,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import lib from pandas._typing import ( AlignJoin, @@ -387,7 +389,9 @@ def cons_row(x): # This is a mess. _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) - if self._is_string: + if _dtype is not None: + pass + elif self._is_string: if is_bool_dtype(vdtype): _dtype = result.dtype elif returns_string: @@ -2012,9 +2016,9 @@ def decode(self, encoding, errors: str = "strict"): decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] arr = self._data.array - # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) - return self._wrap_result(result) + dtype = "str" if get_option("future.infer_string") else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors: str = "strict"): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1480e0a171147..dbe2db9f9625b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5208,7 +5208,9 @@ def _unconvert_string_array( dtype = f"U{itemsize}" if isinstance(data[0], bytes): - data = Series(data, copy=False).str.decode(encoding, errors=errors)._values + ser = Series(data, copy=False).str.decode(encoding, errors=errors) + data = ser.to_numpy() + data.flags.writeable = True else: data = data.astype(dtype, copy=False).astype(object, copy=False) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c5bdfb5541788..1d424425cd927 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -25,6 +25,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.byteswap import ( read_double_with_byteswap, read_float_with_byteswap, @@ -722,6 +724,7 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt = {} js, jb = 0, 0 + infer_string = get_option("future.infer_string") for j in range(self.column_count): name = self.column_names[j] @@ -738,6 +741,9 @@ def _chunk_to_dataframe(self) -> DataFrame: rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False) if self.convert_text and (self.encoding is not None): rslt[name] = self._decode_string(rslt[name].str) + if infer_string: + rslt[name] = rslt[name].astype("str") + js += 1 else: self.close() diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 493971f9f56ef..96aaa1e9bcb21 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import IS64 
from pandas.errors import EmptyDataError import pandas.util._test_decorators as td @@ -18,10 +16,6 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def dirpath(datapath): @@ -254,11 +248,13 @@ def test_zero_variables(datapath): pd.read_sas(fname) -def test_zero_rows(datapath): +@pytest.mark.parametrize("encoding", [None, "utf8"]) +def test_zero_rows(datapath, encoding): # GH 18198 fname = datapath("io", "sas", "data", "zero_rows.sas7bdat") - result = pd.read_sas(fname) - expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0] + result = pd.read_sas(fname, encoding=encoding) + str_value = b"a" if encoding is None else "a" + expected = pd.DataFrame([{"char_field": str_value, "num_field": 1.0}]).iloc[:0] tm.assert_frame_equal(result, expected) @@ -414,7 +410,7 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat") df = pd.read_sas(fname, encoding="ascii") fname = datapath("io", "sas", "data", "0x40controlbyte.csv") - df0 = pd.read_csv(fname, dtype="object") + df0 = pd.read_csv(fname, dtype="str") tm.assert_frame_equal(df, df0) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 7c396e65b6120..59a06a421f53e 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -95,6 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) + empty_inferred_str = Series(dtype="str") if is_object_or_nan_string_dtype(any_string_dtype): empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) @@ -154,7 +155,7 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) @@ -564,7 +565,7 @@ def test_string_slice_out_of_bounds(any_string_dtype): def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") - expected = ser.map(lambda x: x.decode("utf-8")).astype(object) + expected = Series(["a", "b", "a\xe4"], dtype="str") tm.assert_series_equal(result, expected) @@ -594,7 +595,7 @@ def test_decode_errors_kwarg(): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") - expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object) + expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str") tm.assert_series_equal(result, expected) @@ -749,5 +750,5 @@ def test_get_with_dict_label(): def test_series_str_decode(): # GH 22613 result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict") - expected = Series(["x", "y"], dtype="object") + expected = Series(["x", "y"], dtype="str") tm.assert_series_equal(result, expected) From 644ab1d836e0976e1f1a46a68529b2695fcefef6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 30 Jan 2025 16:30:49 -0500 Subject: [PATCH 4/5] Bump version of upload-artifact --- 
.github/actions/run-tests/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index fd7c3587f2254..e4b209d83913d 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,7 +7,7 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Test results path: test-data.xml From 7d722dfd2bf33cca809deb197ca60645f4739e97 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 5 Feb 2025 09:04:01 +0100 Subject: [PATCH 5/5] Update doc/source/whatsnew/v2.3.0.rst --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index bff5a3edfb71d..0b7c2bac1be6a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -34,7 +34,7 @@ Other enhancements - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been - updated to work correctly with NumPy >= 2 (:issue:`57739`) + updated to raise FutureWarning with NumPy >= 2 (:issue:`60340`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
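
Illustrative sketch of the behaviour backported in PR #60709 above (an example added for clarity, not part of the patch series): it assumes pandas 2.3 with the "future.infer_string" option enabled and simply mirrors the expectation asserted in test_series_str_decode.

    import pandas as pd

    # Opt in to the future string dtype; without this option, str.decode
    # keeps returning object dtype as before.
    pd.set_option("future.infer_string", True)

    # Decoding bytes now yields the "str" (StringDtype) result that the
    # backported test test_series_str_decode asserts.
    result = pd.Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict")
    print(result.dtype)  # expected: str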
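
Likewise, a minimal sketch of the HDF5 round-trip referenced by PR #60615 and issue 60663 (assumptions: the optional PyTables dependency is installed, and "example.h5" is a placeholder path introduced here, not a file from the patches):

    import pandas as pd

    pd.set_option("future.infer_string", True)

    df = pd.DataFrame({"a": ["x", "y"]})          # column "a" infers StringDtype
    df.to_hdf("example.h5", key="df", mode="w")   # needs the "tables" package
    roundtripped = pd.read_hdf("example.h5", "df")
    print(roundtripped["a"].dtype)  # expected: str (StringDtype survives the round trip)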