pandas-dev · phofl · Nov 19, 2023 · Oct 15, 2023 · Oct 15, 2023 · Oct 15, 2023
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -2022,6 +2022,14 @@ def warn_copy_on_write() -> bool:
     )
 
 
+@pytest.fixture
+def using_infer_string() -> bool:
+    """
+    Fixture returning the current value of the ``future.infer_string`` option.
+    """
+    return pd.options.future.infer_string
+
+
 warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
 if zoneinfo is not None:
     warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw"))  # type: ignore[arg-type]

diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -586,13 +586,15 @@ def test_strftime_dt64_days(self):
         # dtype may be S10 or U10 depending on python version
         tm.assert_index_equal(result, expected)
 
-    def test_strftime_period_days(self):
+    def test_strftime_period_days(self, using_infer_string):
         period_index = period_range("20150301", periods=5)
         result = period_index.strftime("%Y/%m/%d")
         expected = Index(
             ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"],
             dtype="=U10",
         )
+        if using_infer_string:  # compare against the pyarrow-backed string dtype
+            expected = expected.astype("string[pyarrow_numpy]")
         tm.assert_index_equal(result, expected)
 
     def test_strftime_dt64_microsecond_resolution(self):

diff --git a/pandas/tests/series/indexing/test_delitem.py b/pandas/tests/series/indexing/test_delitem.py
@@ -31,19 +31,16 @@ def test_delitem(self):
         del s[0]
         tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64")))
 
-    def test_delitem_object_index(self):
+    def test_delitem_object_index(self, using_infer_string):
         # Index(dtype=object)
-        s = Series(1, index=["a"])
+        dtype = "string[pyarrow_numpy]" if using_infer_string else object
+        s = Series(1, index=Index(["a"], dtype=dtype))
         del s["a"]
-        tm.assert_series_equal(
-            s, Series(dtype="int64", index=Index([], dtype="object"))
-        )
+        tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype)))
         s["a"] = 1
-        tm.assert_series_equal(s, Series(1, index=["a"]))
+        tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype)))
         del s["a"]
-        tm.assert_series_equal(
-            s, Series(dtype="int64", index=Index([], dtype="object"))
-        )
+        tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype)))
 
     def test_delitem_missing_key(self):
         # empty

diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
@@ -71,7 +71,7 @@ def test_getitem_unrecognized_scalar(self):
     def test_getitem_negative_out_of_bounds(self):
         ser = Series(["a"] * 10, index=["a"] * 10)
 
-        msg = "index -11 is out of bounds for axis 0 with size 10"
+        msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds"
         warn_msg = "Series.__getitem__ treating keys as positions is deprecated"
         with pytest.raises(IndexError, match=msg):
             with tm.assert_produces_warning(FutureWarning, match=warn_msg):
@@ -363,7 +363,9 @@ def test_getitem_no_matches(self, box):
         key = Series(["C"], dtype=object)
         key = box(key)
 
-        msg = r"None of \[Index\(\['C'\], dtype='object'\)\] are in the \[index\]"
+        msg = (
+            r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]"
+        )
         with pytest.raises(KeyError, match=msg):
             ser[key]
 
@@ -437,7 +439,7 @@ def test_getitem_boolean_empty(self):
 
         # GH#5877
         # indexing with empty series
-        ser = Series(["A", "B"])
+        ser = Series(["A", "B"], dtype=object)
         expected = Series(dtype=object, index=Index([], dtype="int64"))
         result = ser[Series([], dtype=object)]
         tm.assert_series_equal(result, expected)

diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
@@ -2,6 +2,7 @@
     date,
     datetime,
 )
+from decimal import Decimal
 
 import numpy as np
 import pytest
@@ -175,7 +176,8 @@ class TestSetitemScalarIndexer:
     def test_setitem_negative_out_of_bounds(self):
         ser = Series(["a"] * 10, index=["a"] * 10)
 
-        msg = "index -11 is out of bounds for axis 0 with size 10"
+        # string index falls back to positional
+        msg = "index -11|-1 is out of bounds for axis 0 with size 10"
         warn_msg = "Series.__setitem__ treating keys as positions is deprecated"
         with pytest.raises(IndexError, match=msg):
             with tm.assert_produces_warning(FutureWarning, match=warn_msg):
@@ -527,8 +529,12 @@ def test_setitem_empty_series_timestamp_preserves_dtype(self):
             Timedelta("9 days").to_pytimedelta(),
         ],
     )
-    def test_append_timedelta_does_not_cast(self, td):
+    def test_append_timedelta_does_not_cast(self, td, using_infer_string, request):
         # GH#22717 inserting a Timedelta should _not_ cast to int64
+        if using_infer_string and not isinstance(td, Timedelta):
+            # TODO: GH#56010
+            request.applymarker(pytest.mark.xfail(reason="inferred as string"))
+
         expected = Series(["x", td], index=[0, "td"], dtype=object)
 
         ser = Series(["x"])
@@ -595,13 +601,21 @@ def test_setitem_enlarge_with_na(
         expected = Series(expected_values, dtype=target_dtype)
         tm.assert_series_equal(ser, expected)
 
-    def test_setitem_enlargement_object_none(self, nulls_fixture):
+    def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string):
         # GH#48665
         ser = Series(["a", "b"])
         ser[3] = nulls_fixture
-        expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3])
+        dtype = (
+            "string[pyarrow_numpy]"
+            if using_infer_string and not isinstance(nulls_fixture, Decimal)
+            else object
+        )
+        expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype)
         tm.assert_series_equal(ser, expected)
-        assert ser[3] is nulls_fixture
+        if not using_infer_string:  # object dtype stores the null object itself
+            assert ser[3] is nulls_fixture
+        elif dtype is not object:  # string dtype hands back np.nan for any null
+            assert ser[3] is np.nan
 
 
 def test_setitem_scalar_into_readonly_backing_data():
@@ -845,20 +859,28 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace):
 
         self._check_inplace(is_inplace, orig, arr, obj)
 
-    def test_index_where(self, obj, key, expected, warn, val):
+    def test_index_where(self, obj, key, expected, warn, val, using_infer_string):
         mask = np.zeros(obj.shape, dtype=bool)
         mask[key] = True
 
-        res = Index(obj).where(~mask, val)
-        expected_idx = Index(expected, dtype=expected.dtype)
-        tm.assert_index_equal(res, expected_idx)
+        if using_infer_string and obj.dtype == object:
+            with pytest.raises(TypeError, match="Scalar must"):
+                Index(obj).where(~mask, val)
+        else:
+            res = Index(obj).where(~mask, val)
+            expected_idx = Index(expected, dtype=expected.dtype)
+            tm.assert_index_equal(res, expected_idx)
 
-    def test_index_putmask(self, obj, key, expected, warn, val):
+    def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string):
         mask = np.zeros(obj.shape, dtype=bool)
         mask[key] = True
 
-        res = Index(obj).putmask(mask, val)
-        tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))
+        if using_infer_string and obj.dtype == object:
+            with pytest.raises(TypeError, match="Scalar must"):
+                Index(obj).putmask(mask, val)
+        else:
+            res = Index(obj).putmask(mask, val)
+            tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))
 
 
 @pytest.mark.parametrize(

diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.core.dtypes.common import is_integer
 
 import pandas as pd
@@ -230,6 +232,7 @@ def test_where_ndframe_align():
     tm.assert_series_equal(out, expected)
 
 
+@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string")
 def test_where_setitem_invalid():
     # GH 2702
     # make sure correct exceptions are raised on invalid list assignment

diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
@@ -75,7 +75,7 @@ def test_astype_dict_like(self, dtype_class):
 
         dt1 = dtype_class({"abc": str})
         result = ser.astype(dt1)
-        expected = Series(["0", "2", "4", "6", "8"], name="abc")
+        expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object)
         tm.assert_series_equal(result, expected)
 
         dt2 = dtype_class({"abc": "float64"})
@@ -163,10 +163,12 @@ def test_astype_empty_constructor_equality(self, dtype):
             Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]),
         ],
     )
-    def test_astype_str_map(self, dtype, series):
+    def test_astype_str_map(self, dtype, series, using_infer_string):
         # see GH#4405
         result = series.astype(dtype)
         expected = series.map(str)
+        if using_infer_string:
+            expected = expected.astype(object)
         tm.assert_series_equal(result, expected)
 
     def test_astype_float_to_period(self):
@@ -276,13 +278,13 @@ def test_astype_str_cast_dt64(self):
         ts = Series([Timestamp("2010-01-04 00:00:00")])
         res = ts.astype(str)
 
-        expected = Series(["2010-01-04"])
+        expected = Series(["2010-01-04"], dtype=object)
         tm.assert_series_equal(res, expected)
 
         ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
         res = ts.astype(str)
 
-        expected = Series(["2010-01-04 00:00:00-05:00"])
+        expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object)
         tm.assert_series_equal(res, expected)
 
     def test_astype_str_cast_td64(self):
@@ -291,7 +293,7 @@ def test_astype_str_cast_td64(self):
         td = Series([Timedelta(1, unit="d")])
         ser = td.astype(str)
 
-        expected = Series(["1 days"])
+        expected = Series(["1 days"], dtype=object)
         tm.assert_series_equal(ser, expected)
 
     def test_dt64_series_astype_object(self):
@@ -338,7 +340,7 @@ def test_astype_from_float_to_str(self, dtype):
         # https://github.com/pandas-dev/pandas/issues/36451
         ser = Series([0.1], dtype=dtype)
         result = ser.astype(str)
-        expected = Series(["0.1"])
+        expected = Series(["0.1"], dtype=object)
         tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -409,7 +411,7 @@ def test_astype_cast_object_int(self):
 
         tm.assert_series_equal(result, Series(np.arange(1, 5)))
 
-    def test_astype_unicode(self):
+    def test_astype_unicode(self, using_infer_string):
         # see GH#7758: A bit of magic is required to set
         # default encoding to utf-8
         digits = string.digits
@@ -426,12 +428,14 @@ def test_astype_unicode(self):
             item = "野菜食べないとやばい"
             ser = Series([item.encode()])
             result = ser.astype(np.str_)
-            expected = Series([item])
+            expected = Series([item], dtype=object)
             tm.assert_series_equal(result, expected)
 
         for ser in test_series:
             res = ser.astype(np.str_)
             expec = ser.map(str)
+            if using_infer_string:
+                expec = expec.astype(object)
             tm.assert_series_equal(res, expec)
 
         # Restore the former encoding
@@ -527,12 +531,12 @@ def test_astype_categorical_to_other(self):
         expected = ser
         tm.assert_series_equal(ser.astype("category"), expected)
         tm.assert_series_equal(ser.astype(CategoricalDtype()), expected)
-        msg = r"Cannot cast object dtype to float64"
+        msg = r"Cannot cast object|string dtype to float64"
         with pytest.raises(ValueError, match=msg):
             ser.astype("float64")
 
         cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
-        exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
+        exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object)
         tm.assert_series_equal(cat.astype("str"), exp)
         s2 = Series(Categorical(["1", "2", "3", "4"]))
         exp2 = Series([1, 2, 3, 4]).astype("int")

diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py
@@ -53,7 +53,7 @@ def test_combine_first(self):
         # mixed types
         index = tm.makeStringIndex(20)
         floats = Series(np.random.default_rng(2).standard_normal(20), index=index)
-        strings = Series(tm.makeStringIndex(10), index=index[::2])
+        strings = Series(tm.makeStringIndex(10), index=index[::2], dtype=object)
 
         combined = strings.combine_first(floats)
 

diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -186,6 +186,7 @@ def test_convert_dtypes(
         self,
         test_cases,
         params,
+        using_infer_string,
     ):
         data, maindtype, expected_default, expected_other = test_cases
         if (
@@ -219,6 +220,16 @@ def test_convert_dtypes(
         for spec, dtype in expected_other.items():
             if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
                 expected_dtype = dtype
+        if (
+            using_infer_string
+            and expected_default == "string"
+            and expected_dtype == object
+            and params[0]
+            and not params[1]
+        ):
+            # If we would convert with convert strings then infer_objects converts
+            # with the option
+            expected_dtype = "string[pyarrow_numpy]"
 
         expected = pd.Series(data, dtype=expected_dtype)
         tm.assert_series_equal(result, expected)