Skip to content

Adjust Series specific tests for string option #55538

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Nov 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2022,6 +2022,14 @@ def warn_copy_on_write() -> bool:
)


@pytest.fixture
def using_infer_string() -> bool:
    """
    Fixture to check if infer_string is enabled.

    Returns
    -------
    bool
        The current value of ``pd.options.future.infer_string``.
    """
    return pd.options.future.infer_string


warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"]
if zoneinfo is not None:
warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type]
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/series/accessors/test_dt_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,13 +586,15 @@ def test_strftime_dt64_days(self):
# dtype may be S10 or U10 depending on python version
tm.assert_index_equal(result, expected)

def test_strftime_period_days(self, using_infer_string):
    # strftime on a 5-period range starting 2015-03-01 should produce the
    # "%Y/%m/%d"-formatted strings below
    period_index = period_range("20150301", periods=5)
    result = period_index.strftime("%Y/%m/%d")
    expected = Index(
        ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"],
        dtype="=U10",
    )
    if using_infer_string:
        # with the infer_string option enabled the expected index is compared
        # as the pyarrow-backed string dtype
        expected = expected.astype("string[pyarrow_numpy]")
    tm.assert_index_equal(result, expected)

def test_strftime_dt64_microsecond_resolution(self):
Expand Down
15 changes: 6 additions & 9 deletions pandas/tests/series/indexing/test_delitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,16 @@ def test_delitem(self):
del s[0]
tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64")))

def test_delitem_object_index(self, using_infer_string):
    # Index(dtype=object), or the pyarrow-backed string dtype when the
    # infer_string option is enabled; the index dtype is pinned explicitly
    # so the test does not depend on inference
    dtype = "string[pyarrow_numpy]" if using_infer_string else object

    s = Series(1, index=Index(["a"], dtype=dtype))
    del s["a"]
    # deleting the only label leaves an empty int64 Series whose index
    # keeps its dtype
    tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype)))

    # re-adding the label and deleting it again must round-trip
    s["a"] = 1
    tm.assert_series_equal(s, Series(1, index=Index(["a"], dtype=dtype)))
    del s["a"]
    tm.assert_series_equal(s, Series(dtype="int64", index=Index([], dtype=dtype)))

def test_delitem_missing_key(self):
# empty
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/series/indexing/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_getitem_unrecognized_scalar(self):
def test_getitem_negative_out_of_bounds(self):
ser = Series(["a"] * 10, index=["a"] * 10)

msg = "index -11 is out of bounds for axis 0 with size 10"
msg = "index -11 is out of bounds for axis 0 with size 10|index out of bounds"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use a "|".join pattern?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't feel strongly about this, but I prefer the pipe pattern when there are only two options.

warn_msg = "Series.__getitem__ treating keys as positions is deprecated"
with pytest.raises(IndexError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
Expand Down Expand Up @@ -363,7 +363,9 @@ def test_getitem_no_matches(self, box):
key = Series(["C"], dtype=object)
key = box(key)

msg = r"None of \[Index\(\['C'\], dtype='object'\)\] are in the \[index\]"
msg = (
r"None of \[Index\(\['C'\], dtype='object|string'\)\] are in the \[index\]"
)
with pytest.raises(KeyError, match=msg):
ser[key]

Expand Down Expand Up @@ -437,7 +439,7 @@ def test_getitem_boolean_empty(self):

# GH#5877
# indexing with empty series
ser = Series(["A", "B"])
ser = Series(["A", "B"], dtype=object)
expected = Series(dtype=object, index=Index([], dtype="int64"))
result = ser[Series([], dtype=object)]
tm.assert_series_equal(result, expected)
Expand Down
46 changes: 34 additions & 12 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
date,
datetime,
)
from decimal import Decimal

import numpy as np
import pytest
Expand Down Expand Up @@ -175,7 +176,8 @@ class TestSetitemScalarIndexer:
def test_setitem_negative_out_of_bounds(self):
ser = Series(["a"] * 10, index=["a"] * 10)

msg = "index -11 is out of bounds for axis 0 with size 10"
# string index falls back to positional
msg = "index -11|-1 is out of bounds for axis 0 with size 10"
warn_msg = "Series.__setitem__ treating keys as positions is deprecated"
with pytest.raises(IndexError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
Expand Down Expand Up @@ -527,8 +529,12 @@ def test_setitem_empty_series_timestamp_preserves_dtype(self):
Timedelta("9 days").to_pytimedelta(),
],
)
def test_append_timedelta_does_not_cast(self, td):
def test_append_timedelta_does_not_cast(self, td, using_infer_string, request):
# GH#22717 inserting a Timedelta should _not_ cast to int64
if using_infer_string and not isinstance(td, Timedelta):
# TODO: GH#56010
request.applymarker(pytest.mark.xfail(reason="inferred as string"))

expected = Series(["x", td], index=[0, "td"], dtype=object)

ser = Series(["x"])
Expand Down Expand Up @@ -595,13 +601,21 @@ def test_setitem_enlarge_with_na(
expected = Series(expected_values, dtype=target_dtype)
tm.assert_series_equal(ser, expected)

def test_setitem_enlargement_object_none(self, nulls_fixture, using_infer_string):
    # GH#48665
    # Enlarging a string Series by setting a null at a new label.
    ser = Series(["a", "b"])
    ser[3] = nulls_fixture

    # A Decimal null is excluded here: the expected dtype stays object for it
    # even when the infer_string option is enabled.
    is_string_dtype = using_infer_string and not isinstance(nulls_fixture, Decimal)
    dtype = "string[pyarrow_numpy]" if is_string_dtype else object

    expected = Series(["a", "b", nulls_fixture], index=[0, 1, 3], dtype=dtype)
    tm.assert_series_equal(ser, expected)

    if is_string_dtype:
        # NOTE(review): this comparison previously lacked ``assert`` and so
        # checked nothing. Presumably the string dtype hands back NaN for any
        # null that was set -- confirm against the string array implementation.
        assert ser[3] is np.nan
    else:
        # object dtype stores the very object that was assigned
        assert ser[3] is nulls_fixture


def test_setitem_scalar_into_readonly_backing_data():
Expand Down Expand Up @@ -845,20 +859,28 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace):

self._check_inplace(is_inplace, orig, arr, obj)

def test_index_where(self, obj, key, expected, warn, val, using_infer_string):
    # Build a boolean mask that is True exactly at the positions selected
    # by ``key``; ``where`` keeps values where ``~mask`` holds and puts
    # ``val`` elsewhere.
    mask = np.zeros(obj.shape, dtype=bool)
    mask[key] = True

    if using_infer_string and obj.dtype == object:
        # With infer_string enabled, object-dtype input is expected to
        # raise instead of producing a result.
        with pytest.raises(TypeError, match="Scalar must"):
            Index(obj).where(~mask, val)
    else:
        res = Index(obj).where(~mask, val)
        expected_idx = Index(expected, dtype=expected.dtype)
        tm.assert_index_equal(res, expected_idx)

def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string):
    # Build a boolean mask that is True exactly at the positions selected
    # by ``key``; ``putmask`` writes ``val`` at those positions.
    mask = np.zeros(obj.shape, dtype=bool)
    mask[key] = True

    if using_infer_string and obj.dtype == object:
        # With infer_string enabled, object-dtype input is expected to
        # raise instead of producing a result.
        with pytest.raises(TypeError, match="Scalar must"):
            Index(obj).putmask(mask, val)
    else:
        res = Index(obj).putmask(mask, val)
        tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))


@pytest.mark.parametrize(
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/series/indexing/test_where.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas.core.dtypes.common import is_integer

import pandas as pd
Expand Down Expand Up @@ -230,6 +232,7 @@ def test_where_ndframe_align():
tm.assert_series_equal(out, expected)


@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set ints into string")
def test_where_setitem_invalid():
# GH 2702
# make sure correct exceptions are raised on invalid list assignment
Expand Down
24 changes: 14 additions & 10 deletions pandas/tests/series/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_astype_dict_like(self, dtype_class):

dt1 = dtype_class({"abc": str})
result = ser.astype(dt1)
expected = Series(["0", "2", "4", "6", "8"], name="abc")
expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object)
tm.assert_series_equal(result, expected)

dt2 = dtype_class({"abc": "float64"})
Expand Down Expand Up @@ -163,10 +163,12 @@ def test_astype_empty_constructor_equality(self, dtype):
Series([string.digits * 10, rand_str(63), rand_str(64), np.nan, 1.0]),
],
)
def test_astype_str_map(self, dtype, series):
def test_astype_str_map(self, dtype, series, using_infer_string):
# see GH#4405
result = series.astype(dtype)
expected = series.map(str)
if using_infer_string:
expected = expected.astype(object)
tm.assert_series_equal(result, expected)

def test_astype_float_to_period(self):
Expand Down Expand Up @@ -276,13 +278,13 @@ def test_astype_str_cast_dt64(self):
ts = Series([Timestamp("2010-01-04 00:00:00")])
res = ts.astype(str)

expected = Series(["2010-01-04"])
expected = Series(["2010-01-04"], dtype=object)
tm.assert_series_equal(res, expected)

ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
res = ts.astype(str)

expected = Series(["2010-01-04 00:00:00-05:00"])
expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object)
tm.assert_series_equal(res, expected)

def test_astype_str_cast_td64(self):
Expand All @@ -291,7 +293,7 @@ def test_astype_str_cast_td64(self):
td = Series([Timedelta(1, unit="d")])
ser = td.astype(str)

expected = Series(["1 days"])
expected = Series(["1 days"], dtype=object)
tm.assert_series_equal(ser, expected)

def test_dt64_series_astype_object(self):
Expand Down Expand Up @@ -338,7 +340,7 @@ def test_astype_from_float_to_str(self, dtype):
# https://github.com/pandas-dev/pandas/issues/36451
ser = Series([0.1], dtype=dtype)
result = ser.astype(str)
expected = Series(["0.1"])
expected = Series(["0.1"], dtype=object)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -409,7 +411,7 @@ def test_astype_cast_object_int(self):

tm.assert_series_equal(result, Series(np.arange(1, 5)))

def test_astype_unicode(self):
def test_astype_unicode(self, using_infer_string):
# see GH#7758: A bit of magic is required to set
# default encoding to utf-8
digits = string.digits
Expand All @@ -426,12 +428,14 @@ def test_astype_unicode(self):
item = "野菜食べないとやばい"
ser = Series([item.encode()])
result = ser.astype(np.str_)
expected = Series([item])
expected = Series([item], dtype=object)
tm.assert_series_equal(result, expected)

for ser in test_series:
res = ser.astype(np.str_)
expec = ser.map(str)
if using_infer_string:
expec = expec.astype(object)
tm.assert_series_equal(res, expec)

# Restore the former encoding
Expand Down Expand Up @@ -527,12 +531,12 @@ def test_astype_categorical_to_other(self):
expected = ser
tm.assert_series_equal(ser.astype("category"), expected)
tm.assert_series_equal(ser.astype(CategoricalDtype()), expected)
msg = r"Cannot cast object dtype to float64"
msg = r"Cannot cast object|string dtype to float64"
with pytest.raises(ValueError, match=msg):
ser.astype("float64")

cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object)
tm.assert_series_equal(cat.astype("str"), exp)
s2 = Series(Categorical(["1", "2", "3", "4"]))
exp2 = Series([1, 2, 3, 4]).astype("int")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def test_combine_first(self):
# mixed types
index = tm.makeStringIndex(20)
floats = Series(np.random.default_rng(2).standard_normal(20), index=index)
strings = Series(tm.makeStringIndex(10), index=index[::2])
strings = Series(tm.makeStringIndex(10), index=index[::2], dtype=object)

combined = strings.combine_first(floats)

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def test_convert_dtypes(
self,
test_cases,
params,
using_infer_string,
):
data, maindtype, expected_default, expected_other = test_cases
if (
Expand Down Expand Up @@ -219,6 +220,16 @@ def test_convert_dtypes(
for spec, dtype in expected_other.items():
if all(params_dict[key] is val for key, val in zip(spec[::2], spec[1::2])):
expected_dtype = dtype
if (
using_infer_string
and expected_default == "string"
and expected_dtype == object
and params[0]
and not params[1]
):
# If we would convert with convert strings then infer_objects converts
# with the option
expected_dtype = "string[pyarrow_numpy]"

expected = pd.Series(data, dtype=expected_dtype)
tm.assert_series_equal(result, expected)
Expand Down
Loading