Skip to content

Commit 6159928

Browse files
Merge remote-tracking branch 'upstream/2.3.x' into backport-60985
2 parents 3ba6011 + f97ee3a commit 6159928

27 files changed

+245
-101
lines changed

pandas/core/arrays/base.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2386,7 +2386,14 @@ def _groupby_op(
23862386
if op.how not in ["any", "all"]:
23872387
# Fail early to avoid conversion to object
23882388
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
2389-
npvalues = self.to_numpy(object, na_value=np.nan)
2389+
2390+
arr = self
2391+
if op.how == "sum":
2392+
# https://github.com/pandas-dev/pandas/issues/60229
2393+
# All NA should result in the empty string.
2394+
if min_count == 0:
2395+
arr = arr.fillna("")
2396+
npvalues = arr.to_numpy(object, na_value=np.nan)
23902397
else:
23912398
raise NotImplementedError(
23922399
f"function is not implemented for this dtype: {self.dtype}"

pandas/core/indexes/base.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6387,6 +6387,36 @@ def _find_common_type_compat(self, target) -> DtypeObj:
63876387
"""
63886388
target_dtype, _ = infer_dtype_from(target)
63896389

6390+
if using_string_dtype():
6391+
# special case: if left or right is a zero-length RangeIndex or
6392+
# Index[object], those can be created by the default empty constructors
6393+
# -> for that case ignore this dtype and always return the other
6394+
# (https://github.com/pandas-dev/pandas/pull/60797)
6395+
from pandas.core.indexes.range import RangeIndex
6396+
6397+
if len(self) == 0 and (
6398+
isinstance(self, RangeIndex) or self.dtype == np.object_
6399+
):
6400+
if target_dtype.kind == "M":
6401+
if hasattr(target_dtype, "tz"):
6402+
target_dtype_ns = DatetimeTZDtype("ns", tz=target_dtype.tz)
6403+
else:
6404+
target_dtype_ns = np.dtype("datetime64[ns]") # type: ignore[assignment]
6405+
try:
6406+
Index(target, dtype=target_dtype_ns, copy=False)
6407+
except OutOfBoundsDatetime:
6408+
return np.dtype(object)
6409+
except Exception:
6410+
pass
6411+
return target_dtype_ns
6412+
return target_dtype
6413+
if (
6414+
isinstance(target, Index)
6415+
and len(target) == 0
6416+
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
6417+
):
6418+
return self.dtype
6419+
63906420
# special case: if one dtype is uint64 and the other a signed int, return object
63916421
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
63926422
# Now it's:
@@ -7005,6 +7035,14 @@ def insert(self, loc: int, item) -> Index:
70057035

70067036
arr = self._values
70077037

7038+
if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
7039+
# special case: if we are an empty object-dtype Index, also
7040+
# take into account the inserted item for the resulting dtype
7041+
# (https://github.com/pandas-dev/pandas/pull/60797)
7042+
dtype = self._find_common_type_compat(item)
7043+
if dtype != self.dtype:
7044+
return self.astype(dtype).insert(loc, item)
7045+
70087046
try:
70097047
if isinstance(arr, ExtensionArray):
70107048
res_values = arr.insert(loc, item)

pandas/tests/frame/constructors/test_from_dict.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas import (
97
DataFrame,
108
Index,
@@ -44,7 +42,6 @@ def test_constructor_single_row(self):
4442
)
4543
tm.assert_frame_equal(result, expected)
4644

47-
@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
4845
def test_constructor_list_of_series(self):
4946
data = [
5047
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),

pandas/tests/frame/indexing/test_coercion.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,7 @@ def test_26395(indexer_al):
103103
df["D"] = 0
104104

105105
indexer_al(df)["C", "D"] = 2
106-
expected = DataFrame(
107-
{"D": [0, 0, 2]},
108-
index=["A", "B", "C"],
109-
columns=pd.Index(["D"], dtype=object),
110-
dtype=np.int64,
111-
)
106+
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
112107
tm.assert_frame_equal(df, expected)
113108

114109
with tm.assert_produces_warning(
@@ -118,7 +113,7 @@ def test_26395(indexer_al):
118113
expected = DataFrame(
119114
{"D": [0, 0, 44.5]},
120115
index=["A", "B", "C"],
121-
columns=pd.Index(["D"], dtype=object),
116+
columns=["D"],
122117
dtype=np.float64,
123118
)
124119
tm.assert_frame_equal(df, expected)
@@ -130,7 +125,7 @@ def test_26395(indexer_al):
130125
expected = DataFrame(
131126
{"D": [0, 0, "hello"]},
132127
index=["A", "B", "C"],
133-
columns=pd.Index(["D"], dtype=object),
128+
columns=["D"],
134129
dtype=object,
135130
)
136131
tm.assert_frame_equal(df, expected)

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1206,7 +1206,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
12061206
result = df.dtypes
12071207
expected = Series(
12081208
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
1209-
index=Index(list("ABCDEFGH"), dtype=object),
1209+
index=list("ABCDEFGH"),
12101210
)
12111211
tm.assert_series_equal(result, expected)
12121212

pandas/tests/frame/indexing/test_insert.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,7 @@ def test_insert_with_columns_dups(self):
6767
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
6868
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
6969
exp = DataFrame(
70-
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
71-
columns=Index(["A", "A", "A"], dtype=object),
70+
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
7271
)
7372
tm.assert_frame_equal(df, exp)
7473

pandas/tests/frame/indexing/test_setitem.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,18 +146,32 @@ def test_setitem_different_dtype(self):
146146
)
147147
tm.assert_series_equal(result, expected)
148148

149-
def test_setitem_empty_columns(self):
150-
# GH 13522
149+
def test_setitem_overwrite_index(self):
150+
# GH 13522 - assign the index as a column and then overwrite the values
151+
# -> should not affect the index
151152
df = DataFrame(index=["A", "B", "C"])
152153
df["X"] = df.index
153154
df["X"] = ["x", "y", "z"]
154155
exp = DataFrame(
155-
data={"X": ["x", "y", "z"]},
156-
index=["A", "B", "C"],
157-
columns=Index(["X"], dtype=object),
156+
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
158157
)
159158
tm.assert_frame_equal(df, exp)
160159

160+
def test_setitem_empty_columns(self):
161+
# Starting from an empty DataFrame and setting a column should result
162+
# in a default string dtype for the columns' Index
163+
# https://github.com/pandas-dev/pandas/issues/60338
164+
165+
df = DataFrame()
166+
df["foo"] = [1, 2, 3]
167+
expected = DataFrame({"foo": [1, 2, 3]})
168+
tm.assert_frame_equal(df, expected)
169+
170+
df = DataFrame(columns=Index([]))
171+
df["foo"] = [1, 2, 3]
172+
expected = DataFrame({"foo": [1, 2, 3]})
173+
tm.assert_frame_equal(df, expected)
174+
161175
def test_setitem_dt64_index_empty_columns(self):
162176
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
163177
df = DataFrame(index=np.arange(len(rng)))
@@ -171,9 +185,7 @@ def test_setitem_timestamp_empty_columns(self):
171185
df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")
172186

173187
expected = DataFrame(
174-
[[Timestamp("20130101", tz="UTC")]] * 3,
175-
index=range(3),
176-
columns=Index(["now"], dtype=object),
188+
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
177189
)
178190
tm.assert_frame_equal(df, expected)
179191

@@ -212,7 +224,7 @@ def test_setitem_period_preserves_dtype(self):
212224
result = DataFrame([])
213225
result["a"] = data
214226

215-
expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
227+
expected = DataFrame({"a": data}, columns=["a"])
216228

217229
tm.assert_frame_equal(result, expected)
218230

@@ -939,7 +951,7 @@ def test_setitem_scalars_no_index(self):
939951
# GH#16823 / GH#17894
940952
df = DataFrame()
941953
df["foo"] = 1
942-
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
954+
expected = DataFrame(columns=["foo"]).astype(np.int64)
943955
tm.assert_frame_equal(df, expected)
944956

945957
def test_setitem_newcol_tuple_key(self, float_frame):

pandas/tests/frame/methods/test_dropna.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
182182
with pytest.raises(TypeError, match="supplying multiple axes"):
183183
inp.dropna(how="all", axis=(0, 1), inplace=True)
184184

185-
def test_dropna_tz_aware_datetime(self, using_infer_string):
185+
def test_dropna_tz_aware_datetime(self):
186186
# GH13407
187-
188187
df = DataFrame()
189-
if using_infer_string:
190-
df.columns = df.columns.astype("str")
191188
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
192189
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
193190
df["Time"] = [dt1]

pandas/tests/frame/methods/test_reset_index.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
import numpy as np
55
import pytest
66

7-
from pandas._config import using_string_dtype
8-
97
from pandas.core.dtypes.common import (
108
is_float_dtype,
119
is_integer_dtype,
@@ -646,7 +644,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
646644
tm.assert_frame_equal(res, expected)
647645

648646

649-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
650647
@pytest.mark.parametrize(
651648
"array, dtype",
652649
[
@@ -783,3 +780,34 @@ def test_reset_index_false_index_name():
783780
result_frame.reset_index()
784781
expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
785782
tm.assert_frame_equal(result_frame, expected_frame)
783+
784+
785+
@pytest.mark.parametrize("columns", [None, Index([])])
786+
def test_reset_index_with_empty_frame(columns):
787+
# Currently empty DataFrame has RangeIndex or object dtype Index, but when
788+
# resetting the index we still want to end up with the default string dtype
789+
# https://github.com/pandas-dev/pandas/issues/60338
790+
791+
index = Index([], name="foo")
792+
df = DataFrame(index=index, columns=columns)
793+
result = df.reset_index()
794+
expected = DataFrame(columns=["foo"])
795+
tm.assert_frame_equal(result, expected)
796+
797+
index = Index([1, 2, 3], name="foo")
798+
df = DataFrame(index=index, columns=columns)
799+
result = df.reset_index()
800+
expected = DataFrame({"foo": [1, 2, 3]})
801+
tm.assert_frame_equal(result, expected)
802+
803+
index = MultiIndex.from_tuples([], names=["foo", "bar"])
804+
df = DataFrame(index=index, columns=columns)
805+
result = df.reset_index()
806+
expected = DataFrame(columns=["foo", "bar"])
807+
tm.assert_frame_equal(result, expected)
808+
809+
index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"])
810+
df = DataFrame(index=index, columns=columns)
811+
result = df.reset_index()
812+
expected = DataFrame({"foo": [1, 2], "bar": [2, 3]})
813+
tm.assert_frame_equal(result, expected)

pandas/tests/frame/test_constructors.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121
import pytest
2222
import pytz
2323

24-
from pandas._config import using_string_dtype
25-
2624
from pandas._libs import lib
2725
from pandas.compat.numpy import np_version_gt2
2826
from pandas.errors import IntCastingNaNError
@@ -2002,7 +2000,6 @@ def test_constructor_with_datetimes4(self):
20022000
df = DataFrame({"value": dr})
20032001
assert str(df.iat[0, 0].tz) == "US/Eastern"
20042002

2005-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
20062003
def test_constructor_with_datetimes5(self):
20072004
# GH 7822
20082005
# preserve an index with a tz on dict construction

pandas/tests/frame/test_query_eval.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
757757
tm.assert_frame_equal(result, expected)
758758

759759
expected = DataFrame(df_index)
760-
expected.columns = expected.columns.astype(object)
761760
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
762761
tm.assert_frame_equal(result, expected)
763762

pandas/tests/frame/test_reductions.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,16 @@ def test_axis_1_empty(self, all_reductions, index):
846846
expected = Series([], index=index, dtype=expected_dtype)
847847
tm.assert_series_equal(result, expected)
848848

849+
@pytest.mark.parametrize("min_count", [0, 1])
850+
def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
851+
# https://github.com/pandas-dev/pandas/issues/60229
852+
dtype = string_dtype_no_object
853+
df = DataFrame({"a": [pd.NA]}, dtype=dtype)
854+
result = df.sum(axis=1, skipna=skipna, min_count=min_count)
855+
value = "" if skipna and min_count == 0 else pd.NA
856+
expected = Series([value], dtype=dtype)
857+
tm.assert_series_equal(result, expected)
858+
849859
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
850860
@pytest.mark.parametrize("numeric_only", [None, True, False])
851861
def test_sum_prod_nanops(self, method, unit, numeric_only):

pandas/tests/groupby/test_groupby.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1623,7 +1623,7 @@ def test_groupby_2d_malformed():
16231623
d["label"] = ["l1", "l2"]
16241624
tmp = d.groupby(["group"]).mean(numeric_only=True)
16251625
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
1626-
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
1626+
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
16271627
tm.assert_numpy_array_equal(tmp.values, res_values)
16281628

16291629

pandas/tests/groupby/test_reductions.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,20 @@ def test_string_dtype_all_na(
798798
tm.assert_equal(result, expected)
799799

800800

801+
@pytest.mark.parametrize("min_count", [0, 1])
802+
def test_string_dtype_empty_sum(string_dtype_no_object, min_count):
803+
# https://github.com/pandas-dev/pandas/issues/60229
804+
dtype = string_dtype_no_object
805+
df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
806+
gb = df.groupby("a")
807+
result = gb.sum(min_count=min_count)
808+
value = "" if min_count == 0 else pd.NA
809+
expected = DataFrame(
810+
{"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype
811+
)
812+
tm.assert_frame_equal(result, expected)
813+
814+
801815
def test_max_nan_bug():
802816
df = DataFrame(
803817
{

pandas/tests/indexes/base_class/test_reshape.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def test_insert(self):
3434

3535
# test empty
3636
null_index = Index([])
37-
tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
37+
tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))
3838

3939
def test_insert_missing(self, request, nulls_fixture, using_infer_string):
4040
if using_infer_string and nulls_fixture is pd.NA:

pandas/tests/indexes/base_class/test_setops.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
240240
def test_union_name_preservation(
241241
self, first_list, second_list, first_name, second_name, expected_name, sort
242242
):
243-
expected_dtype = object if not first_list or not second_list else "str"
244243
first = Index(first_list, name=first_name)
245244
second = Index(second_list, name=second_name)
246245
union = first.union(second, sort=sort)
@@ -251,7 +250,7 @@ def test_union_name_preservation(
251250
expected = Index(sorted(vals), name=expected_name)
252251
tm.assert_index_equal(union, expected)
253252
else:
254-
expected = Index(vals, name=expected_name, dtype=expected_dtype)
253+
expected = Index(vals, name=expected_name)
255254
tm.assert_index_equal(union.sort_values(), expected.sort_values())
256255

257256
@pytest.mark.parametrize(

pandas/tests/indexes/datetimes/test_join.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,17 @@ def test_join_utc_convert(self, join_type):
7070
assert isinstance(result, DatetimeIndex)
7171
assert result.tz is timezone.utc
7272

73-
def test_datetimeindex_union_join_empty(self, sort):
73+
def test_datetimeindex_union_join_empty(self, sort, using_infer_string):
7474
dti = date_range(start="1/1/2001", end="2/1/2001", freq="D")
7575
empty = Index([])
7676

7777
result = dti.union(empty, sort=sort)
78-
expected = dti.astype("O")
79-
tm.assert_index_equal(result, expected)
78+
if using_infer_string:
79+
assert isinstance(result, DatetimeIndex)
80+
tm.assert_index_equal(result, dti)
81+
else:
82+
expected = dti.astype("O")
83+
tm.assert_index_equal(result, expected)
8084

8185
result = dti.join(empty)
8286
assert isinstance(result, DatetimeIndex)

0 commit comments

Comments
 (0)