Skip to content

Commit 72b7a6e

Browse files
committed
Merge branch 'main' into tst-str-gb
2 parents 4224a52 + d36c589 commit 72b7a6e

18 files changed

+188
-256
lines changed

ci/code_checks.sh

-6
Original file line numberDiff line numberDiff line change
@@ -158,15 +158,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
158158
-i "pandas.Series.sparse.sp_values SA01" \
159159
-i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \
160160
-i "pandas.Series.std PR01,RT03,SA01" \
161-
-i "pandas.Series.str.lstrip RT03" \
162161
-i "pandas.Series.str.match RT03" \
163162
-i "pandas.Series.str.normalize RT03,SA01" \
164-
-i "pandas.Series.str.partition RT03" \
165163
-i "pandas.Series.str.repeat SA01" \
166164
-i "pandas.Series.str.replace SA01" \
167-
-i "pandas.Series.str.rpartition RT03" \
168-
-i "pandas.Series.str.rstrip RT03" \
169-
-i "pandas.Series.str.strip RT03" \
170165
-i "pandas.Series.str.wrap RT03,SA01" \
171166
-i "pandas.Series.str.zfill RT03" \
172167
-i "pandas.Series.struct.dtypes SA01" \
@@ -244,7 +239,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
244239
-i "pandas.api.extensions.ExtensionArray.view SA01" \
245240
-i "pandas.api.interchange.from_dataframe RT03,SA01" \
246241
-i "pandas.api.types.is_bool PR01,SA01" \
247-
-i "pandas.api.types.is_bool_dtype SA01" \
248242
-i "pandas.api.types.is_categorical_dtype SA01" \
249243
-i "pandas.api.types.is_complex PR01,SA01" \
250244
-i "pandas.api.types.is_complex_dtype SA01" \

doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Other enhancements
5353
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
5454
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5555
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
56+
- Support passing an ``Iterable[Hashable]`` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
5657
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
5758
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
5859

@@ -667,6 +668,7 @@ ExtensionArray
667668
^^^^^^^^^^^^^^
668669
- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
669670
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
671+
- Bug in comparisons between an object with :class:`ArrowDtype` and an object with an incompatible dtype (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
670672
- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)
671673

672674
Styler

pandas/core/arrays/arrow/array.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -709,7 +709,13 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
709709
if isinstance(
710710
other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
711711
) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
712-
result = pc_func(self._pa_array, self._box_pa(other))
712+
try:
713+
result = pc_func(self._pa_array, self._box_pa(other))
714+
except pa.ArrowNotImplementedError:
715+
# TODO: could this be wrong if other is object dtype?
716+
# in which case we need to operate pointwise?
717+
result = ops.invalid_comparison(self, other, op)
718+
result = pa.array(result, type=pa.bool_())
713719
elif is_scalar(other):
714720
try:
715721
result = pc_func(self._pa_array, self._box_pa(other))

pandas/core/arrays/string_arrow.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
BaseStringArray,
3737
StringDtype,
3838
)
39-
from pandas.core.ops import invalid_comparison
4039
from pandas.core.strings.object_array import ObjectStringArrayMixin
4140

4241
if not pa_version_under10p1:
@@ -563,10 +562,7 @@ def _convert_int_dtype(self, result):
563562
return result
564563

565564
def _cmp_method(self, other, op):
566-
try:
567-
result = super()._cmp_method(other, op)
568-
except pa.ArrowNotImplementedError:
569-
return invalid_comparison(self, other, op)
565+
result = super()._cmp_method(other, op)
570566
if op == operator.ne:
571567
return result.to_numpy(np.bool_, na_value=True)
572568
else:

pandas/core/dtypes/common.py

+8
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
12741274
"""
12751275
Check whether the provided array or dtype is of a boolean dtype.
12761276
1277+
This function verifies whether a given object is a boolean data type. The input
1278+
can be an array or a dtype object. Accepted array types include instances
1279+
of ``np.ndarray``, ``pd.Series``, ``pd.Index``, and similar array-like structures.
1280+
12771281
Parameters
12781282
----------
12791283
arr_or_dtype : array-like or dtype
@@ -1284,6 +1288,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
12841288
boolean
12851289
Whether or not the array or dtype is of a boolean dtype.
12861290
1291+
See Also
1292+
--------
1293+
api.types.is_bool : Check if an object is a boolean.
1294+
12871295
Notes
12881296
-----
12891297
An ExtensionArray is considered boolean when the ``_is_boolean``

pandas/core/frame.py

+10-13
Original file line numberDiff line numberDiff line change
@@ -6406,7 +6406,7 @@ def dropna(
64066406
64076407
thresh : int, optional
64086408
Require that many non-NA values. Cannot be combined with how.
6409-
subset : column label or sequence of labels, optional
6409+
subset : column label or iterable of labels, optional
64106410
Labels along other axis to consider, e.g. if you are dropping rows
64116411
these would be a list of columns to include.
64126412
inplace : bool, default False
@@ -6536,7 +6536,7 @@ def dropna(
65366536
@overload
65376537
def drop_duplicates(
65386538
self,
6539-
subset: Hashable | Sequence[Hashable] | None = ...,
6539+
subset: Hashable | Iterable[Hashable] | None = ...,
65406540
*,
65416541
keep: DropKeep = ...,
65426542
inplace: Literal[True],
@@ -6546,7 +6546,7 @@ def drop_duplicates(
65466546
@overload
65476547
def drop_duplicates(
65486548
self,
6549-
subset: Hashable | Sequence[Hashable] | None = ...,
6549+
subset: Hashable | Iterable[Hashable] | None = ...,
65506550
*,
65516551
keep: DropKeep = ...,
65526552
inplace: Literal[False] = ...,
@@ -6556,7 +6556,7 @@ def drop_duplicates(
65566556
@overload
65576557
def drop_duplicates(
65586558
self,
6559-
subset: Hashable | Sequence[Hashable] | None = ...,
6559+
subset: Hashable | Iterable[Hashable] | None = ...,
65606560
*,
65616561
keep: DropKeep = ...,
65626562
inplace: bool = ...,
@@ -6565,7 +6565,7 @@ def drop_duplicates(
65656565

65666566
def drop_duplicates(
65676567
self,
6568-
subset: Hashable | Sequence[Hashable] | None = None,
6568+
subset: Hashable | Iterable[Hashable] | None = None,
65696569
*,
65706570
keep: DropKeep = "first",
65716571
inplace: bool = False,
@@ -6579,7 +6579,7 @@ def drop_duplicates(
65796579
65806580
Parameters
65816581
----------
6582-
subset : column label or sequence of labels, optional
6582+
subset : column label or iterable of labels, optional
65836583
Only consider certain columns for identifying duplicates, by
65846584
default use all of the columns.
65856585
keep : {'first', 'last', ``False``}, default 'first'
@@ -6669,7 +6669,7 @@ def drop_duplicates(
66696669

66706670
def duplicated(
66716671
self,
6672-
subset: Hashable | Sequence[Hashable] | None = None,
6672+
subset: Hashable | Iterable[Hashable] | None = None,
66736673
keep: DropKeep = "first",
66746674
) -> Series:
66756675
"""
@@ -6679,7 +6679,7 @@ def duplicated(
66796679
66806680
Parameters
66816681
----------
6682-
subset : column label or sequence of labels, optional
6682+
subset : column label or iterable of labels, optional
66836683
Only consider certain columns for identifying duplicates, by
66846684
default use all of the columns.
66856685
keep : {'first', 'last', False}, default 'first'
@@ -6771,10 +6771,7 @@ def f(vals) -> tuple[np.ndarray, int]:
67716771
return labels.astype("i8"), len(shape)
67726772

67736773
if subset is None:
6774-
# https://github.com/pandas-dev/pandas/issues/28770
6775-
# Incompatible types in assignment (expression has type "Index", variable
6776-
# has type "Sequence[Any]")
6777-
subset = self.columns # type: ignore[assignment]
6774+
subset = self.columns
67786775
elif (
67796776
not np.iterable(subset)
67806777
or isinstance(subset, str)
@@ -6795,7 +6792,7 @@ def f(vals) -> tuple[np.ndarray, int]:
67956792

67966793
if len(subset) == 1 and self.columns.is_unique:
67976794
# GH#45236 This is faster than get_group_index below
6798-
result = self[subset[0]].duplicated(keep)
6795+
result = self[next(iter(subset))].duplicated(keep)
67996796
result.name = None
68006797
else:
68016798
vals = (col.values for name, col in self.items() if name in subset)

pandas/core/strings/accessor.py

+3
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False):
969969
Returns
970970
-------
971971
DataFrame/MultiIndex or Series/Index of objects
972+
Returns the appropriate type based on the `expand` parameter, with strings
973+
split based on the `sep` parameter.
972974
973975
See Also
974976
--------
@@ -2127,6 +2129,7 @@ def encode(self, encoding, errors: str = "strict"):
21272129
Returns
21282130
-------
21292131
Series or Index of object
2132+
Series or Index with the strings being stripped from the %(side)s.
21302133
21312134
See Also
21322135
--------

pandas/tests/frame/methods/test_drop_duplicates.py

+38
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg):
476476
msg = '^For argument "ignore_index" expected type bool, received type .*.$'
477477
with pytest.raises(ValueError, match=msg):
478478
df.drop_duplicates(ignore_index=arg)
479+
480+
481+
def test_drop_duplicates_set():
482+
# GH#59237
483+
df = DataFrame(
484+
{
485+
"AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
486+
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
487+
"C": [1, 1, 2, 2, 2, 2, 1, 2],
488+
"D": range(8),
489+
}
490+
)
491+
# single column
492+
result = df.drop_duplicates({"AAA"})
493+
expected = df[:2]
494+
tm.assert_frame_equal(result, expected)
495+
496+
result = df.drop_duplicates({"AAA"}, keep="last")
497+
expected = df.loc[[6, 7]]
498+
tm.assert_frame_equal(result, expected)
499+
500+
result = df.drop_duplicates({"AAA"}, keep=False)
501+
expected = df.loc[[]]
502+
tm.assert_frame_equal(result, expected)
503+
assert len(result) == 0
504+
505+
# multi column
506+
expected = df.loc[[0, 1, 2, 3]]
507+
result = df.drop_duplicates({"AAA", "B"})
508+
tm.assert_frame_equal(result, expected)
509+
510+
result = df.drop_duplicates({"AAA", "B"}, keep="last")
511+
expected = df.loc[[0, 5, 6, 7]]
512+
tm.assert_frame_equal(result, expected)
513+
514+
result = df.drop_duplicates({"AAA", "B"}, keep=False)
515+
expected = df.loc[[0]]
516+
tm.assert_frame_equal(result, expected)

pandas/tests/io/excel/test_readers.py

+16-30
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,6 @@
3030
read_csv,
3131
)
3232
import pandas._testing as tm
33-
from pandas.core.arrays import (
34-
ArrowStringArray,
35-
StringArray,
36-
)
3733

3834
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
3935
engine_params = [
@@ -692,43 +688,33 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel):
692688
)
693689
tm.assert_frame_equal(result, df)
694690

695-
@pytest.mark.xfail(
696-
using_string_dtype(), reason="infer_string takes precedence", strict=False
697-
)
698691
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
699692
# GH#36712
700693
if read_ext in (".xlsb", ".xls"):
701694
pytest.skip(f"No engine for filetype: '{read_ext}'")
702695

703-
pa = pytest.importorskip("pyarrow")
696+
df = DataFrame(
697+
{
698+
"a": np.array(["a", "b"], dtype=np.object_),
699+
"b": np.array(["x", pd.NA], dtype=np.object_),
700+
}
701+
)
702+
df.to_excel(tmp_excel, sheet_name="test", index=False)
704703

705704
with pd.option_context("mode.string_storage", string_storage):
706-
df = DataFrame(
707-
{
708-
"a": np.array(["a", "b"], dtype=np.object_),
709-
"b": np.array(["x", pd.NA], dtype=np.object_),
710-
}
711-
)
712-
df.to_excel(tmp_excel, sheet_name="test", index=False)
713705
result = pd.read_excel(
714706
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
715707
)
716708

717-
if string_storage == "python":
718-
expected = DataFrame(
719-
{
720-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
721-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
722-
}
723-
)
724-
else:
725-
expected = DataFrame(
726-
{
727-
"a": ArrowStringArray(pa.array(["a", "b"])),
728-
"b": ArrowStringArray(pa.array(["x", None])),
729-
}
730-
)
731-
tm.assert_frame_equal(result, expected)
709+
expected = DataFrame(
710+
{
711+
"a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
712+
"b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
713+
}
714+
)
715+
# the storage of the str columns' Index is also affected by the
716+
# string_storage setting -> ignore that for checking the result
717+
tm.assert_frame_equal(result, expected, check_column_type=False)
732718

733719
@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
734720
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):

0 commit comments

Comments
 (0)