|
20 | 20 | isna,
|
21 | 21 | )
|
22 | 22 | import pandas._testing as tm
|
| 23 | +from pandas.tests.groupby import get_groupby_method_args |
23 | 24 | from pandas.util import _test_decorators as td
|
24 | 25 |
|
25 | 26 |
|
@@ -710,6 +711,93 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
|
710 | 711 | tm.assert_frame_equal(result, expected)
|
711 | 712 |
|
712 | 713 |
|
@pytest.mark.parametrize("min_count", [0, 1])
@pytest.mark.parametrize("test_series", [True, False])
def test_string_dtype_all_na(
    string_dtype_no_object, reduction_func, min_count, test_series
):
    """
    Run each groupby reduction over a single group whose only value is NA.

    A one-row frame is built with key column "a" (value "x") and value column
    "b" (value ``pd.NA``), both of the string dtype under test.  The reduction
    is applied to either the "b" Series or the whole frame, grouped by "a".
    Reductions in the unsupported set below must raise ``TypeError``; all
    others are compared against an explicitly constructed expected object.
    """
    # https://github.com/pandas-dev/pandas/issues/60985
    if reduction_func == "corrwith":
        # corrwith is deprecated.
        return

    dtype = string_dtype_no_object
    is_size = reduction_func == "size"

    # Reductions invoked without a ``min_count`` keyword; every other
    # reduction is called with the parametrized ``min_count``.
    called_without_min_count = {
        "any",
        "all",
        "idxmin",
        "idxmax",
        "mean",
        "median",
        "std",
        "var",
        "count",
        "nunique",
        "quantile",
        "sem",
        "size",
    }
    if reduction_func in called_without_min_count:
        kwargs = {}
    else:
        kwargs = {"min_count": min_count}

    # Work out the dtype and scalar the reduction is expected to produce.
    if reduction_func in ("all", "any"):
        # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
        expected_dtype = "bool"
        expected_value = reduction_func != "any"
    elif reduction_func in ("count", "nunique", "size"):
        # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
        masked_result = (
            test_series
            and is_size
            and dtype.storage == "pyarrow"
            and dtype.na_value is pd.NA
        )
        expected_dtype = "Int64" if masked_result else "int64"
        expected_value = 1 if is_size else 0
    elif reduction_func in ("idxmin", "idxmax"):
        expected_dtype, expected_value = "float64", np.nan
    elif min_count <= 0 and reduction_func == "sum":
        # https://github.com/pandas-dev/pandas/pull/60936
        expected_dtype, expected_value = dtype, ""
    else:
        expected_dtype, expected_value = dtype, pd.NA

    df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
    obj = df["b"] if test_series else df
    args = get_groupby_method_args(reduction_func, obj)
    method = getattr(obj.groupby(df["a"]), reduction_func)

    # These reductions are expected to reject the string dtype outright.
    unsupported = {
        "mean",
        "median",
        "kurt",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "var",
    }
    if reduction_func in unsupported:
        msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    result = method(*args, **kwargs)
    index = pd.Index(["x"], name="a", dtype=dtype)
    if test_series:
        expected = Series(expected_value, index=index, dtype=expected_dtype, name="b")
    elif is_size:
        # DataFrameGroupBy.size is compared against an unnamed Series.
        expected = Series(expected_value, index=index, dtype=expected_dtype, name=None)
    else:
        expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
    tm.assert_equal(result, expected)
| 799 | + |
| 800 | + |
713 | 801 | @pytest.mark.parametrize("min_count", [0, 1])
|
714 | 802 | def test_string_dtype_empty_sum(string_dtype_no_object, min_count):
|
715 | 803 | # https://github.com/pandas-dev/pandas/issues/60229
|
|
0 commit comments