Skip to content

Commit a969a17

Browse files
authored
Fix backward compatibility for pandas 1.1 and 1.2 (#2624)
1 parent 3756981 commit a969a17

File tree

15 files changed

+93
-69
lines changed

15 files changed

+93
-69
lines changed

mars/dataframe/reduction/aggregation.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,22 @@ def where_function(cond, var1, var2):
6565

6666

6767
_agg_functions = {
68-
"sum": lambda x, skipna=None: x.sum(skipna=skipna),
69-
"prod": lambda x, skipna=None: x.prod(skipna=skipna),
70-
"product": lambda x, skipna=None: x.product(skipna=skipna),
71-
"min": lambda x, skipna=None: x.min(skipna=skipna),
72-
"max": lambda x, skipna=None: x.max(skipna=skipna),
73-
"all": lambda x, skipna=None: x.all(skipna=skipna),
74-
"any": lambda x, skipna=None: x.any(skipna=skipna),
68+
"sum": lambda x, skipna=True: x.sum(skipna=skipna),
69+
"prod": lambda x, skipna=True: x.prod(skipna=skipna),
70+
"product": lambda x, skipna=True: x.product(skipna=skipna),
71+
"min": lambda x, skipna=True: x.min(skipna=skipna),
72+
"max": lambda x, skipna=True: x.max(skipna=skipna),
73+
"all": lambda x, skipna=True: x.all(skipna=skipna),
74+
"any": lambda x, skipna=True: x.any(skipna=skipna),
7575
"count": lambda x: x.count(),
7676
"size": lambda x: x._reduction_size(),
77-
"mean": lambda x, skipna=None: x.mean(skipna=skipna),
78-
"var": lambda x, skipna=None, ddof=1: x.var(skipna=skipna, ddof=ddof),
79-
"std": lambda x, skipna=None, ddof=1: x.std(skipna=skipna, ddof=ddof),
80-
"sem": lambda x, skipna=None, ddof=1: x.sem(skipna=skipna, ddof=ddof),
81-
"skew": lambda x, skipna=None, bias=False: x.skew(skipna=skipna, bias=bias),
82-
"kurt": lambda x, skipna=None, bias=False: x.kurt(skipna=skipna, bias=bias),
83-
"kurtosis": lambda x, skipna=None, bias=False: x.kurtosis(skipna=skipna, bias=bias),
77+
"mean": lambda x, skipna=True: x.mean(skipna=skipna),
78+
"var": lambda x, skipna=True, ddof=1: x.var(skipna=skipna, ddof=ddof),
79+
"std": lambda x, skipna=True, ddof=1: x.std(skipna=skipna, ddof=ddof),
80+
"sem": lambda x, skipna=True, ddof=1: x.sem(skipna=skipna, ddof=ddof),
81+
"skew": lambda x, skipna=True, bias=False: x.skew(skipna=skipna, bias=bias),
82+
"kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
83+
"kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
8484
}
8585

8686

@@ -291,7 +291,7 @@ def _gen_map_chunks(
291291
else:
292292
agg_chunks_shape = (len(func_infos), in_df.chunk_shape[1])
293293

294-
agg_chunks = np.empty(agg_chunks_shape, dtype=np.object)
294+
agg_chunks = np.empty(agg_chunks_shape, dtype=object)
295295
dtypes_cache = dict()
296296
for chunk in in_df.chunks:
297297
input_index = chunk.index[1 - axis] if len(chunk.index) > 1 else 0
@@ -504,7 +504,7 @@ def _tile_tree(cls, op: "DataFrameAggregate"):
504504
ceildiv(chunks.shape[1], combine_size),
505505
)
506506

507-
new_chunks = np.empty(new_chunks_shape, dtype=np.object)
507+
new_chunks = np.empty(new_chunks_shape, dtype=object)
508508
for idx0, i in enumerate(range(0, chunks.shape[axis], combine_size)):
509509
for idx1 in range(chunks.shape[1 - axis]):
510510
func_info = axis_func_infos[idx1]
@@ -761,6 +761,8 @@ def _do_predefined_agg(cls, op: "DataFrameAggregate", input_obj, func_name, kwds
761761
if op.gpu:
762762
if kwds.pop("numeric_only", None):
763763
raise NotImplementedError("numeric_only not implemented under cudf")
764+
if isinstance(input_obj, pd.Index):
765+
kwds.pop("skipna", None)
764766
return getattr(input_obj, func_name)(**kwds)
765767

766768
@classmethod

mars/dataframe/reduction/all.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def all_series(
3131
series,
3232
axis=None,
3333
bool_only=None,
34-
skipna=None,
34+
skipna=True,
3535
level=None,
3636
combine_size=None,
3737
method=None,
@@ -54,7 +54,7 @@ def all_dataframe(
5454
df,
5555
axis=None,
5656
bool_only=None,
57-
skipna=None,
57+
skipna=True,
5858
level=None,
5959
combine_size=None,
6060
method=None,

mars/dataframe/reduction/any.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def any_series(
3131
series,
3232
axis=None,
3333
bool_only=None,
34-
skipna=None,
34+
skipna=True,
3535
level=None,
3636
combine_size=None,
3737
method=None,
@@ -54,7 +54,7 @@ def any_dataframe(
5454
df,
5555
axis=None,
5656
bool_only=None,
57-
skipna=None,
57+
skipna=True,
5858
level=None,
5959
combine_size=None,
6060
method=None,

mars/dataframe/reduction/core.py

Lines changed: 50 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,15 @@
2828
recursive_tile,
2929
)
3030
from ...core.operand import OperandStage
31-
from ...utils import tokenize
31+
from ...lib.version import parse as parse_version
3232
from ...serialization.serializables import (
3333
BoolField,
3434
AnyField,
3535
DataTypeField,
3636
Int32Field,
3737
StringField,
3838
)
39+
from ...utils import tokenize
3940
from ..core import SERIES_TYPE
4041
from ..utils import (
4142
parse_index,
@@ -47,6 +48,14 @@
4748
)
4849
from ..operands import DataFrameOperandMixin, DataFrameOperand, DATAFRAME_TYPE
4950

51+
_pd_release = parse_version(pd.__version__).release[:2]
52+
# in pandas<1.3, when aggregating with multiple levels and numeric_only is True,
53+
# object columns are not ignored by the min / max functions
54+
_level_reduction_keep_object = _pd_release < (1, 3)
55+
# in pandas>=1.3, when dataframes are reduced into series, a mixture of float and bool
56+
# dtypes results in object dtype.
57+
_reduce_bool_as_object = _pd_release >= (1, 3)
58+
5059

5160
class DataFrameReductionOperand(DataFrameOperand):
5261
_axis = AnyField("axis")
@@ -211,22 +220,22 @@ def _get_series_reduction_dtype(
211220
func_name,
212221
axis=None,
213222
bool_only=False,
214-
skipna=False,
223+
skipna=True,
215224
numeric_only=False,
216225
):
217-
empty_series = build_series(dtype=dtype, ensure_string=True)
226+
test_series = build_series(dtype=dtype, ensure_string=True)
218227
if func_name == "count":
219-
reduced = empty_series.count()
228+
reduced = test_series.count()
220229
elif func_name == "nunique":
221-
reduced = empty_series.nunique()
230+
reduced = test_series.nunique()
222231
elif func_name in ("all", "any"):
223-
reduced = getattr(empty_series, func_name)(axis=axis, bool_only=bool_only)
232+
reduced = getattr(test_series, func_name)(axis=axis, bool_only=bool_only)
224233
elif func_name == "size":
225-
reduced = empty_series.size
234+
reduced = test_series.size
226235
elif func_name == "str_concat":
227-
reduced = pd.Series([empty_series.str.cat()])
236+
reduced = pd.Series([test_series.str.cat()])
228237
else:
229-
reduced = getattr(empty_series, func_name)(
238+
reduced = getattr(test_series, func_name)(
230239
axis=axis, skipna=skipna, numeric_only=numeric_only
231240
)
232241
return pd.Series(reduced).dtype
@@ -236,17 +245,17 @@ def _get_series_reduction_dtype(
236245
def _get_df_reduction_dtype(
237246
dtype, func_name, axis=None, bool_only=False, skipna=False, numeric_only=False
238247
):
239-
empty_df = build_series(dtype=dtype, ensure_string=True).to_frame()
248+
test_df = build_series(dtype=dtype, ensure_string=True).to_frame()
240249
if func_name == "count":
241-
reduced = getattr(empty_df, func_name)(axis=axis, numeric_only=numeric_only)
250+
reduced = getattr(test_df, func_name)(axis=axis, numeric_only=numeric_only)
242251
elif func_name == "nunique":
243-
reduced = getattr(empty_df, func_name)(axis=axis)
252+
reduced = getattr(test_df, func_name)(axis=axis)
244253
elif func_name in ("all", "any"):
245-
reduced = getattr(empty_df, func_name)(axis=axis, bool_only=bool_only)
254+
reduced = getattr(test_df, func_name)(axis=axis, bool_only=bool_only)
246255
elif func_name == "str_concat":
247-
reduced = empty_df.apply(lambda s: s.str.cat(), axis=axis)
256+
reduced = test_df.apply(lambda s: s.str.cat(), axis=axis)
248257
else:
249-
reduced = getattr(empty_df, func_name)(
258+
reduced = getattr(test_df, func_name)(
250259
axis=axis, skipna=skipna, numeric_only=numeric_only
251260
)
252261
if len(reduced) == 0:
@@ -304,7 +313,7 @@ def _call_groupby_level(self, df, level):
304313
def _call_dataframe(self, df):
305314
axis = getattr(self, "axis", None) or 0
306315
level = getattr(self, "level", None)
307-
skipna = getattr(self, "skipna", None)
316+
skipna = getattr(self, "skipna", True)
308317
numeric_only = getattr(self, "numeric_only", None)
309318
bool_only = getattr(self, "bool_only", None)
310319
self._axis = axis = validate_axis(axis, df)
@@ -327,9 +336,9 @@ def _call_dataframe(self, df):
327336
reduced_dtype = reduced.dtype
328337
else:
329338
reduced_cols, dtypes = [], []
330-
for col, dt in df.dtypes.items():
339+
for col, src_dt in df.dtypes.items():
331340
dt = _get_df_reduction_dtype(
332-
dt,
341+
src_dt,
333342
func_name,
334343
axis=axis,
335344
bool_only=bool_only,
@@ -339,16 +348,29 @@ def _call_dataframe(self, df):
339348
if dt is not None:
340349
reduced_cols.append(col)
341350
dtypes.append(dt)
351+
elif (
352+
_level_reduction_keep_object
353+
and numeric_only
354+
and level is not None
355+
and func_name in ("min", "max")
356+
and src_dt == np.dtype(object)
357+
): # pragma: no cover
358+
reduced_cols.append(col)
359+
dtypes.append(np.dtype(object))
342360
if len(dtypes) == 0:
343361
reduced_dtype = np.dtype("O")
344362
elif all(dt == dtypes[0] for dt in dtypes):
345363
reduced_dtype = dtypes[0]
346-
elif not all(isinstance(dt, np.dtype) and dt != bool for dt in dtypes):
347-
# todo currently we return mixed dtypes as np.dtype('O').
348-
# handle pandas Dtypes in the future more carefully.
349-
reduced_dtype = np.dtype("O")
350364
else:
351-
reduced_dtype = np.find_common_type(dtypes, [])
365+
has_bool = any(dt == bool for dt in dtypes)
366+
if _reduce_bool_as_object and has_bool:
367+
reduced_dtype = np.dtype("O")
368+
elif not all(isinstance(dt, np.dtype) for dt in dtypes):
369+
# todo currently we return mixed dtypes as np.dtype('O').
370+
# handle pandas Dtypes in the future more carefully.
371+
reduced_dtype = np.dtype("O")
372+
else:
373+
reduced_dtype = np.find_common_type(dtypes, [])
352374

353375
if level is not None:
354376
return self._call_groupby_level(df[reduced_cols], level)
@@ -370,7 +392,7 @@ def _call_dataframe(self, df):
370392
def _call_series(self, series):
371393
level = getattr(self, "level", None)
372394
axis = getattr(self, "axis", None)
373-
skipna = getattr(self, "skipna", None)
395+
skipna = getattr(self, "skipna", True)
374396
numeric_only = getattr(self, "numeric_only", None)
375397
bool_only = getattr(self, "bool_only", None)
376398
self._axis = axis = validate_axis(axis or 0, series)
@@ -442,8 +464,8 @@ def _tile_dataframe(cls, op):
442464
n_rows, n_cols = in_df.chunk_shape
443465

444466
# map to get individual results and summaries
445-
src_chunks = np.empty(in_df.chunk_shape, dtype=np.object)
446-
summary_chunks = np.empty(in_df.chunk_shape, dtype=np.object)
467+
src_chunks = np.empty(in_df.chunk_shape, dtype=object)
468+
summary_chunks = np.empty(in_df.chunk_shape, dtype=object)
447469
for c in in_df.chunks:
448470
new_chunk_op = op.copy().reset_key()
449471
new_chunk_op.stage = OperandStage.map
@@ -457,7 +479,7 @@ def _tile_dataframe(cls, op):
457479
)
458480

459481
# combine summaries into results
460-
output_chunk_array = np.empty(in_df.chunk_shape, dtype=np.object)
482+
output_chunk_array = np.empty(in_df.chunk_shape, dtype=object)
461483
if op.axis == 1:
462484
for row in range(n_rows):
463485
row_src = src_chunks[row, :]
@@ -493,7 +515,7 @@ def _tile_series(cls, op):
493515
series = op.outputs[0]
494516

495517
# map to get individual results and summaries
496-
summary_chunks = np.empty(in_series.chunk_shape, dtype=np.object)
518+
summary_chunks = np.empty(in_series.chunk_shape, dtype=object)
497519
for c in in_series.chunks:
498520
new_chunk_op = op.copy().reset_key()
499521
new_chunk_op.stage = OperandStage.map

mars/dataframe/reduction/kurtosis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def kurt(x):
7575
def kurt_series(
7676
df,
7777
axis=None,
78-
skipna=None,
78+
skipna=True,
7979
level=None,
8080
combine_size=None,
8181
bias=False,
@@ -100,7 +100,7 @@ def kurt_series(
100100
def kurt_dataframe(
101101
df,
102102
axis=None,
103-
skipna=None,
103+
skipna=True,
104104
level=None,
105105
numeric_only=None,
106106
combine_size=None,

mars/dataframe/reduction/max.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def is_atomic(self):
2727
return True
2828

2929

30-
def max_series(df, axis=None, skipna=None, level=None, combine_size=None, method=None):
30+
def max_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None):
3131
use_inf_as_na = options.dataframe.mode.use_inf_as_na
3232
op = DataFrameMax(
3333
axis=axis,
@@ -44,7 +44,7 @@ def max_series(df, axis=None, skipna=None, level=None, combine_size=None, method
4444
def max_dataframe(
4545
df,
4646
axis=None,
47-
skipna=None,
47+
skipna=True,
4848
level=None,
4949
numeric_only=None,
5050
combine_size=None,

mars/dataframe/reduction/mean.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def mean(x):
3232
return mean
3333

3434

35-
def mean_series(df, axis=None, skipna=None, level=None, combine_size=None, method=None):
35+
def mean_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None):
3636
use_inf_as_na = options.dataframe.mode.use_inf_as_na
3737
op = DataFrameMean(
3838
axis=axis,
@@ -49,7 +49,7 @@ def mean_series(df, axis=None, skipna=None, level=None, combine_size=None, metho
4949
def mean_dataframe(
5050
df,
5151
axis=None,
52-
skipna=None,
52+
skipna=True,
5353
level=None,
5454
numeric_only=None,
5555
combine_size=None,

mars/dataframe/reduction/min.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def is_atomic(self):
2727
return True
2828

2929

30-
def min_series(df, axis=None, skipna=None, level=None, combine_size=None, method=None):
30+
def min_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None):
3131
use_inf_as_na = options.dataframe.mode.use_inf_as_na
3232
op = DataFrameMin(
3333
axis=axis,
@@ -44,7 +44,7 @@ def min_series(df, axis=None, skipna=None, level=None, combine_size=None, method
4444
def min_dataframe(
4545
df,
4646
axis=None,
47-
skipna=None,
47+
skipna=True,
4848
level=None,
4949
numeric_only=None,
5050
combine_size=None,

mars/dataframe/reduction/prod.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def prod(value):
4545

4646

4747
def prod_series(
48-
df, axis=None, skipna=None, level=None, min_count=0, combine_size=None, method=None
48+
df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None
4949
):
5050
use_inf_as_na = options.dataframe.mode.use_inf_as_na
5151
op = DataFrameProd(
@@ -64,7 +64,7 @@ def prod_series(
6464
def prod_dataframe(
6565
df,
6666
axis=None,
67-
skipna=None,
67+
skipna=True,
6868
level=None,
6969
min_count=0,
7070
numeric_only=None,

mars/dataframe/reduction/sem.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def sem(x):
4545

4646

4747
def sem_series(
48-
series, axis=None, skipna=None, level=None, ddof=1, combine_size=None, method=None
48+
series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None
4949
):
5050
use_inf_as_na = options.dataframe.mode.use_inf_as_na
5151
op = DataFrameSem(
@@ -64,7 +64,7 @@ def sem_series(
6464
def sem_dataframe(
6565
df,
6666
axis=None,
67-
skipna=None,
67+
skipna=True,
6868
level=None,
6969
ddof=1,
7070
numeric_only=None,

0 commit comments

Comments
 (0)