     recursive_tile,
 )
 from ...core.operand import OperandStage
-from ...utils import tokenize
+from ...lib.version import parse as parse_version
 from ...serialization.serializables import (
     BoolField,
     AnyField,
     DataTypeField,
     Int32Field,
     StringField,
 )
+from ...utils import tokenize
 from ..core import SERIES_TYPE
 from ..utils import (
     parse_index,
 )
 from ..operands import DataFrameOperandMixin, DataFrameOperand, DATAFRAME_TYPE
 
+_pd_release = parse_version(pd.__version__).release[:2]
+# in pandas<1.3, when aggregating with multiple levels and numeric_only is True,
+# object cols not ignored with min-max funcs
+_level_reduction_keep_object = _pd_release < (1, 3)
+# in pandas>=1.3, when dataframes are reduced into series, mixture of float and bool
+# results in object.
+_reduce_bool_as_object = _pd_release >= (1, 3)
+
 
 class DataFrameReductionOperand(DataFrameOperand):
     _axis = AnyField("axis")
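The version gate added above reduces to comparing the first two components of the installed pandas release. A minimal sketch, assuming the project-internal `...lib.version.parse` behaves like `packaging.version.parse` (which exposes the same `release` tuple):

```python
# Sketch of the version gate; packaging.version stands in for the
# project-internal ...lib.version wrapper (an assumption).
import pandas as pd
from packaging.version import parse as parse_version

_pd_release = parse_version(pd.__version__).release[:2]  # e.g. (1, 3) or (2, 1)
_level_reduction_keep_object = _pd_release < (1, 3)
_reduce_bool_as_object = _pd_release >= (1, 3)
print(_pd_release, _level_reduction_keep_object, _reduce_bool_as_object)
```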
@@ -211,22 +220,22 @@ def _get_series_reduction_dtype(
     func_name,
     axis=None,
     bool_only=False,
-    skipna=False,
+    skipna=True,
     numeric_only=False,
 ):
-    empty_series = build_series(dtype=dtype, ensure_string=True)
+    test_series = build_series(dtype=dtype, ensure_string=True)
     if func_name == "count":
-        reduced = empty_series.count()
+        reduced = test_series.count()
     elif func_name == "nunique":
-        reduced = empty_series.nunique()
+        reduced = test_series.nunique()
     elif func_name in ("all", "any"):
-        reduced = getattr(empty_series, func_name)(axis=axis, bool_only=bool_only)
+        reduced = getattr(test_series, func_name)(axis=axis, bool_only=bool_only)
     elif func_name == "size":
-        reduced = empty_series.size
+        reduced = test_series.size
     elif func_name == "str_concat":
-        reduced = pd.Series([empty_series.str.cat()])
+        reduced = pd.Series([test_series.str.cat()])
     else:
-        reduced = getattr(empty_series, func_name)(
+        reduced = getattr(test_series, func_name)(
             axis=axis, skipna=skipna, numeric_only=numeric_only
         )
     return pd.Series(reduced).dtype
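The rename to `test_series` reflects what the function actually does: it infers the output dtype by running the reduction on a small probe series built from the input dtype and reading the dtype off the wrapped result. The same trick in plain pandas, with a literal series standing in for the project-internal `build_series` helper:

```python
# Probe-based dtype inference in plain pandas; a literal series stands in
# for the build_series() helper used in the diff.
import pandas as pd

probe = pd.Series([1, 2], dtype="int64")
print(pd.Series(probe.sum()).dtype)    # int64
print(pd.Series(probe.mean()).dtype)   # float64
print(pd.Series(probe.count()).dtype)  # int64
```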
@@ -236,17 +245,17 @@ def _get_series_reduction_dtype(
 def _get_df_reduction_dtype(
     dtype, func_name, axis=None, bool_only=False, skipna=False, numeric_only=False
 ):
-    empty_df = build_series(dtype=dtype, ensure_string=True).to_frame()
+    test_df = build_series(dtype=dtype, ensure_string=True).to_frame()
     if func_name == "count":
-        reduced = getattr(empty_df, func_name)(axis=axis, numeric_only=numeric_only)
+        reduced = getattr(test_df, func_name)(axis=axis, numeric_only=numeric_only)
     elif func_name == "nunique":
-        reduced = getattr(empty_df, func_name)(axis=axis)
+        reduced = getattr(test_df, func_name)(axis=axis)
     elif func_name in ("all", "any"):
-        reduced = getattr(empty_df, func_name)(axis=axis, bool_only=bool_only)
+        reduced = getattr(test_df, func_name)(axis=axis, bool_only=bool_only)
     elif func_name == "str_concat":
-        reduced = empty_df.apply(lambda s: s.str.cat(), axis=axis)
+        reduced = test_df.apply(lambda s: s.str.cat(), axis=axis)
     else:
-        reduced = getattr(empty_df, func_name)(
+        reduced = getattr(test_df, func_name)(
             axis=axis, skipna=skipna, numeric_only=numeric_only
         )
     if len(reduced) == 0:
@@ -304,7 +313,7 @@ def _call_groupby_level(self, df, level):
     def _call_dataframe(self, df):
         axis = getattr(self, "axis", None) or 0
         level = getattr(self, "level", None)
-        skipna = getattr(self, "skipna", None)
+        skipna = getattr(self, "skipna", True)
         numeric_only = getattr(self, "numeric_only", None)
         bool_only = getattr(self, "bool_only", None)
         self._axis = axis = validate_axis(axis, df)
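Falling back to `True` instead of `None` matches pandas' own default for `skipna`; the difference only shows up when the data contains missing values:

```python
# pandas reductions skip NaN by default (skipna=True); skipna must be
# passed explicitly to change that.
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
print(s.sum())               # 1.0
print(s.sum(skipna=False))   # nan
```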
@@ -327,9 +336,9 @@ def _call_dataframe(self, df):
             reduced_dtype = reduced.dtype
         else:
             reduced_cols, dtypes = [], []
-            for col, dt in df.dtypes.items():
+            for col, src_dt in df.dtypes.items():
                 dt = _get_df_reduction_dtype(
-                    dt,
+                    src_dt,
                     func_name,
                     axis=axis,
                     bool_only=bool_only,
@@ -339,16 +348,29 @@ def _call_dataframe(self, df):
                 if dt is not None:
                     reduced_cols.append(col)
                     dtypes.append(dt)
+                elif (
+                    _level_reduction_keep_object
+                    and numeric_only
+                    and level is not None
+                    and func_name in ("min", "max")
+                    and src_dt == np.dtype(object)
+                ):  # pragma: no cover
+                    reduced_cols.append(col)
+                    dtypes.append(np.dtype(object))
             if len(dtypes) == 0:
                 reduced_dtype = np.dtype("O")
             elif all(dt == dtypes[0] for dt in dtypes):
                 reduced_dtype = dtypes[0]
-            elif not all(isinstance(dt, np.dtype) and dt != bool for dt in dtypes):
-                # todo currently we return mixed dtypes as np.dtype('O').
-                # handle pandas Dtypes in the future more carefully.
-                reduced_dtype = np.dtype("O")
             else:
-                reduced_dtype = np.find_common_type(dtypes, [])
+                has_bool = any(dt == bool for dt in dtypes)
+                if _reduce_bool_as_object and has_bool:
+                    reduced_dtype = np.dtype("O")
+                elif not all(isinstance(dt, np.dtype) for dt in dtypes):
+                    # todo currently we return mixed dtypes as np.dtype('O').
+                    # handle pandas Dtypes in the future more carefully.
+                    reduced_dtype = np.dtype("O")
+                else:
+                    reduced_dtype = np.find_common_type(dtypes, [])
 
         if level is not None:
             return self._call_groupby_level(df[reduced_cols], level)
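The new branch is easiest to read in isolation: given the per-column reduced dtypes, pick the dtype of the resulting series, sending any mixture that contains bool to object when the pandas>=1.3 flag is set. A self-contained sketch, with the version flag passed in as a plain argument (an illustration-only simplification):

```python
# Stand-alone version of the dtype-combination branch above; the pandas
# version flag becomes an explicit parameter here for illustration.
import numpy as np

def combine_reduced_dtypes(dtypes, reduce_bool_as_object=True):
    if len(dtypes) == 0:
        return np.dtype("O")
    if all(dt == dtypes[0] for dt in dtypes):
        return dtypes[0]
    if reduce_bool_as_object and any(dt == bool for dt in dtypes):
        return np.dtype("O")
    if not all(isinstance(dt, np.dtype) for dt in dtypes):
        # extension dtypes are not combined yet; fall back to object
        return np.dtype("O")
    return np.find_common_type(dtypes, [])

print(combine_reduced_dtypes([np.dtype(float), np.dtype(bool)]))         # object
print(combine_reduced_dtypes([np.dtype("int64"), np.dtype("float64")]))  # float64
```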
@@ -370,7 +392,7 @@ def _call_dataframe(self, df):
     def _call_series(self, series):
         level = getattr(self, "level", None)
         axis = getattr(self, "axis", None)
-        skipna = getattr(self, "skipna", None)
+        skipna = getattr(self, "skipna", True)
         numeric_only = getattr(self, "numeric_only", None)
         bool_only = getattr(self, "bool_only", None)
         self._axis = axis = validate_axis(axis or 0, series)
@@ -442,8 +464,8 @@ def _tile_dataframe(cls, op):
         n_rows, n_cols = in_df.chunk_shape
 
         # map to get individual results and summaries
-        src_chunks = np.empty(in_df.chunk_shape, dtype=np.object)
-        summary_chunks = np.empty(in_df.chunk_shape, dtype=np.object)
+        src_chunks = np.empty(in_df.chunk_shape, dtype=object)
+        summary_chunks = np.empty(in_df.chunk_shape, dtype=object)
         for c in in_df.chunks:
             new_chunk_op = op.copy().reset_key()
             new_chunk_op.stage = OperandStage.map
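`np.object` is a deprecated alias for the builtin `object` (deprecated in NumPy 1.20 and removed in later releases), so the chunk grids are now allocated with `dtype=object` directly. The resulting arrays simply hold arbitrary Python objects:

```python
# An object-dtype ndarray used as a 2-D grid of arbitrary Python objects,
# as the chunk arrays above are.
import numpy as np

chunk_grid = np.empty((2, 3), dtype=object)
chunk_grid[0, 0] = {"any": "python object"}
print(chunk_grid.shape, chunk_grid.dtype)  # (2, 3) object
```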
@@ -457,7 +479,7 @@ def _tile_dataframe(cls, op):
             )
 
         # combine summaries into results
-        output_chunk_array = np.empty(in_df.chunk_shape, dtype=np.object)
+        output_chunk_array = np.empty(in_df.chunk_shape, dtype=object)
         if op.axis == 1:
             for row in range(n_rows):
                 row_src = src_chunks[row, :]
@@ -493,7 +515,7 @@ def _tile_series(cls, op):
         series = op.outputs[0]
 
         # map to get individual results and summaries
-        summary_chunks = np.empty(in_series.chunk_shape, dtype=np.object)
+        summary_chunks = np.empty(in_series.chunk_shape, dtype=object)
         for c in in_series.chunks:
             new_chunk_op = op.copy().reset_key()
             new_chunk_op.stage = OperandStage.map