@@ -185,14 +185,6 @@ def _tile_compressed(cls, op):
                                        columns_value=df.columns_value,
                                        chunks=[new_chunk], nsplits=nsplits)
 
-    @classmethod
-    def _validate_dtypes(cls, dtypes, is_gpu):
-        dtypes = dtypes.to_dict()
-        # CuDF doesn't support object type, turn it to 'str'.
-        if is_gpu:
-            dtypes = dict((n, dt.name if dt != np.dtype('object') else 'str') for n, dt in dtypes.items())
-        return dtypes
-
     @classmethod
     def tile(cls, op):
         if op.compression:
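For context, the _validate_dtypes helper deleted above existed because cuDF has no 'object' dtype. A minimal standalone sketch of what it did (plain NumPy/pandas here; cuDF itself is not required to run it):

import numpy as np
import pandas as pd

def validate_dtypes(dtypes, is_gpu):
    # replicate the removed helper: map object columns to 'str' on GPU
    dtypes = dtypes.to_dict()
    if is_gpu:
        dtypes = {n: (dt.name if dt != np.dtype('object') else 'str')
                  for n, dt in dtypes.items()}
    return dtypes

dtypes = pd.Series({'a': np.dtype('int64'), 'b': np.dtype('object')})
print(validate_dtypes(dtypes, is_gpu=True))  # {'a': 'int64', 'b': 'str'}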
@@ -270,8 +262,9 @@ def _pandas_read_csv(cls, f, op):
             # will replace null value with np.nan,
             # which will cause failure when converting to arrow string array
             csv_kwargs['keep_default_na'] = False
-        df = pd.read_csv(b, sep=op.sep, names=op.names, index_col=op.index_col, usecols=usecols,
-                         dtype=dtypes.to_dict(), nrows=op.nrows, **csv_kwargs)
+            csv_kwargs['dtype'] = cls._select_arrow_dtype(dtypes)
+        df = pd.read_csv(b, sep=op.sep, names=op.names, index_col=op.index_col,
+                         usecols=usecols, nrows=op.nrows, **csv_kwargs)
         if op.keep_usecols_order:
             df = df[op.usecols]
         return df
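The keep_default_na comment above is the motivation for this hunk: with pandas' default keep_default_na=True, empty fields come back as np.nan, which the later conversion to an arrow string array cannot handle. A small sketch with plain pandas showing the difference (mars' ArrowStringDtype is not needed to see the effect):

import io
import pandas as pd

data = "a,b\n1,\n2,x\n"  # column b has an empty field

with_na = pd.read_csv(io.StringIO(data))
no_na = pd.read_csv(io.StringIO(data), keep_default_na=False)

print(with_na['b'].tolist())  # [nan, 'x'] -- float NaN sneaks in
print(no_na['b'].tolist())    # ['', 'x'] -- stays a string column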
@@ -287,8 +280,7 @@ def _cudf_read_csv(cls, op):  # pragma: no cover
             df = cudf.read_csv(op.path, byte_range=(op.offset, op.size), sep=op.sep, usecols=usecols, **csv_kwargs)
         else:
             df = cudf.read_csv(op.path, byte_range=(op.offset, op.size), sep=op.sep, names=op.names,
-                               usecols=usecols, dtype=cls._validate_dtypes(op.outputs[0].dtypes, op.gpu),
-                               nrows=op.nrows, **csv_kwargs)
+                               usecols=usecols, nrows=op.nrows, **csv_kwargs)
 
         if op.keep_usecols_order:
             df = df[op.usecols]
@@ -298,6 +290,11 @@ def _cudf_read_csv(cls, op):  # pragma: no cover
     def _contains_arrow_dtype(cls, dtypes):
         return any(isinstance(dtype, ArrowStringDtype) for dtype in dtypes)
 
+    @classmethod
+    def _select_arrow_dtype(cls, dtypes):
+        return dict((c, dtype) for c, dtype in dtypes.items() if
+                    isinstance(dtype, ArrowStringDtype))
+
     @classmethod
     def execute(cls, ctx, op):
         xdf = cudf if op.gpu else pd
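The new _select_arrow_dtype keeps only the arrow-string columns in the dtype mapping handed to read_csv, so every other column keeps whatever dtype the reader infers. A sketch of the same idea using pandas' own pyarrow-backed StringDtype as a stand-in for mars' ArrowStringDtype (assumes pandas >= 1.3 with pyarrow installed):

import pandas as pd

def select_arrow_dtype(dtypes, arrow_type=pd.StringDtype):
    # keep only columns declared as arrow strings; leave the rest to inference
    return {c: dt for c, dt in dtypes.items() if isinstance(dt, arrow_type)}

dtypes = pd.Series({'name': pd.StringDtype(storage='pyarrow'),
                    'count': pd.api.types.pandas_dtype('int64')})
print(select_arrow_dtype(dtypes))  # {'name': string[pyarrow]}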
@@ -308,15 +305,15 @@ def execute(cls, ctx, op):
             if op.compression is not None:
                 # As we specify names and dtype, we need to skip header rows
                 csv_kwargs['skiprows'] = 1 if op.header == 'infer' else op.header
-            dtypes = cls._validate_dtypes(op.outputs[0].dtypes, op.gpu)
-            if contain_arrow_dtype(dtypes.values()):
+            dtypes = op.outputs[0].dtypes
+            if contain_arrow_dtype(dtypes):
                 # when keep_default_na is True which is default,
                 # will replace null value with np.nan,
                 # which will cause failure when converting to arrow string array
                 csv_kwargs['keep_default_na'] = False
+                csv_kwargs['dtype'] = cls._select_arrow_dtype(dtypes)
             df = xdf.read_csv(f, sep=op.sep, names=op.names, index_col=op.index_col,
-                              usecols=op.usecols, dtype=dtypes,
-                              nrows=op.nrows, **csv_kwargs)
+                              usecols=op.usecols, nrows=op.nrows, **csv_kwargs)
             if op.keep_usecols_order:
                 df = df[op.usecols]
         else: