Skip to content

Commit 2732b1b

Browse files
wjsi and hekaisheng authored
[BACKPORT] Fixes md.read_csv when dtypes is not inferred correctly (#1606) (#1617)
Co-authored-by: He Kaisheng <[email protected]>
1 parent 4b2484f commit 2732b1b

File tree

5 files changed

+138
-51
lines changed

5 files changed

+138
-51
lines changed

mars/dataframe/datasource/read_csv.py

+13-16
Original file line numberDiff line numberDiff line change
@@ -185,14 +185,6 @@ def _tile_compressed(cls, op):
185185
columns_value=df.columns_value,
186186
chunks=[new_chunk], nsplits=nsplits)
187187

188-
@classmethod
189-
def _validate_dtypes(cls, dtypes, is_gpu):
190-
dtypes = dtypes.to_dict()
191-
# CuDF doesn't support object type, turn it to 'str'.
192-
if is_gpu:
193-
dtypes = dict((n, dt.name if dt != np.dtype('object') else 'str') for n, dt in dtypes.items())
194-
return dtypes
195-
196188
@classmethod
197189
def tile(cls, op):
198190
if op.compression:
@@ -270,8 +262,9 @@ def _pandas_read_csv(cls, f, op):
270262
# will replace null value with np.nan,
271263
# which will cause failure when converting to arrow string array
272264
csv_kwargs['keep_default_na'] = False
273-
df = pd.read_csv(b, sep=op.sep, names=op.names, index_col=op.index_col, usecols=usecols,
274-
dtype=dtypes.to_dict(), nrows=op.nrows, **csv_kwargs)
265+
csv_kwargs['dtype'] = cls._select_arrow_dtype(dtypes)
266+
df = pd.read_csv(b, sep=op.sep, names=op.names, index_col=op.index_col,
267+
usecols=usecols, nrows=op.nrows, **csv_kwargs)
275268
if op.keep_usecols_order:
276269
df = df[op.usecols]
277270
return df
@@ -287,8 +280,7 @@ def _cudf_read_csv(cls, op): # pragma: no cover
287280
df = cudf.read_csv(op.path, byte_range=(op.offset, op.size), sep=op.sep, usecols=usecols, **csv_kwargs)
288281
else:
289282
df = cudf.read_csv(op.path, byte_range=(op.offset, op.size), sep=op.sep, names=op.names,
290-
usecols=usecols, dtype=cls._validate_dtypes(op.outputs[0].dtypes, op.gpu),
291-
nrows=op.nrows, **csv_kwargs)
283+
usecols=usecols, nrows=op.nrows, **csv_kwargs)
292284

293285
if op.keep_usecols_order:
294286
df = df[op.usecols]
@@ -298,6 +290,11 @@ def _cudf_read_csv(cls, op): # pragma: no cover
298290
def _contains_arrow_dtype(cls, dtypes):
299291
return any(isinstance(dtype, ArrowStringDtype) for dtype in dtypes)
300292

293+
@classmethod
294+
def _select_arrow_dtype(cls, dtypes):
295+
return dict((c, dtype) for c, dtype in dtypes.items() if
296+
isinstance(dtype, ArrowStringDtype))
297+
301298
@classmethod
302299
def execute(cls, ctx, op):
303300
xdf = cudf if op.gpu else pd
@@ -308,15 +305,15 @@ def execute(cls, ctx, op):
308305
if op.compression is not None:
309306
# As we specify names and dtype, we need to skip header rows
310307
csv_kwargs['skiprows'] = 1 if op.header == 'infer' else op.header
311-
dtypes = cls._validate_dtypes(op.outputs[0].dtypes, op.gpu)
312-
if contain_arrow_dtype(dtypes.values()):
308+
dtypes = op.outputs[0].dtypes
309+
if contain_arrow_dtype(dtypes):
313310
# when keep_default_na is True which is default,
314311
# will replace null value with np.nan,
315312
# which will cause failure when converting to arrow string array
316313
csv_kwargs['keep_default_na'] = False
314+
csv_kwargs['dtype'] = cls._select_arrow_dtype(dtypes)
317315
df = xdf.read_csv(f, sep=op.sep, names=op.names, index_col=op.index_col,
318-
usecols=op.usecols, dtype=dtypes,
319-
nrows=op.nrows, **csv_kwargs)
316+
usecols=op.usecols, nrows=op.nrows, **csv_kwargs)
320317
if op.keep_usecols_order:
321318
df = df[op.usecols]
322319
else:

mars/dataframe/datasource/tests/test_datasource_execution.py

+21
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,27 @@ def testReadCSVExecution(self):
374374
concat=True)[0]
375375
pd.testing.assert_frame_equal(pdf, mdf2)
376376

377+
# test nan
378+
with tempfile.TemporaryDirectory() as tempdir:
379+
file_path = os.path.join(tempdir, 'test.csv')
380+
381+
df = pd.DataFrame({
382+
'col1': np.random.rand(100, ),
383+
'col2': np.random.choice(['a', 'b', 'c'], (100,)),
384+
'col3': np.arange(100)
385+
})
386+
df.iloc[20:, :] = pd.NA
387+
df.to_csv(file_path)
388+
389+
pdf = pd.read_csv(file_path, index_col=0)
390+
mdf = md.read_csv(file_path, index_col=0, head_lines=10, chunk_bytes=200)
391+
result = self.executor.execute_dataframe(mdf, concat=True)[0]
392+
pd.testing.assert_frame_equal(pdf, result)
393+
394+
# dtypes is inferred as expected
395+
pd.testing.assert_series_equal(mdf.dtypes, pd.Series(['float64', 'object', 'int64'],
396+
index=df.columns))
397+
377398
# test compression
378399
with tempfile.TemporaryDirectory() as tempdir:
379400
file_path = os.path.join(tempdir, 'test.gzip')

mars/dataframe/groupby/aggregation.py

+16-6
Original file line numberDiff line numberDiff line change
@@ -645,12 +645,22 @@ def _check_if_func_available(func):
645645
def agg(groupby, func, method='auto', *args, **kwargs):
646646
"""
647647
Aggregate using one or more operations on grouped data.
648-
:param groupby: Groupby data.
649-
:param func: Aggregation functions.
650-
:param method: 'shuffle' or 'tree', 'tree' method provide a better performance, 'shuffle' is recommended
651-
if aggregated result is very large, 'auto' will use 'shuffle' method in distributed mode and use 'tree'
652-
in local mode.
653-
:return: Aggregated result.
648+
649+
Parameters
650+
----------
651+
groupby : Mars Groupby
652+
Groupby data.
653+
func : str or list-like
654+
Aggregation functions.
655+
method : {'auto', 'shuffle', 'tree'}, default 'auto'
656+
'tree' method provide a better performance, 'shuffle' is recommended
657+
if aggregated result is very large, 'auto' will use 'shuffle' method
658+
in distributed mode and use 'tree' in local mode.
659+
660+
Returns
661+
-------
662+
Series or DataFrame
663+
Aggregated result.
654664
"""
655665

656666
# When perform a computation on the grouped data, we won't shuffle

mars/dataframe/sort/sort_values.py

+45-17
Original file line numberDiff line numberDiff line change
@@ -112,31 +112,48 @@ def dataframe_sort_values(df, by, axis=0, ascending=True, inplace=False, kind='q
112112
na_position='last', ignore_index=False, parallel_kind='PSRS', psrs_kinds=None):
113113
"""
114114
Sort by the values along either axis.
115-
:param df: input DataFrame.
116-
:param by: Name or list of names to sort by.
117-
:param axis: Axis to be sorted.
118-
:param ascending: Sort ascending vs. descending. Specify list for multiple sort orders.
119-
If this is a list of bools, must match the length of the by.
120-
:param inplace: If True, perform operation in-place.
121-
:param kind: Choice of sorting algorithm. See also ndarray.np.sort for more information.
122-
mergesort is the only stable algorithm. For DataFrames, this option is only applied
123-
when sorting on a single column or label.
124-
:param na_position: Puts NaNs at the beginning if first; last puts NaNs at the end.
125-
:param ignore_index: If True, the resulting axis will be labeled 0, 1, …, n - 1.
126-
:param parallel_kind: {'PSRS'}, optional. Parallel sorting algorithm, for the details, refer to:
127-
http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html
128-
:param psrs_kinds: Sorting algorithms during PSRS algorithm.
129-
:return: sorted dataframe.
115+
116+
Parameters
117+
----------
118+
df : Mars DataFrame
119+
Input dataframe.
120+
by : str
121+
Name or list of names to sort by.
122+
axis : %(axes_single_arg)s, default 0
123+
Axis to be sorted.
124+
ascending : bool or list of bool, default True
125+
Sort ascending vs. descending. Specify list for multiple sort
126+
orders. If this is a list of bools, must match the length of
127+
the by.
128+
inplace : bool, default False
129+
If True, perform operation in-place.
130+
kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
131+
Choice of sorting algorithm. See also ndarray.np.sort for more
132+
information. `mergesort` is the only stable algorithm. For
133+
DataFrames, this option is only applied when sorting on a single
134+
column or label.
135+
na_position : {'first', 'last'}, default 'last'
136+
Puts NaNs at the beginning if `first`; `last` puts NaNs at the
137+
end.
138+
ignore_index : bool, default False
139+
If True, the resulting axis will be labeled 0, 1, …, n - 1.
140+
parallel_kind : {'PSRS'}, default 'PSRS'
141+
Parallel sorting algorithm, for the details, refer to:
142+
http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html
143+
144+
Returns
145+
-------
146+
sorted_obj : DataFrame or None
147+
DataFrame with sorted values if inplace=False, None otherwise.
130148
131149
Examples
132150
--------
133151
>>> import mars.dataframe as md
134-
>>> raw = pd.DataFrame({
152+
>>> df = md.DataFrame({
135153
... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
136154
... 'col2': [2, 1, 9, 8, 7, 4],
137155
... 'col3': [0, 1, 9, 4, 2, 3],
138156
... })
139-
>>> df = md.DataFrame(raw)
140157
>>> df.execute()
141158
col1 col2 col3
142159
0 A 2 0
@@ -179,7 +196,18 @@ def dataframe_sort_values(df, by, axis=0, ascending=True, inplace=False, kind='q
179196
1 A 1 1
180197
3 NaN 8 4
181198
199+
Putting NAs first
200+
201+
>>> df.sort_values(by='col1', ascending=False, na_position='first').execute()
202+
col1 col2 col3
203+
3 NaN 8 4
204+
4 D 7 2
205+
5 C 4 3
206+
2 B 9 9
207+
0 A 2 0
208+
1 A 1 1
182209
"""
210+
183211
if na_position not in ['last', 'first']: # pragma: no cover
184212
raise TypeError(f'invalid na_position: {na_position}')
185213
axis = validate_axis(axis, df)

mars/tensor/einsum/core.py

+43-12
Original file line numberDiff line numberDiff line change
@@ -167,18 +167,48 @@ def einsum(subscripts, *operands, dtype=None, order='K', casting='safe', optimiz
167167
168168
See the notes and examples for clarification.
169169
170-
:param subscripts: Specifies the subscripts for summation as comma separated list of subscript labels.
171-
An implicit (classical Einstein summation) calculation is performed unless the explicit indicator ‘->’ is
172-
included as well as subscript labels of the precise output form.
173-
:param operands: These are the arrays for the operation.
174-
:param dtype: If provided, forces the calculation to use the data type specified.
175-
Note that you may have to also give a more liberal casting parameter to allow the conversions.
176-
Default is None.
177-
:param order: Controls the memory layout of the output.
178-
:param casting: Controls what kind of data casting may occur. Setting this to ‘unsafe’ is not recommended,
179-
as it can adversely affect accumulations.
180-
:param optimize: Controls if intermediate optimization should occur.
181-
:return: The calculation based on the Einstein summation convention.
170+
Parameters
171+
----------
172+
subscripts : str
173+
Specifies the subscripts for summation as comma separated list of
174+
subscript labels. An implicit (classical Einstein summation)
175+
calculation is performed unless the explicit indicator '->' is
176+
included as well as subscript labels of the precise output form.
177+
operands : list of array_like
178+
These are the arrays for the operation.
179+
dtype : {data-type, None}, optional
180+
If provided, forces the calculation to use the data type specified.
181+
Note that you may have to also give a more liberal `casting`
182+
parameter to allow the conversions. Default is None.
183+
order : {'C', 'F', 'A', 'K'}, optional
184+
Controls the memory layout of the output. 'C' means it should
185+
be C contiguous. 'F' means it should be Fortran contiguous,
186+
'A' means it should be 'F' if the inputs are all 'F', 'C' otherwise.
187+
'K' means it should be as close to the layout as the inputs as
188+
is possible, including arbitrarily permuted axes.
189+
Default is 'K'.
190+
casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
191+
Controls what kind of data casting may occur. Setting this to
192+
'unsafe' is not recommended, as it can adversely affect accumulations.
193+
194+
* 'no' means the data types should not be cast at all.
195+
* 'equiv' means only byte-order changes are allowed.
196+
* 'safe' means only casts which can preserve values are allowed.
197+
* 'same_kind' means only safe casts or casts within a kind,
198+
like float64 to float32, are allowed.
199+
* 'unsafe' means any data conversions may be done.
200+
201+
Default is 'safe'.
202+
optimize : {False, True, 'greedy', 'optimal'}, optional
203+
Controls if intermediate optimization should occur. No optimization
204+
will occur if False and True will default to the 'greedy' algorithm.
205+
Also accepts an explicit contraction list from the ``np.einsum_path``
206+
function. See ``np.einsum_path`` for more details. Defaults to False.
207+
208+
Returns
209+
-------
210+
output : Mars.tensor
211+
The calculation based on the Einstein summation convention.
182212
183213
The Einstein summation convention can be used to compute
184214
many multi-dimensional, linear algebraic array operations. `einsum`
@@ -393,6 +423,7 @@ def einsum(subscripts, *operands, dtype=None, order='K', casting='safe', optimiz
393423
... _ = mt.einsum('ijk,ilm,njm,nlk,abc->',a,a,a,a,a, optimize=path)
394424
395425
"""
426+
396427
all_inputs = [subscripts] + list(operands)
397428
inputs, outputs, operands = parse_einsum_input(all_inputs)
398429
subscripts = "->".join((inputs, outputs))

0 commit comments

Comments (0)