feat: add DataFrame.to_pandas_batches() to download large DataFrame objects

tswast · tswast · commit 359a90cc7f5a · 2023-10-26T16:21:00.000Z
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -412,6 +412,27 @@ def to_pandas(
         )
         return df, query_job
 
+    def to_pandas_batches(self):
+        """Download results one message at a time."""
+        dtypes = dict(zip(self.index_columns, self.index_dtypes))
+        dtypes.update(zip(self.value_columns, self.dtypes))
+        results_iterator, _ = self._expr.start_query()
+        for arrow_table in results_iterator.to_arrow_iterable(
+            bqstorage_client=self._expr._session.bqstoragereadclient
+        ):
+            df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)
+            self._copy_index_to_pandas(df)
+            yield df
+
+    def _copy_index_to_pandas(self, df: pd.DataFrame):
+        """Set the index on pandas DataFrame to match this block.
+
+        Warning: This method modifies ``df`` inplace.
+        """
+        if self.index_columns:
+            df.set_index(list(self.index_columns), inplace=True)
+            df.index.names = self.index.names  # type: ignore
+
     def _compute_and_count(
         self,
         value_keys: Optional[Iterable[str]] = None,
@@ -485,10 +506,7 @@ def _compute_and_count(
         else:
             total_rows = results_iterator.total_rows
             df = self._to_dataframe(results_iterator)
-
-            if self.index_columns:
-                df.set_index(list(self.index_columns), inplace=True)
-                df.index.names = self.index.names  # type: ignore
+            self._copy_index_to_pandas(df)
 
         return df, total_rows, query_job
 
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -893,6 +893,10 @@ def to_pandas(
         self._set_internal_query_job(query_job)
         return df.set_axis(self._block.column_labels, axis=1, copy=False)
 
+    def to_pandas_batches(self) -> Iterable[pandas.DataFrame]:
+        """Stream DataFrame results as an iterable of pandas DataFrames."""
+        return self._block.to_pandas_batches()
+
     def _compute_dry_run(self) -> bigquery.QueryJob:
         return self._block._compute_dry_run()
 
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
@@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session):
     )
 
 
+def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
+    """Verify the to_pandas_batches() API returns the expected dtypes."""
+    expected = scalars_df_default_index.dtypes
+    for df in scalars_df_default_index.to_pandas_batches():
+        actual = df.dtypes
+        pd.testing.assert_series_equal(actual, expected)
+
+
 @pytest.mark.parametrize(
     ("index"),
     [True, False],
diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py
@@ -131,6 +131,8 @@
             {
                 "date": pandas.ArrowDtype(pyarrow.date32()),
                 "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")),
+                "float": pandas.Float64Dtype(),
+                "int": pandas.Int64Dtype(),
                 "string": "string[pyarrow]",
                 "time": pandas.ArrowDtype(pyarrow.time64("us")),
                 "timestamp": pandas.ArrowDtype(
@@ -157,6 +159,19 @@
                         ],
                         dtype=pandas.ArrowDtype(pyarrow.timestamp("us")),
                     ),
+                    "float": pandas.Series(
+                        pandas.arrays.FloatingArray(
+                            numpy.array(
+                                [1.0, float("nan"), float("nan"), -1.0], dtype="float64"
+                            ),
+                            numpy.array([False, True, False, False], dtype="bool"),
+                        ),
+                        dtype=pandas.Float64Dtype(),
+                    ),
+                    "int": pandas.Series(
+                        [1, None, -1, 2**63 - 1],
+                        dtype=pandas.Int64Dtype(),
+                    ),
                     "string": pandas.Series(
                         ["123", None, "abc", "xyz"], dtype="string[pyarrow]"
                     ),