Skip to content

Commit 68f50ef

Browse files
committed
feat: add DataFrame.to_pandas_batches() to download large DataFrame objects
1 parent 6e28da3 commit 68f50ef

File tree

5 files changed

+213
-8
lines changed

5 files changed

+213
-8
lines changed

bigframes/core/blocks.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,27 @@ def to_pandas(
412412
)
413413
return df, query_job
414414

415+
def to_pandas_batches(self):
    """Stream the block's query results as a series of pandas DataFrames.

    Yields one DataFrame per Arrow message downloaded from the query
    results, with the block's dtypes applied and index columns restored.
    """
    # Map every column name (index columns first, then value columns)
    # to the dtype the resulting pandas objects should carry.
    column_dtypes = {
        **dict(zip(self.index_columns, self.index_dtypes)),
        **dict(zip(self.value_columns, self.dtypes)),
    }
    results_iterator, _ = self._expr.start_query()
    arrow_batches = results_iterator.to_arrow_iterable(
        bqstorage_client=self._expr._session.bqstoragereadclient
    )
    for record_batch in arrow_batches:
        frame = bigframes.session._io.pandas.arrow_to_pandas(
            record_batch, column_dtypes
        )
        self._copy_index_to_pandas(frame)
        yield frame
426+
427+
def _copy_index_to_pandas(self, df: pd.DataFrame):
428+
"""Set the index on pandas DataFrame to match this block.
429+
430+
Warning: This method modifies ``df`` inplace.
431+
"""
432+
if self.index_columns:
433+
df.set_index(list(self.index_columns), inplace=True)
434+
df.index.names = self.index.names # type: ignore
435+
415436
def _compute_and_count(
416437
self,
417438
value_keys: Optional[Iterable[str]] = None,
@@ -485,10 +506,7 @@ def _compute_and_count(
485506
else:
486507
total_rows = results_iterator.total_rows
487508
df = self._to_dataframe(results_iterator)
488-
489-
if self.index_columns:
490-
df.set_index(list(self.index_columns), inplace=True)
491-
df.index.names = self.index.names # type: ignore
509+
self._copy_index_to_pandas(df)
492510

493511
return df, total_rows, query_job
494512

bigframes/dataframe.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,10 @@ def to_pandas(
893893
self._set_internal_query_job(query_job)
894894
return df.set_axis(self._block.column_labels, axis=1, copy=False)
895895

896+
def to_pandas_batches(self) -> Iterable[pandas.DataFrame]:
    """Stream this DataFrame's results as an iterable of pandas DataFrame chunks."""
    # Delegate streaming to the underlying block, which downloads the
    # query results one message at a time.
    return self._block.to_pandas_batches()
899+
896900
def _compute_dry_run(self) -> bigquery.QueryJob:
    # Delegates to the underlying block's dry-run computation and returns
    # the resulting BigQuery QueryJob.
    return self._block._compute_dry_run()
898902

bigframes/session/_io/pandas.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,16 @@
2121
import bigframes.constants
2222

2323

24-
def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict):
24+
def arrow_to_pandas(arrow_table: pyarrow.Table | pyarrow.RecordBatch, dtypes: Dict):
2525
if len(dtypes) != arrow_table.num_columns:
2626
raise ValueError(
2727
f"Number of types {len(dtypes)} doesn't match number of columns "
2828
f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}"
2929
)
3030

3131
serieses = {}
32-
for column_name, column in zip(arrow_table.column_names, arrow_table):
33-
dtype = dtypes[column_name]
32+
for field, column in zip(arrow_table.schema, arrow_table):
33+
dtype = dtypes[field.name]
3434

3535
if dtype == geopandas.array.GeometryDtype():
3636
series = geopandas.GeoSeries.from_wkt(
@@ -41,6 +41,6 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict):
4141
else:
4242
series = pandas.Series(column, dtype=dtype)
4343

44-
serieses[column_name] = series
44+
serieses[field.name] = series
4545

4646
return pandas.DataFrame(serieses)

tests/system/small/test_dataframe_io.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session):
8383
)
8484

8585

86+
def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
87+
"""Verify to_pandas_batches() APIs returns the expected dtypes."""
88+
expected = scalars_df_default_index.dtypes
89+
for df in scalars_df_default_index.to_pandas_batches():
90+
actual = df.dtypes
91+
pd.testing.assert_series_equal(actual, expected)
92+
93+
8694
@pytest.mark.parametrize(
8795
("index"),
8896
[True, False],

tests/unit/session/test_io_pandas.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import datetime
from typing import Dict, Union

import geopandas  # type: ignore
import pandas
import pandas.testing
import pyarrow  # type: ignore
import pytest

import bigframes.session._io.pandas
25+
26+
27+
@pytest.mark.parametrize(
    ("arrow_table", "dtypes", "expected"),
    (
        pytest.param(
            pyarrow.Table.from_pydict({}),
            {},
            pandas.DataFrame(),
            id="empty-df",
        ),
        pytest.param(
            pyarrow.Table.from_pydict(
                {
                    "bool": [True, None, False],
                    "bytes": [b"123", None, b"abc"],
                    "date": pyarrow.array(
                        [datetime.date(2023, 8, 29), None, datetime.date(2024, 4, 9)],
                        type=pyarrow.date32(),
                    ),
                    "datetime": pyarrow.array(
                        [
                            datetime.datetime(2023, 8, 29),
                            None,
                            datetime.datetime(2024, 4, 9, 23, 59, 59),
                        ],
                        type=pyarrow.timestamp("us"),
                    ),
                    "string": ["123", None, "abc"],
                    "time": pyarrow.array(
                        [
                            datetime.time(0, 0, 0, 1),
                            None,
                            datetime.time(23, 59, 59, 999999),
                        ],
                        type=pyarrow.time64("us"),
                    ),
                    "timestamp": pyarrow.array(
                        [
                            datetime.datetime(2023, 8, 29),
                            None,
                            datetime.datetime(2024, 4, 9, 23, 59, 59),
                        ],
                        type=pyarrow.timestamp("us", datetime.timezone.utc),
                    ),
                }
            ),
            {
                "bool": "boolean",
                "bytes": "object",
                "date": pandas.ArrowDtype(pyarrow.date32()),
                "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")),
                "string": "string[pyarrow]",
                "time": pandas.ArrowDtype(pyarrow.time64("us")),
                "timestamp": pandas.ArrowDtype(
                    pyarrow.timestamp("us", datetime.timezone.utc)
                ),
            },
            pandas.DataFrame(
                {
                    "bool": pandas.Series([True, None, False], dtype="boolean"),
                    "bytes": [b"123", None, b"abc"],
                    "date": pandas.Series(
                        [datetime.date(2023, 8, 29), None, datetime.date(2024, 4, 9)],
                        dtype=pandas.ArrowDtype(pyarrow.date32()),
                    ),
                    "datetime": pandas.Series(
                        [
                            datetime.datetime(2023, 8, 29),
                            None,
                            datetime.datetime(2024, 4, 9, 23, 59, 59),
                        ],
                        dtype=pandas.ArrowDtype(pyarrow.timestamp("us")),
                    ),
                    "string": pandas.Series(
                        ["123", None, "abc"], dtype="string[pyarrow]"
                    ),
                    "time": pandas.Series(
                        [
                            datetime.time(0, 0, 0, 1),
                            None,
                            datetime.time(23, 59, 59, 999999),
                        ],
                        dtype=pandas.ArrowDtype(pyarrow.time64("us")),
                    ),
                    "timestamp": pandas.Series(
                        [
                            datetime.datetime(2023, 8, 29),
                            None,
                            datetime.datetime(2024, 4, 9, 23, 59, 59),
                        ],
                        dtype=pandas.ArrowDtype(
                            pyarrow.timestamp("us", datetime.timezone.utc)
                        ),
                    ),
                }
            ),
            id="scalar-dtypes",
        ),
        pytest.param(
            pyarrow.Table.from_pydict(
                {
                    "geocol": [
                        "POINT(32 210)",
                        None,
                        "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)",
                    ]
                }
            ),
            {"geocol": geopandas.array.GeometryDtype()},
            pandas.DataFrame(
                {
                    "geocol": geopandas.GeoSeries.from_wkt(
                        ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"],
                        crs="EPSG:4326",
                    ),
                }
            ),
            id="geography-dtype",
        ),
    ),
)
def test_arrow_to_pandas(
    # NOTE: use typing.Union rather than PEP 604 "pyarrow.Table | pyarrow.RecordBatch".
    # Annotations are evaluated at function-definition time here (no
    # `from __future__ import annotations` in this module), and "|" between
    # classes requires Python 3.10+, so the original annotation raises
    # TypeError on import under Python 3.9.
    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch],
    dtypes: Dict,
    expected: pandas.DataFrame,
):
    """arrow_to_pandas() converts an Arrow table/batch to a DataFrame with the requested dtypes."""
    actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)
    pandas.testing.assert_frame_equal(actual, expected)
154+
155+
156+
@pytest.mark.parametrize(
    ("arrow_table", "dtypes"),
    (
        pytest.param(
            pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}),
            {"col1": "Int64"},
            id="too-few-dtypes",
        ),
        pytest.param(
            pyarrow.RecordBatch.from_pydict({"col1": [1]}),
            {"col1": "Int64", "col2": "string[pyarrow]"},
            id="too-many-dtypes",
        ),
    ),
)
def test_arrow_to_pandas_wrong_size_dtypes(
    # NOTE: use typing.Union rather than PEP 604 "pyarrow.Table | pyarrow.RecordBatch".
    # This annotation is evaluated at function-definition time (no
    # `from __future__ import annotations` in this module), and "|" between
    # classes requires Python 3.10+, so the original raises TypeError on
    # import under Python 3.9.
    arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch],
    dtypes: Dict,
):
    """arrow_to_pandas() raises ValueError when the dtypes mapping's size doesn't match the column count."""
    with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"):
        bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes)

0 commit comments

Comments
 (0)