From c35523c147e1548c77bbd2d26a10725ea144a278 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 23 Oct 2023 16:58:54 +0000 Subject: [PATCH 01/14] refactor: make `to_pandas()` call `to_arrow()` and use local dtypes in DataFrame construction --- bigframes/core/blocks.py | 41 ++++------------------ bigframes/core/indexes/index.py | 3 +- bigframes/dtypes.py | 4 +++ bigframes/session/__init__.py | 10 ++---- bigframes/session/_io/__init__.py | 13 +++++++ bigframes/session/_io/pandas.py | 46 +++++++++++++++++++++++++ tests/unit/test_dtypes.py | 57 ++++++++++++++++--------------- 7 files changed, 104 insertions(+), 70 deletions(-) create mode 100644 bigframes/session/_io/__init__.py create mode 100644 bigframes/session/_io/pandas.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 046d2b3a44..eab4645477 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -28,11 +28,8 @@ from typing import Iterable, List, Optional, Sequence, Tuple import warnings -import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy import pandas as pd -import pyarrow as pa # type: ignore import bigframes.constants as constants import bigframes.core as core @@ -46,6 +43,7 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.session._io.pandas import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used @@ -372,34 +370,11 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - @classmethod - def _to_dataframe( - cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] - ) -> pd.DataFrame: + def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" - dtypes = bigframes.dtypes.to_pandas_dtypes_overrides(result.schema) - df = result.to_dataframe( - dtypes=dtypes, - bool_dtype=pd.BooleanDtype(), - int_dtype=pd.Int64Dtype(), - float_dtype=pd.Float64Dtype(), - string_dtype=pd.StringDtype(storage="pyarrow"), - date_dtype=pd.ArrowDtype(pa.date32()), - datetime_dtype=pd.ArrowDtype(pa.timestamp("us")), - time_dtype=pd.ArrowDtype(pa.time64("us")), - timestamp_dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - ) - - # Convert Geography column from StringDType to GeometryDtype. - for column_name, dtype in schema.items(): - if dtype == gpd.array.GeometryDtype(): - df[column_name] = gpd.GeoSeries.from_wkt( - # https://github.com/geopandas/geopandas/issues/1879 - df[column_name].replace({numpy.nan: None}), - # BigQuery geography type is based on the WGS84 reference ellipsoid. 
- crs="EPSG:4326", - ) - return df + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + return self._expr._session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -480,8 +455,7 @@ def _compute_and_count( if sampling_method == _HEAD: total_rows = int(results_iterator.total_rows * fraction) results_iterator.max_results = total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -510,8 +484,7 @@ def _compute_and_count( ) else: total_rows = results_iterator.total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 677bb8529c..b9ffdff21e 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -399,9 +399,10 @@ def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. index_columns = list(self._block.index_columns) + dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results) + df = expr._session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index da221a95ac..5280e97f70 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -169,6 +169,10 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + # BigQuery only supports integers of size 64 bits. + if isinstance(ibis_dtype, ibis_dtypes.Integer): + return pd.Int64Dtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5ec3da1a5a..6ac6782eea 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1515,14 +1515,10 @@ def _get_table_size(self, destination_table): return table.num_bytes def _rows_to_dataframe( - self, row_iterator: bigquery.table.RowIterator + self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: - return row_iterator.to_dataframe( - bool_dtype=pandas.BooleanDtype(), - int_dtype=pandas.Int64Dtype(), - float_dtype=pandas.Float64Dtype(), - string_dtype=pandas.StringDtype(storage="pyarrow"), - ) + arrow_table = row_iterator.to_arrow() + return bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: diff --git a/bigframes/session/_io/__init__.py b/bigframes/session/_io/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/bigframes/session/_io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py new file mode 100644 index 0000000000..fdd79c0a25 --- /dev/null +++ b/bigframes/session/_io/pandas.py @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +import geopandas # type: ignore +import pandas +import pyarrow # type: ignore + +import bigframes.constants + + +def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): + if len(dtypes) != arrow_table.num_columns: + raise ValueError( + f"Number of types {len(dtypes)} doesn't match number of columns " + f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + ) + + serieses = {} + for column_name, column in zip(arrow_table.column_names, arrow_table): + dtype = dtypes[column_name] + + if dtype == geopandas.array.GeometryDtype(): + series = geopandas.GeoSeries.from_wkt( + column, + # BigQuery geography type is based on the WGS84 reference ellipsoid. 
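+            # EPSG:4326 is the standard code for the WGS84
+            # latitude/longitude coordinate reference system.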
+ crs="EPSG:4326", + ) + else: + series = pandas.Series(column, dtype=dtype) + + serieses[column_name] = series + + return pandas.DataFrame(serieses) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 3baff2e1f5..6ceaaf911b 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -29,41 +29,42 @@ # TODO(bmil): Add ARRAY, INTERVAL, STRUCT to cover all the standard # BigQuery data types as they appear in Ibis: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - (ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), np.dtype("O")), - (ibis_dtypes.boolean, pd.BooleanDtype()), - (ibis_dtypes.binary, np.dtype("O")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), - (ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us"))), - (ibis_dtypes.float64, pd.Float64Dtype()), - ( + pytest.param( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + np.dtype("O"), + id="bignumeric", + ), + pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), + pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), + pytest.param( + ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" + ), + pytest.param(ibis_dtypes.float64, pd.Float64Dtype(), id="float"), + pytest.param( ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), + id="geography", ), - (ibis_dtypes.int64, pd.Int64Dtype()), - (ibis_dtypes.json, np.dtype("O")), - (ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), np.dtype("O")), - (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), - ( + pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), + pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), + # TODO(tswast): custom dtype (or at least string dtype) for JSON objects + pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), + pytest.param( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + np.dtype("O"), + id="numeric", + ), + pytest.param( + ibis_dtypes.string, pd.StringDtype(storage="pyarrow"), id="string" + ), + pytest.param(ibis_dtypes.time, pd.ArrowDtype(pa.time64("us")), id="time"), + pytest.param( ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore + id="timestamp", ), ], - ids=[ - "bignumeric", - "bool", - "bytes", - "date", - "datetime", - "float", - "geography", - "int64", - "json", - "numeric", - "string", - "time", - "timestamp", - ], ) def test_ibis_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis data types needed to read BigQuery tables""" From c2f9d72153f3885d84fab2dfcb928f9578bca285 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 20:26:44 +0000 Subject: [PATCH 02/14] use integer_object_nulls=True to preserve NA/NaN distinction --- bigframes/session/_io/pandas.py | 8 +++++++- tests/system/small/test_dataframe.py | 10 ---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index fdd79c0a25..ca701c73c0 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -39,7 +39,13 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): crs="EPSG:4326", ) else: - series = pandas.Series(column, dtype=dtype) + series = column.to_pandas( + # Construct Python objects to preserve NA/NaN. 
Note: This is + # currently needed, even for nullable Float64Dtype. + # https://github.com/pandas-dev/pandas/issues/55668 + integer_object_nulls=True, + types_mapper=lambda _: dtype, + ) serieses[column_name] = series diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 19e50eb06d..84e8def83b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2046,16 +2046,6 @@ def test__dir__with_rename(scalars_dfs): def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - # dtypes of empty columns are a known area of divergence from pandas - for column in pd_result.columns: - if ( - pd_result[column].empty and column != "geography_col" - ): # for empty geography_col, bigframes assigns non-object dtype - pd_result[column] = pd_result[column].astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_frame_equal( bf_result, pd_result, From 7ccb61dbb66b3f9db9628f146bbd8625f3a5c91f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 20:52:25 +0000 Subject: [PATCH 03/14] allow NUMERIC/BIGNUMERIC to cast to FLOAT64 --- bigframes/dtypes.py | 2 ++ tests/system/small/test_series.py | 42 +++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5280e97f70..079f0cc27a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -376,6 +376,8 @@ def cast_ibis_value( ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (), + ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), + ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,), diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index bd9edbb1ca..f5d2feaf82 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,7 +575,9 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]).to_pandas() + bf_series = getattr(scalars_df[col_x], method)(scalars_df[col_y]) + # BigQuery's mod functions return NUMERIC values. + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) @@ -620,8 +622,20 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( scalars_pandas_df[col_y] ) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
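+    # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod
+    # Only the non-INT64 results need the Float64 cast below; integer
+    # results already round-trip as Int64.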
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -649,8 +663,20 @@ def test_divmods_scalars(scalars_dfs, col_x, other, method): scalars_df, scalars_pandas_df = scalars_dfs bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -1941,12 +1967,6 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - if pd_result.empty: - pd_result = pd_result.astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_series_equal( bf_result, pd_result, From 829cf99a44e7a6a205f2390ee21e832f924365be Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 16:33:30 +0000 Subject: [PATCH 04/14] better workaround for Float64Dtype NaNs --- bigframes/session/_io/pandas.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index ca701c73c0..7834811620 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -16,7 +16,9 @@ import geopandas # type: ignore import pandas +import pandas.arrays import pyarrow # type: ignore +import pyarrow.compute # type: ignore import bigframes.constants @@ -38,14 +40,17 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): # BigQuery geography type is based on the WGS84 reference ellipsoid. crs="EPSG:4326", ) - else: - series = column.to_pandas( - # Construct Python objects to preserve NA/NaN. Note: This is - # currently needed, even for nullable Float64Dtype. - # https://github.com/pandas-dev/pandas/issues/55668 - integer_object_nulls=True, - types_mapper=lambda _: dtype, + elif dtype == pandas.Float64Dtype(): + # Preserve NA/NaN distinction. Note: This is currently needed, even if we use + # nullable Float64Dtype in the types_mapper. 
See: + # https://github.com/pandas-dev/pandas/issues/55668 + pd_array = pandas.arrays.FloatingArray( + column.to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), ) + series = pandas.Series(pd_array, dtype=dtype) + else: + series = column.to_pandas(types_mapper=lambda _: dtype) serieses[column_name] = series From 3a90214ac28559f3d2cb1ae7cfadc83ab0f9b849 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 16:38:17 +0000 Subject: [PATCH 05/14] fix type error --- bigframes/session/_io/pandas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 7834811620..ea1318636a 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -44,7 +44,10 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): # Preserve NA/NaN distinction. Note: This is currently needed, even if we use # nullable Float64Dtype in the types_mapper. See: # https://github.com/pandas-dev/pandas/issues/55668 - pd_array = pandas.arrays.FloatingArray( + # Regarding type: ignore, this class has been public at this + # location since pandas 1.2.0. See: + # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html + pd_array = pandas.arrays.FloatingArray( # type: ignore column.to_numpy(), pyarrow.compute.is_null(column).to_numpy(), ) From 8bdfd79756da0319c6b861e1705d28b184d429cc Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 19:02:49 +0000 Subject: [PATCH 06/14] add unit tests for extreme values --- bigframes/session/_io/pandas.py | 27 +++- tests/unit/session/test_io_pandas.py | 216 +++++++++++++++++++++++++++ 2 files changed, 238 insertions(+), 5 deletions(-) create mode 100644 tests/unit/session/test_io_pandas.py diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index ea1318636a..163127b546 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict +from typing import Dict, Union import geopandas # type: ignore import pandas @@ -23,7 +23,9 @@ import bigframes.constants -def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): +def arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): if len(dtypes) != arrow_table.num_columns: raise ValueError( f"Number of types {len(dtypes)} doesn't match number of columns " @@ -31,8 +33,8 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): ) serieses = {} - for column_name, column in zip(arrow_table.column_names, arrow_table): - dtype = dtypes[column_name] + for field, column in zip(arrow_table.schema, arrow_table): + dtype = dtypes[field.name] if dtype == geopandas.array.GeometryDtype(): series = geopandas.GeoSeries.from_wkt( @@ -52,9 +54,24 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): pyarrow.compute.is_null(column).to_numpy(), ) series = pandas.Series(pd_array, dtype=dtype) + elif dtype == pandas.Int64Dtype(): + # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly + # casts to float64 in an intermediate step. + pd_array = pandas.arrays.IntegerArray( + pyarrow.compute.fill_null(column, 0).to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif isinstance(dtype, pandas.ArrowDtype): + # Avoid conversion logic if we are backing the pandas Series by the + # arrow array. 
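+            # This keeps the data in Arrow form rather than materializing
+            # intermediate Python objects.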
+ series = pandas.Series( + pandas.arrays.ArrowExtensionArray(column), # type: ignore + dtype=dtype, + ) else: series = column.to_pandas(types_mapper=lambda _: dtype) - serieses[column_name] = series + serieses[field.name] = series return pandas.DataFrame(serieses) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py new file mode 100644 index 0000000000..de4afe71a8 --- /dev/null +++ b/tests/unit/session/test_io_pandas.py @@ -0,0 +1,216 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +from typing import Dict, Union + +import geopandas # type: ignore +import numpy +import pandas +import pandas.arrays +import pandas.testing +import pyarrow # type: ignore +import pytest + +import bigframes.session._io.pandas + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes", "expected"), + ( + pytest.param( + pyarrow.Table.from_pydict({}), + {}, + pandas.DataFrame(), + id="empty-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "date": pyarrow.array( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + type=pyarrow.date32(), + ), + "datetime": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + type=pyarrow.timestamp("us"), + ), + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + "time": pyarrow.array( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + type=pyarrow.time64("us"), + ), + "timestamp": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + type=pyarrow.timestamp("us", datetime.timezone.utc), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "date": pandas.ArrowDtype(pyarrow.date32()), + "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + "timestamp": pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "date": pandas.Series( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + "datetime": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + "float": pandas.Series( + 
pandas.arrays.FloatingArray( + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + "timestamp": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + dtype=pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + ), + } + ), + id="scalar-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "geocol": [ + "POINT(32 210)", + None, + "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)", + ] + } + ), + {"geocol": geopandas.array.GeometryDtype()}, + pandas.DataFrame( + { + "geocol": geopandas.GeoSeries.from_wkt( + ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"], + crs="EPSG:4326", + ), + } + ), + id="geography-dtype", + ), + ), +) +def test_arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], + dtypes: Dict, + expected: pandas.DataFrame, +): + actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + pandas.testing.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes"), + ( + pytest.param( + pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}), + {"col1": "Int64"}, + id="too-few-dtypes", + ), + pytest.param( + pyarrow.RecordBatch.from_pydict({"col1": [1]}), + {"col1": "Int64", "col2": "string[pyarrow]"}, + id="too-many-dtypes", + ), + ), +) +def test_arrow_to_pandas_wrong_size_dtypes( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"): + bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) From db81a1c51c0c76d38aabf8cfd7260667b9ce83ad Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 20:22:44 +0000 Subject: [PATCH 07/14] fix tests on latest pandas --- tests/unit/session/test_io_pandas.py | 140 +++++++++++++++++++++------ 1 file changed, 110 insertions(+), 30 deletions(-) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index de4afe71a8..72ae65206f 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -38,8 +38,57 @@ pytest.param( pyarrow.Table.from_pydict( { - "bool": [True, None, True, False], - "bytes": [b"123", None, b"abc", b"xyz"], + "bool": pyarrow.array([None, None, None], type=pyarrow.bool_()), + "float": pyarrow.array([None, None, None], type=pyarrow.float64()), + "int": pyarrow.array([None, None, None], type=pyarrow.int64()), + "string": pyarrow.array([None, None, None], type=pyarrow.string()), + "time": pyarrow.array( + [None, None, None], type=pyarrow.time64("us") + ), + } + ), + { + "bool": "boolean", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + }, + pandas.DataFrame( + { + "bool": pandas.Series([None, None, None], dtype="boolean"), + "float": pandas.Series( + pandas.arrays.FloatingArray( + numpy.array( + [float("nan"), float("nan"), float("nan")], + dtype="float64", + ), + 
numpy.array([True, True, True], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [None, None, None], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + [None, None, None], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + None, + None, + None, + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="nulls-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { "date": pyarrow.array( [ datetime.date(2023, 8, 29), @@ -58,14 +107,6 @@ ], type=pyarrow.timestamp("us"), ), - "float": pyarrow.array( - [1.0, None, float("nan"), -1.0], - type=pyarrow.float64(), - ), - "int": pyarrow.array( - [1, None, -1, 2**63 - 1], - type=pyarrow.int64(), - ), "string": ["123", None, "abc", "xyz"], "time": pyarrow.array( [ @@ -88,12 +129,8 @@ } ), { - "bool": "boolean", - "bytes": "object", "date": pandas.ArrowDtype(pyarrow.date32()), "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), - "float": pandas.Float64Dtype(), - "int": pandas.Int64Dtype(), "string": "string[pyarrow]", "time": pandas.ArrowDtype(pyarrow.time64("us")), "timestamp": pandas.ArrowDtype( @@ -102,8 +139,6 @@ }, pandas.DataFrame( { - "bool": pandas.Series([True, None, True, False], dtype="boolean"), - "bytes": [b"123", None, b"abc", b"xyz"], "date": pandas.Series( [ datetime.date(2023, 8, 29), @@ -122,19 +157,6 @@ ], dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), ), - "float": pandas.Series( - pandas.arrays.FloatingArray( - numpy.array( - [1.0, float("nan"), float("nan"), -1.0], dtype="float64" - ), - numpy.array([False, True, False, False], dtype="bool"), - ), - dtype=pandas.Float64Dtype(), - ), - "int": pandas.Series( - [1, None, -1, 2**63 - 1], - dtype=pandas.Int64Dtype(), - ), "string": pandas.Series( ["123", None, "abc", "xyz"], dtype="string[pyarrow]" ), @@ -160,6 +182,53 @@ ), } ), + id="arrow-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), id="scalar-dtypes", ), pytest.param( @@ -191,7 +260,18 @@ def test_arrow_to_pandas( expected: pandas.DataFrame, ): actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) - pandas.testing.assert_frame_equal(actual, expected) + pandas.testing.assert_series_equal(actual.dtypes, expected.dtypes) + + # assert_frame_equal is converting to numpy internally, which causes some + # loss of precision with the extreme values in this test. 
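+    # Compare values pairwise instead, mapping NaN (which is never equal to
+    # itself) to a sentinel string so that NaN positions still compare equal.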
+ for column in actual.columns: + assert tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in actual[column].items() + ) == tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in expected[column].items() + ) @pytest.mark.parametrize( From b25112bc447f0d0bc3330cf274902279cdd9af05 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 20:57:44 +0000 Subject: [PATCH 08/14] mypy fixes --- tests/unit/session/test_io_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 72ae65206f..8b95977ec3 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -58,7 +58,7 @@ { "bool": pandas.Series([None, None, None], dtype="boolean"), "float": pandas.Series( - pandas.arrays.FloatingArray( + pandas.arrays.FloatingArray( # type: ignore numpy.array( [float("nan"), float("nan"), float("nan")], dtype="float64", @@ -212,7 +212,7 @@ "bool": pandas.Series([True, None, True, False], dtype="boolean"), "bytes": [b"123", None, b"abc", b"xyz"], "float": pandas.Series( - pandas.arrays.FloatingArray( + pandas.arrays.FloatingArray( # type: ignore numpy.array( [1.0, float("nan"), float("nan"), -1.0], dtype="float64" ), From a3705f9897af539ce52fc1a0cfa8853e17167d12 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 16:11:33 +0000 Subject: [PATCH 09/14] fix mod tests --- tests/system/small/test_series.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f5d2feaf82..c9510290b6 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,9 +575,15 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_series = getattr(scalars_df[col_x], method)(scalars_df[col_y]) - # BigQuery's mod functions return NUMERIC values. - bf_result = bf_series.astype("Float64").to_pandas() + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. 
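+    # See the MOD documentation: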
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) From 1a7b2d722898b7c95377521931ffb8d29ff48d2a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 15:52:48 +0000 Subject: [PATCH 10/14] feat: add `DataFrame.to_pandas_batches()` to download large `DataFrame` objects --- bigframes/core/blocks.py | 26 +++++++++++++++++++++---- bigframes/dataframe.py | 4 ++++ tests/system/small/test_dataframe_io.py | 8 ++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index eab4645477..4d322d48ee 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -412,6 +412,27 @@ def to_pandas( ) return df, query_job + def to_pandas_batches(self): + """Download results one message at a time.""" + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + results_iterator, _ = self._expr.start_query() + for arrow_table in results_iterator.to_arrow_iterable( + bqstorage_client=self._expr._session.bqstoragereadclient + ): + df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + self._copy_index_to_pandas(df) + yield df + + def _copy_index_to_pandas(self, df: pd.DataFrame): + """Set the index on pandas DataFrame to match this block. + + Warning: This method modifies ``df`` inplace. + """ + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + df.index.names = self.index.names # type: ignore + def _compute_and_count( self, value_keys: Optional[Iterable[str]] = None, @@ -485,10 +506,7 @@ def _compute_and_count( else: total_rows = results_iterator.total_rows df = self._to_dataframe(results_iterator) - - if self.index_columns: - df.set_index(list(self.index_columns), inplace=True) - df.index.names = self.index.names # type: ignore + self._copy_index_to_pandas(df) return df, total_rows, query_job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5c0d9b78e1..cd73553e23 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -893,6 +893,10 @@ def to_pandas( self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) + def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: + """Stream DataFrame results to an iterable of pandas DataFrame""" + return self._block.to_pandas_batches() + def _compute_dry_run(self) -> bigquery.QueryJob: return self._block._compute_dry_run() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d60083a837..8f5d706f62 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): + """Verify to_pandas_batches() APIs returns the expected dtypes.""" + expected = scalars_df_default_index.dtypes + for df in scalars_df_default_index.to_pandas_batches(): + actual = df.dtypes + pd.testing.assert_series_equal(actual, expected) + + @pytest.mark.parametrize( ("index"), [True, False], From c4a8b15a06d8eafc65a455ae01806c7b819f1076 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: 
Thu, 26 Oct 2023 17:32:52 +0000 Subject: [PATCH 11/14] allow copies --- bigframes/session/_io/pandas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 163127b546..d110aa25e7 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -50,16 +50,16 @@ def arrow_to_pandas( # location since pandas 1.2.0. See: # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html pd_array = pandas.arrays.FloatingArray( # type: ignore - column.to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + column.to_numpy(zero_copy_only=False), + pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif dtype == pandas.Int64Dtype(): # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly # casts to float64 in an intermediate step. pd_array = pandas.arrays.IntegerArray( - pyarrow.compute.fill_null(column, 0).to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + pyarrow.compute.fill_null(column, 0).to_numpy(zero_copy_only=False), + pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): From 239e5efd7bee57bdc01cdc16a9d01d33b89e81df Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 17:44:24 +0000 Subject: [PATCH 12/14] allow copies only for contiguous arrays --- bigframes/session/_io/pandas.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index d110aa25e7..1af00a2d01 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -46,20 +46,32 @@ def arrow_to_pandas( # Preserve NA/NaN distinction. Note: This is currently needed, even if we use # nullable Float64Dtype in the types_mapper. See: # https://github.com/pandas-dev/pandas/issues/55668 + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, float("nan")) # Regarding type: ignore, this class has been public at this # location since pandas 1.2.0. See: # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html pd_array = pandas.arrays.FloatingArray( # type: ignore - column.to_numpy(zero_copy_only=False), - pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif dtype == pandas.Int64Dtype(): # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly # casts to float64 in an intermediate step. 
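+            # (float64 can represent integers exactly only up to 2**53, so
+            # INT64 values beyond that would be silently corrupted.)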
+ mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, 0) pd_array = pandas.arrays.IntegerArray( - pyarrow.compute.fill_null(column, 0).to_numpy(zero_copy_only=False), - pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): From ea4d9dff65e6ddecd1e16fd8c99d901ffc16c4c4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 17:53:08 +0000 Subject: [PATCH 13/14] test with chunked_array --- tests/unit/session/test_io_pandas.py | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 8b95977ec3..0f6f5dae03 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -231,6 +231,62 @@ ), id="scalar-dtypes", ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.chunked_array( + [[True, None], [True, False]], + type=pyarrow.bool_(), + ), + "bytes": pyarrow.chunked_array( + [[b"123", None], [b"abc", b"xyz"]], + type=pyarrow.binary(), + ), + "float": pyarrow.chunked_array( + [[1.0, None], [float("nan"), -1.0]], + type=pyarrow.float64(), + ), + "int": pyarrow.chunked_array( + [[1, None], [-1, 2**63 - 1]], + type=pyarrow.int64(), + ), + "string": pyarrow.chunked_array( + [["123", None], ["abc", "xyz"]], + type=pyarrow.string(), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes-chunked_array", + ), pytest.param( pyarrow.Table.from_pydict( { From e1e291d0e0938adde13330c934ac5cf3f3a95b2b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 21:43:50 +0000 Subject: [PATCH 14/14] explain type: ignore --- bigframes/core/blocks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4d322d48ee..4d240484a6 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -431,6 +431,9 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): """ if self.index_columns: df.set_index(list(self.index_columns), inplace=True) + # Pandas names is annotated as list[str] rather than the more + # general Sequence[Label] that BigQuery DataFrames has. + # See: https://github.com/pandas-dev/pandas-stubs/issues/804 df.index.names = self.index.names # type: ignore def _compute_and_count(
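
Reviewer note: to make the intended behavior of the new helper concrete, here is a
minimal sketch of calling `bigframes.session._io.pandas.arrow_to_pandas` directly.
The column names and values are illustrative only (not taken from the test suite),
and it assumes the helper as of patch 12 above.

    import pandas
    import pyarrow

    import bigframes.session._io.pandas

    # "float_col" mixes a SQL NULL with a genuine NaN; the Float64Dtype branch
    # keeps them distinct (pandas.NA vs. NaN) instead of collapsing both to NaN.
    # "int_col" includes the INT64 maximum, which the IntegerArray construction
    # preserves without an intermediate float64 cast.
    table = pyarrow.Table.from_pydict(
        {
            "int_col": pyarrow.array([1, None, 2**63 - 1], type=pyarrow.int64()),
            "float_col": pyarrow.array(
                [1.0, None, float("nan")], type=pyarrow.float64()
            ),
        }
    )
    dtypes = {
        "int_col": pandas.Int64Dtype(),
        "float_col": pandas.Float64Dtype(),
    }
    df = bigframes.session._io.pandas.arrow_to_pandas(table, dtypes)
    # df.dtypes: int_col -> Int64, float_col -> Float64
    # df["float_col"] holds [1.0, <NA>, nan]: the NULL becomes pandas.NA while
    # the NaN stays NaN, since pyarrow.compute.is_null() does not treat NaN as
    # null by default.

`DataFrame.to_pandas_batches()` from patch 10 feeds each `RecordBatch` from
`to_arrow_iterable` through this same helper, so every yielded pandas DataFrame
carries the same dtypes as `to_pandas()`.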