From c35523c147e1548c77bbd2d26a10725ea144a278 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 23 Oct 2023 16:58:54 +0000 Subject: [PATCH 01/14] refactor: make `to_pandas()` call `to_arrow()` and use local dtypes in DataFrame construction --- bigframes/core/blocks.py | 41 ++++------------------ bigframes/core/indexes/index.py | 3 +- bigframes/dtypes.py | 4 +++ bigframes/session/__init__.py | 10 ++---- bigframes/session/_io/__init__.py | 13 +++++++ bigframes/session/_io/pandas.py | 46 +++++++++++++++++++++++++ tests/unit/test_dtypes.py | 57 ++++++++++++++++--------------- 7 files changed, 104 insertions(+), 70 deletions(-) create mode 100644 bigframes/session/_io/__init__.py create mode 100644 bigframes/session/_io/pandas.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 046d2b3a44..eab4645477 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -28,11 +28,8 @@ from typing import Iterable, List, Optional, Sequence, Tuple import warnings -import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy import pandas as pd -import pyarrow as pa # type: ignore import bigframes.constants as constants import bigframes.core as core @@ -46,6 +43,7 @@ import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.session._io.pandas import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used @@ -372,34 +370,11 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - @classmethod - def _to_dataframe( - cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] - ) -> pd.DataFrame: + def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" - dtypes = bigframes.dtypes.to_pandas_dtypes_overrides(result.schema) - df = result.to_dataframe( - dtypes=dtypes, - bool_dtype=pd.BooleanDtype(), - int_dtype=pd.Int64Dtype(), - float_dtype=pd.Float64Dtype(), - string_dtype=pd.StringDtype(storage="pyarrow"), - date_dtype=pd.ArrowDtype(pa.date32()), - datetime_dtype=pd.ArrowDtype(pa.timestamp("us")), - time_dtype=pd.ArrowDtype(pa.time64("us")), - timestamp_dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - ) - - # Convert Geography column from StringDType to GeometryDtype. - for column_name, dtype in schema.items(): - if dtype == gpd.array.GeometryDtype(): - df[column_name] = gpd.GeoSeries.from_wkt( - # https://github.com/geopandas/geopandas/issues/1879 - df[column_name].replace({numpy.nan: None}), - # BigQuery geography type is based on the WGS84 reference ellipsoid. 
- crs="EPSG:4326", - ) - return df + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + return self._expr._session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -480,8 +455,7 @@ def _compute_and_count( if sampling_method == _HEAD: total_rows = int(results_iterator.total_rows * fraction) results_iterator.max_results = total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -510,8 +484,7 @@ def _compute_and_count( ) else: total_rows = results_iterator.total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 677bb8529c..b9ffdff21e 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -399,9 +399,10 @@ def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. index_columns = list(self._block.index_columns) + dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results) + df = expr._session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index da221a95ac..5280e97f70 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -169,6 +169,10 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + # BigQuery only supports integers of size 64 bits. + if isinstance(ibis_dtype, ibis_dtypes.Integer): + return pd.Int64Dtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5ec3da1a5a..6ac6782eea 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1515,14 +1515,10 @@ def _get_table_size(self, destination_table): return table.num_bytes def _rows_to_dataframe( - self, row_iterator: bigquery.table.RowIterator + self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: - return row_iterator.to_dataframe( - bool_dtype=pandas.BooleanDtype(), - int_dtype=pandas.Int64Dtype(), - float_dtype=pandas.Float64Dtype(), - string_dtype=pandas.StringDtype(storage="pyarrow"), - ) + arrow_table = row_iterator.to_arrow() + return bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: diff --git a/bigframes/session/_io/__init__.py b/bigframes/session/_io/__init__.py new file mode 100644 index 0000000000..1dc90d1848 --- /dev/null +++ b/bigframes/session/_io/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py new file mode 100644 index 0000000000..fdd79c0a25 --- /dev/null +++ b/bigframes/session/_io/pandas.py @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + +import geopandas # type: ignore +import pandas +import pyarrow # type: ignore + +import bigframes.constants + + +def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): + if len(dtypes) != arrow_table.num_columns: + raise ValueError( + f"Number of types {len(dtypes)} doesn't match number of columns " + f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + ) + + serieses = {} + for column_name, column in zip(arrow_table.column_names, arrow_table): + dtype = dtypes[column_name] + + if dtype == geopandas.array.GeometryDtype(): + series = geopandas.GeoSeries.from_wkt( + column, + # BigQuery geography type is based on the WGS84 reference ellipsoid. 
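+            # EPSG:4326 is the standard code for the WGS84
+            # latitude/longitude coordinate reference system.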
+ crs="EPSG:4326", + ) + else: + series = pandas.Series(column, dtype=dtype) + + serieses[column_name] = series + + return pandas.DataFrame(serieses) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 3baff2e1f5..6ceaaf911b 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -29,41 +29,42 @@ # TODO(bmil): Add ARRAY, INTERVAL, STRUCT to cover all the standard # BigQuery data types as they appear in Ibis: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - (ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), np.dtype("O")), - (ibis_dtypes.boolean, pd.BooleanDtype()), - (ibis_dtypes.binary, np.dtype("O")), - (ibis_dtypes.date, pd.ArrowDtype(pa.date32())), - (ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us"))), - (ibis_dtypes.float64, pd.Float64Dtype()), - ( + pytest.param( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + np.dtype("O"), + id="bignumeric", + ), + pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), + pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), + pytest.param( + ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" + ), + pytest.param(ibis_dtypes.float64, pd.Float64Dtype(), id="float"), + pytest.param( ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), + id="geography", ), - (ibis_dtypes.int64, pd.Int64Dtype()), - (ibis_dtypes.json, np.dtype("O")), - (ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), np.dtype("O")), - (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), - ( + pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), + pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), + # TODO(tswast): custom dtype (or at least string dtype) for JSON objects + pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), + pytest.param( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + np.dtype("O"), + id="numeric", + ), + pytest.param( + ibis_dtypes.string, pd.StringDtype(storage="pyarrow"), id="string" + ), + pytest.param(ibis_dtypes.time, pd.ArrowDtype(pa.time64("us")), id="time"), + pytest.param( ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore + id="timestamp", ), ], - ids=[ - "bignumeric", - "bool", - "bytes", - "date", - "datetime", - "float", - "geography", - "int64", - "json", - "numeric", - "string", - "time", - "timestamp", - ], ) def test_ibis_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis data types needed to read BigQuery tables""" From c2f9d72153f3885d84fab2dfcb928f9578bca285 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 20:26:44 +0000 Subject: [PATCH 02/14] use integer_object_nulls=True to preserve NA/NaN distinction --- bigframes/session/_io/pandas.py | 8 +++++++- tests/system/small/test_dataframe.py | 10 ---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index fdd79c0a25..ca701c73c0 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -39,7 +39,13 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): crs="EPSG:4326", ) else: - series = pandas.Series(column, dtype=dtype) + series = column.to_pandas( + # Construct Python objects to preserve NA/NaN. 
Note: This is + # currently needed, even for nullable Float64Dtype. + # https://github.com/pandas-dev/pandas/issues/55668 + integer_object_nulls=True, + types_mapper=lambda _: dtype, + ) serieses[column_name] = series diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 19e50eb06d..84e8def83b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2046,16 +2046,6 @@ def test__dir__with_rename(scalars_dfs): def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - # dtypes of empty columns are a known area of divergence from pandas - for column in pd_result.columns: - if ( - pd_result[column].empty and column != "geography_col" - ): # for empty geography_col, bigframes assigns non-object dtype - pd_result[column] = pd_result[column].astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_frame_equal( bf_result, pd_result, From 7ccb61dbb66b3f9db9628f146bbd8625f3a5c91f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 20:52:25 +0000 Subject: [PATCH 03/14] allow NUMERIC/BIGNUMERIC to cast to FLOAT64 --- bigframes/dtypes.py | 2 ++ tests/system/small/test_series.py | 42 +++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5280e97f70..079f0cc27a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -376,6 +376,8 @@ def cast_ibis_value( ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (), + ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), + ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,), diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index bd9edbb1ca..f5d2feaf82 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,7 +575,9 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]).to_pandas() + bf_series = getattr(scalars_df[col_x], method)(scalars_df[col_y]) + # BigQuery's mod functions return NUMERIC values. + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) @@ -620,8 +622,20 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( scalars_pandas_df[col_y] ) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
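+    # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod
+    # Only the non-INT64 results need the Float64 cast below; integer
+    # results already round-trip as Int64.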
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -649,8 +663,20 @@ def test_divmods_scalars(scalars_dfs, col_x, other, method): scalars_df, scalars_pandas_df = scalars_dfs bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -1941,12 +1967,6 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - if pd_result.empty: - pd_result = pd_result.astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_series_equal( bf_result, pd_result, From 829cf99a44e7a6a205f2390ee21e832f924365be Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 16:33:30 +0000 Subject: [PATCH 04/14] better workaround for Float64Dtype NaNs --- bigframes/session/_io/pandas.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index ca701c73c0..7834811620 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -16,7 +16,9 @@ import geopandas # type: ignore import pandas +import pandas.arrays import pyarrow # type: ignore +import pyarrow.compute # type: ignore import bigframes.constants @@ -38,14 +40,17 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): # BigQuery geography type is based on the WGS84 reference ellipsoid. crs="EPSG:4326", ) - else: - series = column.to_pandas( - # Construct Python objects to preserve NA/NaN. Note: This is - # currently needed, even for nullable Float64Dtype. - # https://github.com/pandas-dev/pandas/issues/55668 - integer_object_nulls=True, - types_mapper=lambda _: dtype, + elif dtype == pandas.Float64Dtype(): + # Preserve NA/NaN distinction. Note: This is currently needed, even if we use + # nullable Float64Dtype in the types_mapper. 
See: + # https://github.com/pandas-dev/pandas/issues/55668 + pd_array = pandas.arrays.FloatingArray( + column.to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), ) + series = pandas.Series(pd_array, dtype=dtype) + else: + series = column.to_pandas(types_mapper=lambda _: dtype) serieses[column_name] = series From 3a90214ac28559f3d2cb1ae7cfadc83ab0f9b849 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 16:38:17 +0000 Subject: [PATCH 05/14] fix type error --- bigframes/session/_io/pandas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 7834811620..ea1318636a 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -44,7 +44,10 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): # Preserve NA/NaN distinction. Note: This is currently needed, even if we use # nullable Float64Dtype in the types_mapper. See: # https://github.com/pandas-dev/pandas/issues/55668 - pd_array = pandas.arrays.FloatingArray( + # Regarding type: ignore, this class has been public at this + # location since pandas 1.2.0. See: + # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html + pd_array = pandas.arrays.FloatingArray( # type: ignore column.to_numpy(), pyarrow.compute.is_null(column).to_numpy(), ) From 8bdfd79756da0319c6b861e1705d28b184d429cc Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 19:02:49 +0000 Subject: [PATCH 06/14] add unit tests for extreme values --- bigframes/session/_io/pandas.py | 27 +++- tests/unit/session/test_io_pandas.py | 216 +++++++++++++++++++++++++++ 2 files changed, 238 insertions(+), 5 deletions(-) create mode 100644 tests/unit/session/test_io_pandas.py diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index ea1318636a..163127b546 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict +from typing import Dict, Union import geopandas # type: ignore import pandas @@ -23,7 +23,9 @@ import bigframes.constants -def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): +def arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): if len(dtypes) != arrow_table.num_columns: raise ValueError( f"Number of types {len(dtypes)} doesn't match number of columns " @@ -31,8 +33,8 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): ) serieses = {} - for column_name, column in zip(arrow_table.column_names, arrow_table): - dtype = dtypes[column_name] + for field, column in zip(arrow_table.schema, arrow_table): + dtype = dtypes[field.name] if dtype == geopandas.array.GeometryDtype(): series = geopandas.GeoSeries.from_wkt( @@ -52,9 +54,24 @@ def arrow_to_pandas(arrow_table: pyarrow.Table, dtypes: Dict): pyarrow.compute.is_null(column).to_numpy(), ) series = pandas.Series(pd_array, dtype=dtype) + elif dtype == pandas.Int64Dtype(): + # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly + # casts to float64 in an intermediate step. + pd_array = pandas.arrays.IntegerArray( + pyarrow.compute.fill_null(column, 0).to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif isinstance(dtype, pandas.ArrowDtype): + # Avoid conversion logic if we are backing the pandas Series by the + # arrow array. 
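+            # This keeps the data in Arrow form rather than materializing
+            # intermediate Python objects.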
+ series = pandas.Series( + pandas.arrays.ArrowExtensionArray(column), # type: ignore + dtype=dtype, + ) else: series = column.to_pandas(types_mapper=lambda _: dtype) - serieses[column_name] = series + serieses[field.name] = series return pandas.DataFrame(serieses) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py new file mode 100644 index 0000000000..de4afe71a8 --- /dev/null +++ b/tests/unit/session/test_io_pandas.py @@ -0,0 +1,216 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +from typing import Dict, Union + +import geopandas # type: ignore +import numpy +import pandas +import pandas.arrays +import pandas.testing +import pyarrow # type: ignore +import pytest + +import bigframes.session._io.pandas + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes", "expected"), + ( + pytest.param( + pyarrow.Table.from_pydict({}), + {}, + pandas.DataFrame(), + id="empty-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "date": pyarrow.array( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + type=pyarrow.date32(), + ), + "datetime": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + type=pyarrow.timestamp("us"), + ), + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + "time": pyarrow.array( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + type=pyarrow.time64("us"), + ), + "timestamp": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + type=pyarrow.timestamp("us", datetime.timezone.utc), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "date": pandas.ArrowDtype(pyarrow.date32()), + "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + "timestamp": pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "date": pandas.Series( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + "datetime": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + "float": pandas.Series( + 
pandas.arrays.FloatingArray( + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + "timestamp": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + dtype=pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + ), + } + ), + id="scalar-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "geocol": [ + "POINT(32 210)", + None, + "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)", + ] + } + ), + {"geocol": geopandas.array.GeometryDtype()}, + pandas.DataFrame( + { + "geocol": geopandas.GeoSeries.from_wkt( + ["POINT(32 210)", None, "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"], + crs="EPSG:4326", + ), + } + ), + id="geography-dtype", + ), + ), +) +def test_arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], + dtypes: Dict, + expected: pandas.DataFrame, +): + actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + pandas.testing.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes"), + ( + pytest.param( + pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}), + {"col1": "Int64"}, + id="too-few-dtypes", + ), + pytest.param( + pyarrow.RecordBatch.from_pydict({"col1": [1]}), + {"col1": "Int64", "col2": "string[pyarrow]"}, + id="too-many-dtypes", + ), + ), +) +def test_arrow_to_pandas_wrong_size_dtypes( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"): + bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) From db81a1c51c0c76d38aabf8cfd7260667b9ce83ad Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 20:22:44 +0000 Subject: [PATCH 07/14] fix tests on latest pandas --- tests/unit/session/test_io_pandas.py | 140 +++++++++++++++++++++------ 1 file changed, 110 insertions(+), 30 deletions(-) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index de4afe71a8..72ae65206f 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -38,8 +38,57 @@ pytest.param( pyarrow.Table.from_pydict( { - "bool": [True, None, True, False], - "bytes": [b"123", None, b"abc", b"xyz"], + "bool": pyarrow.array([None, None, None], type=pyarrow.bool_()), + "float": pyarrow.array([None, None, None], type=pyarrow.float64()), + "int": pyarrow.array([None, None, None], type=pyarrow.int64()), + "string": pyarrow.array([None, None, None], type=pyarrow.string()), + "time": pyarrow.array( + [None, None, None], type=pyarrow.time64("us") + ), + } + ), + { + "bool": "boolean", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + }, + pandas.DataFrame( + { + "bool": pandas.Series([None, None, None], dtype="boolean"), + "float": pandas.Series( + pandas.arrays.FloatingArray( + numpy.array( + [float("nan"), float("nan"), float("nan")], + dtype="float64", + ), + 
numpy.array([True, True, True], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [None, None, None], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + [None, None, None], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + None, + None, + None, + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="nulls-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { "date": pyarrow.array( [ datetime.date(2023, 8, 29), @@ -58,14 +107,6 @@ ], type=pyarrow.timestamp("us"), ), - "float": pyarrow.array( - [1.0, None, float("nan"), -1.0], - type=pyarrow.float64(), - ), - "int": pyarrow.array( - [1, None, -1, 2**63 - 1], - type=pyarrow.int64(), - ), "string": ["123", None, "abc", "xyz"], "time": pyarrow.array( [ @@ -88,12 +129,8 @@ } ), { - "bool": "boolean", - "bytes": "object", "date": pandas.ArrowDtype(pyarrow.date32()), "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), - "float": pandas.Float64Dtype(), - "int": pandas.Int64Dtype(), "string": "string[pyarrow]", "time": pandas.ArrowDtype(pyarrow.time64("us")), "timestamp": pandas.ArrowDtype( @@ -102,8 +139,6 @@ }, pandas.DataFrame( { - "bool": pandas.Series([True, None, True, False], dtype="boolean"), - "bytes": [b"123", None, b"abc", b"xyz"], "date": pandas.Series( [ datetime.date(2023, 8, 29), @@ -122,19 +157,6 @@ ], dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), ), - "float": pandas.Series( - pandas.arrays.FloatingArray( - numpy.array( - [1.0, float("nan"), float("nan"), -1.0], dtype="float64" - ), - numpy.array([False, True, False, False], dtype="bool"), - ), - dtype=pandas.Float64Dtype(), - ), - "int": pandas.Series( - [1, None, -1, 2**63 - 1], - dtype=pandas.Int64Dtype(), - ), "string": pandas.Series( ["123", None, "abc", "xyz"], dtype="string[pyarrow]" ), @@ -160,6 +182,53 @@ ), } ), + id="arrow-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), id="scalar-dtypes", ), pytest.param( @@ -191,7 +260,18 @@ def test_arrow_to_pandas( expected: pandas.DataFrame, ): actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) - pandas.testing.assert_frame_equal(actual, expected) + pandas.testing.assert_series_equal(actual.dtypes, expected.dtypes) + + # assert_frame_equal is converting to numpy internally, which causes some + # loss of precision with the extreme values in this test. 
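+    # Compare values pairwise instead, mapping NaN (which is never equal to
+    # itself) to a sentinel string so that NaN positions still compare equal.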
+ for column in actual.columns: + assert tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in actual[column].items() + ) == tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in expected[column].items() + ) @pytest.mark.parametrize( From b25112bc447f0d0bc3330cf274902279cdd9af05 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 25 Oct 2023 20:57:44 +0000 Subject: [PATCH 08/14] mypy fixes --- tests/unit/session/test_io_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 72ae65206f..8b95977ec3 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -58,7 +58,7 @@ { "bool": pandas.Series([None, None, None], dtype="boolean"), "float": pandas.Series( - pandas.arrays.FloatingArray( + pandas.arrays.FloatingArray( # type: ignore numpy.array( [float("nan"), float("nan"), float("nan")], dtype="float64", @@ -212,7 +212,7 @@ "bool": pandas.Series([True, None, True, False], dtype="boolean"), "bytes": [b"123", None, b"abc", b"xyz"], "float": pandas.Series( - pandas.arrays.FloatingArray( + pandas.arrays.FloatingArray( # type: ignore numpy.array( [1.0, float("nan"), float("nan"), -1.0], dtype="float64" ), From a3705f9897af539ce52fc1a0cfa8853e17167d12 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 16:11:33 +0000 Subject: [PATCH 09/14] fix mod tests --- tests/system/small/test_series.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f5d2feaf82..c9510290b6 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,9 +575,15 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_series = getattr(scalars_df[col_x], method)(scalars_df[col_y]) - # BigQuery's mod functions return NUMERIC values. - bf_result = bf_series.astype("Float64").to_pandas() + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. 
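+    # See the MOD documentation: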
+ # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) From 1a7b2d722898b7c95377521931ffb8d29ff48d2a Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 24 Oct 2023 15:52:48 +0000 Subject: [PATCH 10/14] feat: add `DataFrame.to_pandas_batches()` to download large `DataFrame` objects --- bigframes/core/blocks.py | 26 +++++++++++++++++++++---- bigframes/dataframe.py | 4 ++++ tests/system/small/test_dataframe_io.py | 8 ++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index eab4645477..4d322d48ee 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -412,6 +412,27 @@ def to_pandas( ) return df, query_job + def to_pandas_batches(self): + """Download results one message at a time.""" + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + results_iterator, _ = self._expr.start_query() + for arrow_table in results_iterator.to_arrow_iterable( + bqstorage_client=self._expr._session.bqstoragereadclient + ): + df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + self._copy_index_to_pandas(df) + yield df + + def _copy_index_to_pandas(self, df: pd.DataFrame): + """Set the index on pandas DataFrame to match this block. + + Warning: This method modifies ``df`` inplace. + """ + if self.index_columns: + df.set_index(list(self.index_columns), inplace=True) + df.index.names = self.index.names # type: ignore + def _compute_and_count( self, value_keys: Optional[Iterable[str]] = None, @@ -485,10 +506,7 @@ def _compute_and_count( else: total_rows = results_iterator.total_rows df = self._to_dataframe(results_iterator) - - if self.index_columns: - df.set_index(list(self.index_columns), inplace=True) - df.index.names = self.index.names # type: ignore + self._copy_index_to_pandas(df) return df, total_rows, query_job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5c0d9b78e1..cd73553e23 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -893,6 +893,10 @@ def to_pandas( self._set_internal_query_job(query_job) return df.set_axis(self._block.column_labels, axis=1, copy=False) + def to_pandas_batches(self) -> Iterable[pandas.DataFrame]: + """Stream DataFrame results to an iterable of pandas DataFrame""" + return self._block.to_pandas_batches() + def _compute_dry_run(self) -> bigquery.QueryJob: return self._block._compute_dry_run() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d60083a837..8f5d706f62 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -83,6 +83,14 @@ def test_to_pandas_array_struct_correct_result(session): ) +def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): + """Verify to_pandas_batches() APIs returns the expected dtypes.""" + expected = scalars_df_default_index.dtypes + for df in scalars_df_default_index.to_pandas_batches(): + actual = df.dtypes + pd.testing.assert_series_equal(actual, expected) + + @pytest.mark.parametrize( ("index"), [True, False], From c4a8b15a06d8eafc65a455ae01806c7b819f1076 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: 
Thu, 26 Oct 2023 17:32:52 +0000 Subject: [PATCH 11/14] allow copies --- bigframes/session/_io/pandas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 163127b546..d110aa25e7 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -50,16 +50,16 @@ def arrow_to_pandas( # location since pandas 1.2.0. See: # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html pd_array = pandas.arrays.FloatingArray( # type: ignore - column.to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + column.to_numpy(zero_copy_only=False), + pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif dtype == pandas.Int64Dtype(): # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly # casts to float64 in an intermediate step. pd_array = pandas.arrays.IntegerArray( - pyarrow.compute.fill_null(column, 0).to_numpy(), - pyarrow.compute.is_null(column).to_numpy(), + pyarrow.compute.fill_null(column, 0).to_numpy(zero_copy_only=False), + pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): From 239e5efd7bee57bdc01cdc16a9d01d33b89e81df Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 17:44:24 +0000 Subject: [PATCH 12/14] allow copies only for contiguous arrays --- bigframes/session/_io/pandas.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index d110aa25e7..1af00a2d01 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -46,20 +46,32 @@ def arrow_to_pandas( # Preserve NA/NaN distinction. Note: This is currently needed, even if we use # nullable Float64Dtype in the types_mapper. See: # https://github.com/pandas-dev/pandas/issues/55668 + mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, float("nan")) # Regarding type: ignore, this class has been public at this # location since pandas 1.2.0. See: # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html pd_array = pandas.arrays.FloatingArray( # type: ignore - column.to_numpy(zero_copy_only=False), - pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif dtype == pandas.Int64Dtype(): # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly # casts to float64 in an intermediate step. 
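+            # (float64 can represent integers exactly only up to 2**53, so
+            # INT64 values beyond that would be silently corrupted.)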
+ mask = pyarrow.compute.is_null(column) + nonnull = pyarrow.compute.fill_null(column, 0) pd_array = pandas.arrays.IntegerArray( - pyarrow.compute.fill_null(column, 0).to_numpy(zero_copy_only=False), - pyarrow.compute.is_null(column).to_numpy(zero_copy_only=False), + nonnull.to_numpy() + if isinstance(nonnull, pyarrow.ChunkedArray) + else nonnull.to_numpy(zero_copy_only=False), + mask.to_numpy() + if isinstance(mask, pyarrow.ChunkedArray) + else mask.to_numpy(zero_copy_only=False), ) series = pandas.Series(pd_array, dtype=dtype) elif isinstance(dtype, pandas.ArrowDtype): From ea4d9dff65e6ddecd1e16fd8c99d901ffc16c4c4 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 17:53:08 +0000 Subject: [PATCH 13/14] test with chunked_array --- tests/unit/session/test_io_pandas.py | 56 ++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py index 8b95977ec3..0f6f5dae03 100644 --- a/tests/unit/session/test_io_pandas.py +++ b/tests/unit/session/test_io_pandas.py @@ -231,6 +231,62 @@ ), id="scalar-dtypes", ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.chunked_array( + [[True, None], [True, False]], + type=pyarrow.bool_(), + ), + "bytes": pyarrow.chunked_array( + [[b"123", None], [b"abc", b"xyz"]], + type=pyarrow.binary(), + ), + "float": pyarrow.chunked_array( + [[1.0, None], [float("nan"), -1.0]], + type=pyarrow.float64(), + ), + "int": pyarrow.chunked_array( + [[1, None], [-1, 2**63 - 1]], + type=pyarrow.int64(), + ), + "string": pyarrow.chunked_array( + [["123", None], ["abc", "xyz"]], + type=pyarrow.string(), + ), + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes-chunked_array", + ), pytest.param( pyarrow.Table.from_pydict( { From e1e291d0e0938adde13330c934ac5cf3f3a95b2b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 21:43:50 +0000 Subject: [PATCH 14/14] explain type: ignore --- bigframes/core/blocks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 4d322d48ee..4d240484a6 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -431,6 +431,9 @@ def _copy_index_to_pandas(self, df: pd.DataFrame): """ if self.index_columns: df.set_index(list(self.index_columns), inplace=True) + # Pandas names is annotated as list[str] rather than the more + # general Sequence[Label] that BigQuery DataFrames has. + # See: https://github.com/pandas-dev/pandas-stubs/issues/804 df.index.names = self.index.names # type: ignore def _compute_and_count(
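
Reviewer note: to make the intended behavior of the new helper concrete, here is a
minimal sketch of calling `bigframes.session._io.pandas.arrow_to_pandas` directly.
The column names and values are illustrative only (not taken from the test suite),
and it assumes the helper as of patch 12 above.

    import pandas
    import pyarrow

    import bigframes.session._io.pandas

    # "float_col" mixes a SQL NULL with a genuine NaN; the Float64Dtype branch
    # keeps them distinct (pandas.NA vs. NaN) instead of collapsing both to NaN.
    # "int_col" includes the INT64 maximum, which the IntegerArray construction
    # preserves without an intermediate float64 cast.
    table = pyarrow.Table.from_pydict(
        {
            "int_col": pyarrow.array([1, None, 2**63 - 1], type=pyarrow.int64()),
            "float_col": pyarrow.array(
                [1.0, None, float("nan")], type=pyarrow.float64()
            ),
        }
    )
    dtypes = {
        "int_col": pandas.Int64Dtype(),
        "float_col": pandas.Float64Dtype(),
    }
    df = bigframes.session._io.pandas.arrow_to_pandas(table, dtypes)
    # df.dtypes: int_col -> Int64, float_col -> Float64
    # df["float_col"] holds [1.0, <NA>, nan]: the NULL becomes pandas.NA while
    # the NaN stays NaN, since pyarrow.compute.is_null() does not treat NaN as
    # null by default.

`DataFrame.to_pandas_batches()` from patch 10 feeds each `RecordBatch` from
`to_arrow_iterable` through this same helper, so every yielded pandas DataFrame
carries the same dtypes as `to_pandas()`.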