From 1daec0077a19de9fd9bcb97e0119a71b5e78bedc Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 21 Dec 2024 11:50:21 +0100 Subject: [PATCH 01/10] TST: add test to show error when an empty object column is written using arrow --- pyogrio/tests/test_geopandas_io.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 96d9e3a0..9bd143cb 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1104,15 +1104,27 @@ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow): @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"]) +@pytest.mark.parametrize( + "columns, dtype", + [ + ([], None), + (["col_int"], np.int64), + (["col_float"], np.float64), + (["col_object"], object), + ], +) @pytest.mark.requires_arrow_write_api -def test_write_empty_dataframe(tmp_path, ext, use_arrow): - expected = gp.GeoDataFrame(geometry=[], crs=4326) +def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow): + if use_arrow and dtype is object: + pytest.xfail(reason="writing an empty object column with Arrow gives an error") + + expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326) filename = tmp_path / f"test{ext}" write_dataframe(expected, filename, use_arrow=use_arrow) assert filename.exists() - df = read_dataframe(filename) + df = read_dataframe(filename, use_arrow=use_arrow) assert_geodataframe_equal(df, expected) From e35e7083a21546a10329e656ad7fe989faf7d1a8 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 20:20:02 +0100 Subject: [PATCH 02/10] Fix issue --- pyogrio/geopandas.py | 5 +++++ pyogrio/tests/test_geopandas_io.py | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 45fe3b1a..ef8c95ec 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -458,6 +458,11 @@ def write_dataframe( use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0"))) path, driver = _get_write_path_driver(path, driver, append=append) + if use_arrow and df.empty or len(df) == 0: + # with arrow, string columns without data trigger an error, so disable arrow + # when writing an empty dataframe. + use_arrow = False + geometry_columns = df.columns[df.dtypes == "geometry"] if len(geometry_columns) > 1: raise ValueError( diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 725a2d5b..0eb34217 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1157,9 +1157,6 @@ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow): ) @pytest.mark.requires_arrow_write_api def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow): - if use_arrow and dtype is object: - pytest.xfail(reason="writing an empty object column with Arrow gives an error") - expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326) filename = tmp_path / f"test{ext}" From 5c145693af5d297f1b0794ae62e12329173ff527 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 21:09:37 +0100 Subject: [PATCH 03/10] Update CHANGES.md --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 117751cd..7c9f2b48 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,7 +9,8 @@ ### Bug fixes - Fix WKB writing on big-endian systems (#497). -- Fix writing fids to e.g. GPKG file with use_arrow (#511). +- Fix writing fids to e.g. GPKG file with `use_arrow` (#511). +- Fix error when an empty object column is written with `use_arrow` (#512). ### Packaging From 8026fa406c393a93bcbd2413f37d7183cc183f9a Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 21:10:10 +0100 Subject: [PATCH 04/10] Add test with xfail for object column with only None values --- pyogrio/tests/test_geopandas_io.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 0eb34217..c0bb2541 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1184,6 +1184,24 @@ def test_write_empty_geometry(tmp_path): assert_geodataframe_equal(df, expected) +@pytest.mark.requires_arrow_write_api +def test_write_None_string_column(tmp_path, use_arrow): + if use_arrow: + pytest.xfail( + "error when an object column with only None values is written with arrow." + ) + + gdf = gp.GeoDataFrame({"object_col": [None]}, geometry=[Point(0, 0)], crs=4326) + filename = tmp_path / "test.gpkg" + + write_dataframe(gdf, filename, use_arrow=use_arrow) + assert filename.exists() + + result_gdf = read_dataframe(filename, use_arrow=use_arrow) + assert result_gdf.object_col.dtype == object + assert_geodataframe_equal(result_gdf, gdf) + + @pytest.mark.parametrize("ext", [".geojsonl", ".geojsons"]) @pytest.mark.requires_arrow_write_api def test_write_read_empty_dataframe_unsupported(tmp_path, ext, use_arrow): From a5bfa291291e57fe2da9e7082e14e3b4c316b15b Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 22:39:37 +0100 Subject: [PATCH 05/10] try fix for python 3.10 tests --- pyogrio/tests/test_geopandas_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index c0bb2541..42336111 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1157,7 +1157,9 @@ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow): ) @pytest.mark.requires_arrow_write_api def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow): - expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326) + expected = gp.GeoDataFrame(geometry=[], columns=columns, crs=4326) + for column in columns: + expected[column] = expected[column].astype(dtype) filename = tmp_path / f"test{ext}" write_dataframe(expected, filename, use_arrow=use_arrow) From 46ba442005c30461f0d721f686d0d6af4d0d2626 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 22:52:06 +0100 Subject: [PATCH 06/10] take 2 --- pyogrio/tests/test_geopandas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 42336111..ae914b1c 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1166,7 +1166,7 @@ def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow): assert filename.exists() df = read_dataframe(filename, use_arrow=use_arrow) - assert_geodataframe_equal(df, expected) + assert_geodataframe_equal(df, expected, check_index_type=False) def test_write_empty_geometry(tmp_path): From eda0ff86a0d3c80a221c4a6e906e6aab9fbd87b7 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Thu, 23 Jan 2025 22:58:52 +0100 Subject: [PATCH 07/10] Update test_geopandas_io.py --- pyogrio/tests/test_geopandas_io.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index ae914b1c..ebf0319e 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1157,10 +1157,7 @@ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow): ) @pytest.mark.requires_arrow_write_api def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow): - expected = gp.GeoDataFrame(geometry=[], columns=columns, crs=4326) - for column in columns: - expected[column] = expected[column].astype(dtype) - + expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326) filename = tmp_path / f"test{ext}" write_dataframe(expected, filename, use_arrow=use_arrow) From ebb012f9ea7f1bb1c5033458f47a079b599b1606 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 26 Jan 2025 20:19:51 +0100 Subject: [PATCH 08/10] Update geopandas.py --- pyogrio/geopandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index ef8c95ec..8b97ea32 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -458,7 +458,7 @@ def write_dataframe( use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0"))) path, driver = _get_write_path_driver(path, driver, append=append) - if use_arrow and df.empty or len(df) == 0: + if use_arrow and (df.empty or len(df) == 0): # with arrow, string columns without data trigger an error, so disable arrow # when writing an empty dataframe. use_arrow = False From e04a8b1b9ffbf03d70e61582ed4f992b739606ab Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 26 Jan 2025 20:58:21 +0100 Subject: [PATCH 09/10] Fix for both empty gataframes and object columns with all None values --- pyogrio/geopandas.py | 14 +++++++++----- pyogrio/tests/test_geopandas_io.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index 8b97ea32..e6919eed 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -458,11 +458,6 @@ def write_dataframe( use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0"))) path, driver = _get_write_path_driver(path, driver, append=append) - if use_arrow and (df.empty or len(df) == 0): - # with arrow, string columns without data trigger an error, so disable arrow - # when writing an empty dataframe. - use_arrow = False - geometry_columns = df.columns[df.dtypes == "geometry"] if len(geometry_columns) > 1: raise ValueError( @@ -588,6 +583,15 @@ def write_dataframe( table = pa.Table.from_pandas(df, preserve_index=False) + # Null arrow columns are not supported by GDAL, so convert to string + for field_index, field in enumerate(table.schema): + if field.type == pa.null(): + table = table.set_column( + field_index, + field.with_type(pa.string()), + table[field_index].cast(pa.string()), + ) + if geometry_column is not None: # ensure that the geometry column is binary (for all-null geometries, # this could be a wrong type) diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index ebf0319e..73b1990d 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -1157,6 +1157,11 @@ def test_write_dataframe_index(tmp_path, naturalearth_lowres, use_arrow): ) @pytest.mark.requires_arrow_write_api def test_write_empty_dataframe(tmp_path, ext, columns, dtype, use_arrow): + """Test writing dataframe with no rows. + + With use_arrow, object type columns with no rows are converted to null type columns + by pyarrow, but null columns are not supported by GDAL. Added to test fix for #513. + """ expected = gp.GeoDataFrame(geometry=[], columns=columns, dtype=dtype, crs=4326) filename = tmp_path / f"test{ext}" write_dataframe(expected, filename, use_arrow=use_arrow) @@ -1185,11 +1190,11 @@ def test_write_empty_geometry(tmp_path): @pytest.mark.requires_arrow_write_api def test_write_None_string_column(tmp_path, use_arrow): - if use_arrow: - pytest.xfail( - "error when an object column with only None values is written with arrow." - ) + """Test pandas object columns with all None values. + With use_arrow, such columns are converted to null type columns by pyarrow, but null + columns are not supported by GDAL. Added to test fix for #513. + """ gdf = gp.GeoDataFrame({"object_col": [None]}, geometry=[Point(0, 0)], crs=4326) filename = tmp_path / "test.gpkg" From 36b95a88eb0e13bf1cf1b1de02448142d995f625 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sun, 26 Jan 2025 21:12:38 +0100 Subject: [PATCH 10/10] Update CHANGES.md --- CHANGES.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 7c9f2b48..920b1828 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,7 +10,8 @@ - Fix WKB writing on big-endian systems (#497). - Fix writing fids to e.g. GPKG file with `use_arrow` (#511). -- Fix error when an empty object column is written with `use_arrow` (#512). +- Fix error in `write_dataframe` when writing an empty or all-None object + column with `use_arrow` (#512). ### Packaging