From b837945adb10a288f17774488d92340b27c51edd Mon Sep 17 00:00:00 2001
From: Brian T
Date: Tue, 4 Feb 2025 21:40:51 +0800
Subject: [PATCH] fix: `large_list` and `large_string` unit test for
 `read_parquet_metadata` (#3089)

* Adds unit test for pyarrow large lists and strings

* Fixes typo in test func name
---
 tests/unit/test_s3_parquet.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py
index b650100bc..eaf51ecb1 100644
--- a/tests/unit/test_s3_parquet.py
+++ b/tests/unit/test_s3_parquet.py
@@ -62,6 +62,27 @@ def test_read_parquet_metadata_nonexistent_file(path):
         wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")
 
 
+def test_read_parquet_metadata_large_dtype(path):
+    schema = pa.schema(
+        [
+            pa.field("c0", pa.large_list(pa.large_string())),
+            pa.field("c1", pa.large_string()),
+        ]
+    )
+    c0 = pa.array([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]])
+    c1 = pa.array(["a", "b", "c"])
+    df = pa.table([c0, c1], schema=schema)
+
+    # use pyarrow-backed dataframe to simulate the large_list and large_string dtypes
+    pandas_df = df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype))
+
+    wr.s3.to_parquet(pandas_df, path)
+    columns_types, _ = wr.s3.read_parquet_metadata(path)
+    assert len(columns_types) == len(df.columns)
+    assert columns_types.get("c0") == "array"
+    assert columns_types.get("c1") == "string"
+
+
 @pytest.mark.parametrize(
     "partition_cols",
     [