fix: large_list and large_string unit test for `read_parquet_metadata` (#3089)

* Adds unit test for pyarrow large lists and strings

* Fixes typo in test func name
ashrielbrian authored Feb 4, 2025
1 parent 322ad04 commit b837945
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions tests/unit/test_s3_parquet.py
@@ -62,6 +62,27 @@ def test_read_parquet_metadata_nonexistent_file(path):
wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")


def test_read_parquet_metadata_large_dtype(path):
    schema = pa.schema(
        [
            pa.field("c0", pa.large_list(pa.large_string())),
            pa.field("c1", pa.large_string()),
        ]
    )
    c0 = pa.array([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]])
    c1 = pa.array(["a", "b", "c"])
    df = pa.table([c0, c1], schema=schema)

    # use pyarrow-backed dataframe to simulate the large_list and large_string dtypes
    pandas_df = df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype))

    wr.s3.to_parquet(pandas_df, path)
    columns_types, _ = wr.s3.read_parquet_metadata(path)
    assert len(columns_types) == len(df.columns)
    assert columns_types.get("c0") == "array<string>"
    assert columns_types.get("c1") == "string"


@pytest.mark.parametrize(
    "partition_cols",
    [
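Aside (not part of the commit): a minimal sketch of the mechanism the new test relies on. Passing pd.ArrowDtype as the types_mapper keeps the Arrow large_list/large_string types on the pandas side instead of downcasting to NumPy object columns, which is what lets the metadata path encounter the large dtypes. Column names and sample data below are illustrative only, assuming pandas >= 2.0 and pyarrow are installed.

    import pandas as pd
    import pyarrow as pa

    # Arrow schema using the 64-bit-offset "large" variants.
    schema = pa.schema(
        [
            pa.field("c0", pa.large_list(pa.large_string())),
            pa.field("c1", pa.large_string()),
        ]
    )
    table = pa.table(
        [pa.array([["a", "b"], ["c"]]), pa.array(["x", "y"])],
        schema=schema,
    )

    # types_mapper=pd.ArrowDtype preserves the Arrow types in the resulting
    # DataFrame, so the columns keep their large_list/large_string dtypes
    # instead of becoming plain object columns.
    df = table.to_pandas(types_mapper=pd.ArrowDtype)
    print(df.dtypes)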
