From b837945adb10a288f17774488d92340b27c51edd Mon Sep 17 00:00:00 2001
From: Brian T
Date: Tue, 4 Feb 2025 21:40:51 +0800
Subject: [PATCH] fix: `large_list` and `large_string` unit test for
 `read_parquet_metadata` (#3089)

* Adds unit test for pyarrow large lists and strings

* Fixes typo in test func name
---
 tests/unit/test_s3_parquet.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/unit/test_s3_parquet.py b/tests/unit/test_s3_parquet.py
index b650100bc..eaf51ecb1 100644
--- a/tests/unit/test_s3_parquet.py
+++ b/tests/unit/test_s3_parquet.py
@@ -62,6 +62,27 @@ def test_read_parquet_metadata_nonexistent_file(path):
         wr.s3.read_parquet_metadata(path + "non-existent-file.parquet")
 
 
+def test_read_parquet_metadata_large_dtype(path):
+    schema = pa.schema(
+        [
+            pa.field("c0", pa.large_list(pa.large_string())),
+            pa.field("c1", pa.large_string()),
+        ]
+    )
+    c0 = pa.array([["a", "b", "c"], ["d", "e", "f"], ["g", "h", "i"]])
+    c1 = pa.array(["a", "b", "c"])
+    df = pa.table([c0, c1], schema=schema)
+
+    # use pyarrow-backed dataframe to simulate the large_list and large_string dtypes
+    pandas_df = df.to_pandas(types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype))
+
+    wr.s3.to_parquet(pandas_df, path)
+    columns_types, _ = wr.s3.read_parquet_metadata(path)
+    assert len(columns_types) == len(df.columns)
+    assert columns_types.get("c0") == "array"
+    assert columns_types.get("c1") == "string"
+
+
 @pytest.mark.parametrize(
     "partition_cols",
     [