From 91b80eed6040c34586f57df1bb72070a4238b2b2 Mon Sep 17 00:00:00 2001 From: rui-mo Date: Tue, 17 Sep 2024 13:08:53 -0700 Subject: [PATCH] Fix Parquet read with an isNull filter on nested array (#10890) Summary: The failure below occurs when selecting a struct column from a Parquet file while filtering with an isNull filter on a nested array, when that filter is extracted into subfield filters in 'extractFiltersFromRemainingFilter'. ``` velox/dwio/parquet/reader/PageReader.cpp:737, Function:skip, Expression: No decoder to skip, Source: RUNTIME, ErrorCode: INVALID_STATE unknown file: Failure C++ exception with description "Exception: VeloxRuntimeError Error Source: RUNTIME Error Code: INVALID_STATE Reason: No decoder to skip ``` Pull Request resolved: https://github.com/facebookincubator/velox/pull/10890 Reviewed By: Yuhta Differential Revision: D62885150 Pulled By: kevinwilfong fbshipit-source-id: a940c092f03a6dbf234312e05306760cf3556c26 --- velox/dwio/parquet/reader/PageReader.cpp | 6 ++++++ .../tests/examples/struct_of_array.parquet | Bin 0 -> 1004 bytes .../tests/reader/ParquetTableScanTest.cpp | 18 +++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 velox/dwio/parquet/tests/examples/struct_of_array.parquet diff --git a/velox/dwio/parquet/reader/PageReader.cpp b/velox/dwio/parquet/reader/PageReader.cpp index cf46fdb58184..c185358470e6 100644 --- a/velox/dwio/parquet/reader/PageReader.cpp +++ b/velox/dwio/parquet/reader/PageReader.cpp @@ -719,8 +719,14 @@ void PageReader::skip(int64_t numRows) { } firstUnvisited_ += numRows; + if (toSkip == 0) { + return; + } // Skip nulls toSkip = skipNulls(toSkip); + if (toSkip == 0) { + return; + } // Skip the decoder if (isDictionary()) { diff --git a/velox/dwio/parquet/tests/examples/struct_of_array.parquet b/velox/dwio/parquet/tests/examples/struct_of_array.parquet new file mode 100644 index 0000000000000000000000000000000000000000..21dd6dcf883771c6ee7e443378bfd36eb1f868f8 GIT binary patch literal 1004 
zcmbVLPm9w)6rY(*V_8Z;aE6KGP=hVI5JQvn-B7!V3O{d*pns$>^SxPUu zccsZf)dmuTu-xP7GzXY1`%~isD99cfTqT^C`>be(NdN0TQZVk8D~{=V5(z&qC(-`G+h{HawL=g%6Igyf{qr5KVA#{V4PIeH&byP z*K@qePJb3ggG#$C{d&-;@A-bGC4;6H_`y)L0>9C3%c1XkuJ3ulUSrp=VYJnBJMf$i H;hp~mRA=(5 literal 0 HcmV?d00001 diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 4261ee702249..476272aac692 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -370,7 +370,6 @@ TEST_F(ParquetTableScanTest, decimalSubfieldFilter) { "Scalar function signature is not supported: eq(DECIMAL(5, 2), DECIMAL(5, 1))"); } -// Core dump is fixed. TEST_F(ParquetTableScanTest, map) { auto vector = makeMapVector({{{"name", "gluten"}}}); @@ -399,7 +398,6 @@ TEST_F(ParquetTableScanTest, nullMap) { assertSelectWithFilter({"i", "c"}, {}, "", "SELECT i, c FROM tmp"); } -// Core dump is fixed. TEST_F(ParquetTableScanTest, singleRowStruct) { auto vector = makeArrayVector({{}}); loadData( @@ -414,7 +412,6 @@ TEST_F(ParquetTableScanTest, singleRowStruct) { assertSelectWithFilter({"s"}, {}, "", "SELECT (0, 1)"); } -// Core dump and incorrect result are fixed. TEST_F(ParquetTableScanTest, array) { auto vector = makeArrayVector({}); loadData( @@ -528,6 +525,21 @@ TEST_F(ParquetTableScanTest, reqArrayLegacy) { "SELECT UNNEST(array[array['a', 'b'], array[], array['c', 'd']])"); } +TEST_F(ParquetTableScanTest, filterOnNestedArray) { + loadData( + getExampleFilePath("struct_of_array.parquet"), + ROW({"struct"}, + {ROW({"a0", "a1"}, {ARRAY(VARCHAR()), ARRAY(INTEGER())})}), + makeRowVector( + {"unused"}, + { + makeFlatVector({}), + })); + + assertSelectWithFilter( + {"struct"}, {}, "struct.a0 is null", "SELECT ROW(NULL, NULL)"); +} + TEST_F(ParquetTableScanTest, readAsLowerCase) { auto plan = PlanBuilder(pool_.get()) .tableScan(ROW({"a"}, {BIGINT()}), {}, "")