Skip to content

Commit

Permalink
Parquet reader should use parent type for nested types (facebookincub…
Browse files Browse the repository at this point in the history
…ator#7774)

Summary:
When traversing the Parquet metadata and evaluating a placeholder node (node created to represent
 "array" and "map") current implementation determines array(list) or map based on the number of children.
 But for some Parquet files, this is not necessarily accurate. A better check is to check on the parent of the placeholder node and determine the type of the parent.

Pull Request resolved: facebookincubator#7774

Reviewed By: pedroerp

Differential Revision: D56243812

Pulled By: Yuhta

fbshipit-source-id: e7112e39271021006ad02914c4d1f128d1d38093
  • Loading branch information
chliang71 authored and facebook-github-bot committed Apr 17, 2024
1 parent 841a9ba commit 720787a
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 4 deletions.
21 changes: 17 additions & 4 deletions velox/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,10 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
// value children.
if (schema[parentSchemaIdx].converted_type ==
thrift::ConvertedType::MAP) {
// TODO: the group names need to be checked. According to the spec,
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
// the name of the schema element being 'key_value' is
// also an indication of this is a map type
VELOX_CHECK_EQ(
schemaElement.repetition_type,
thrift::FieldRepetitionType::REPEATED);
Expand Down Expand Up @@ -331,10 +335,14 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
} else {
if (schemaElement.repetition_type ==
thrift::FieldRepetitionType::REPEATED) {
VELOX_CHECK_LE(
children.size(), 2, "children size should not be larger than 2");
if (children.size() == 1) {
if (schema[parentSchemaIdx].converted_type ==
thrift::ConvertedType::LIST) {
// TODO: the group names need to be checked. According to spec,
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
// the name of the schema element being 'array' is
// also an indication of this is a list type
// child of LIST
VELOX_CHECK_GE(children.size(), 1);
auto type = TypeFactory<TypeKind::ARRAY>::create(children[0]->type());
return std::make_unique<ParquetTypeWithId>(
std::move(type),
Expand All @@ -347,8 +355,13 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
maxRepeat,
maxDefine);
} else if (children.size() == 2) {
} else if (
schema[parentSchemaIdx].converted_type ==
thrift::ConvertedType::MAP ||
schema[parentSchemaIdx].converted_type ==
thrift::ConvertedType::MAP_KEY_VALUE) {
// children of MAP
VELOX_CHECK_EQ(children.size(), 2);
auto type = TypeFactory<TypeKind::MAP>::create(
children[0]->type(), children[1]->type());
return std::make_unique<ParquetTypeWithId>(
Expand Down
Binary file not shown.
Binary file not shown.
85 changes: 85 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,91 @@ TEST_F(ParquetReaderTest, parseSample) {
sampleSchema(), *rowReader, expected, *leafPool_);
}

TEST_F(ParquetReaderTest, parseUnannotatedList) {
// unannotated_list.parquet has the following the schema
// the list is defined without the middle layer
// message ParquetSchema {
// optional group self (LIST) {
// repeated group self_tuple {
// optional int64 a;
// optional boolean b;
// required binary c (STRING);
// }
// }
// }
const std::string sample(getExampleFilePath("unannotated_list.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(sample, readerOpts);

EXPECT_EQ(reader->numberOfRows(), 22ULL);

auto type = reader->typeWithId();
EXPECT_EQ(type->size(), 1ULL);
auto col0 = type->childAt(0);
EXPECT_EQ(col0->type()->kind(), TypeKind::ARRAY);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0)->name_, "self");

EXPECT_EQ(col0->size(), 3ULL);
EXPECT_EQ(col0->childAt(0)->type()->kind(), TypeKind::BIGINT);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(0))
->name_,
"a");

EXPECT_EQ(col0->childAt(1)->type()->kind(), TypeKind::BOOLEAN);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(1))
->name_,
"b");

EXPECT_EQ(col0->childAt(2)->type()->kind(), TypeKind::VARCHAR);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(2))
->name_,
"c");
}

TEST_F(ParquetReaderTest, parseUnannotatedMap) {
// unannotated_map.parquet has the following the schema
// the map is defined with a MAP_KEY_VALUE node
// message hive_schema {
// optional group test (MAP) {
// repeated group key_value (MAP_KEY_VALUE) {
// required binary key (STRING);
// optional int64 value;
// }
// }
//}
const std::string filename("unnotated_map.parquet");
const std::string sample(getExampleFilePath(filename));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
auto numRows = reader->numberOfRows();

auto type = reader->typeWithId();
EXPECT_EQ(type->size(), 1ULL);
auto col0 = type->childAt(0);
EXPECT_EQ(col0->type()->kind(), TypeKind::MAP);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0)->name_, "test");

EXPECT_EQ(col0->size(), 2ULL);
EXPECT_EQ(col0->childAt(0)->type()->kind(), TypeKind::VARCHAR);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(0))
->name_,
"key");

EXPECT_EQ(col0->childAt(1)->type()->kind(), TypeKind::BIGINT);
EXPECT_EQ(
std::static_pointer_cast<const ParquetTypeWithId>(col0->childAt(1))
->name_,
"value");
}

TEST_F(ParquetReaderTest, parseSampleRange1) {
const std::string sample(getExampleFilePath("sample.parquet"));

Expand Down

0 comments on commit 720787a

Please sign in to comment.