Skip to content

Commit fd1c5b3

Browse files
authored
Safely ignore Parquet fields with unimplemented Thrift types (apache#9974)
# Which issue does this PR close? - Closes apache#9973. # Rationale for this change The thrift decoder should be able to skip fields with unimplemented thrift types `set`, `map`, and `uuid`. # What changes are included in this PR? Flesh out the thrift enums and add code to the skip function to handle the above types. # Are these changes tested? Yes, test is added # Are there any user-facing changes? No, changes are to internal APIs
1 parent 30185d6 commit fd1c5b3

3 files changed

Lines changed: 52 additions & 19 deletions

File tree

parquet/src/parquet_thrift.rs

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ pub(crate) enum FieldType {
157157
Set = 10,
158158
Map = 11,
159159
Struct = 12,
160+
Uuid = 13,
160161
}
161162

162163
impl TryFrom<u8> for FieldType {
@@ -176,25 +177,27 @@ impl TryFrom<u8> for FieldType {
176177
10 => Ok(Self::Set),
177178
11 => Ok(Self::Map),
178179
12 => Ok(Self::Struct),
180+
13 => Ok(Self::Uuid),
179181
_ => Err(ThriftProtocolError::InvalidFieldType(value)),
180182
}
181183
}
182184
}
183185

184-
impl TryFrom<ElementType> for FieldType {
185-
type Error = ThriftProtocolError;
186-
fn try_from(value: ElementType) -> std::result::Result<Self, Self::Error> {
186+
impl From<ElementType> for FieldType {
187+
fn from(value: ElementType) -> Self {
187188
match value {
188-
ElementType::Bool => Ok(Self::BooleanTrue),
189-
ElementType::Byte => Ok(Self::Byte),
190-
ElementType::I16 => Ok(Self::I16),
191-
ElementType::I32 => Ok(Self::I32),
192-
ElementType::I64 => Ok(Self::I64),
193-
ElementType::Double => Ok(Self::Double),
194-
ElementType::Binary => Ok(Self::Binary),
195-
ElementType::List => Ok(Self::List),
196-
ElementType::Struct => Ok(Self::Struct),
197-
_ => Err(ThriftProtocolError::InvalidFieldType(value as u8)),
189+
ElementType::Bool => Self::BooleanTrue,
190+
ElementType::Byte => Self::Byte,
191+
ElementType::I16 => Self::I16,
192+
ElementType::I32 => Self::I32,
193+
ElementType::I64 => Self::I64,
194+
ElementType::Double => Self::Double,
195+
ElementType::Binary => Self::Binary,
196+
ElementType::List => Self::List,
197+
ElementType::Set => Self::Set,
198+
ElementType::Map => Self::Map,
199+
ElementType::Struct => Self::Struct,
200+
ElementType::Uuid => Self::Uuid,
198201
}
199202
}
200203
}
@@ -213,6 +216,7 @@ pub(crate) enum ElementType {
213216
Set = 10,
214217
Map = 11,
215218
Struct = 12,
219+
Uuid = 13,
216220
}
217221

218222
impl TryFrom<u8> for ElementType {
@@ -235,6 +239,7 @@ impl TryFrom<u8> for ElementType {
235239
10 => Ok(Self::Set),
236240
11 => Ok(Self::Map),
237241
12 => Ok(Self::Struct),
242+
13 => Ok(Self::Uuid),
238243
_ => Err(ThriftProtocolError::InvalidElementType(value)),
239244
}
240245
}
@@ -499,27 +504,44 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
499504
FieldType::I64 => self.skip_vlq().map(|_| ()),
500505
FieldType::Double => self.skip_bytes(8).map(|_| ()),
501506
FieldType::Binary => self.skip_binary().map(|_| ()),
507+
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct
502508
FieldType::Struct => {
503-
let mut last_field_id = 0i16;
504509
loop {
505-
let field_ident = self.read_field_begin(last_field_id)?;
510+
// we don't need field id for skipping, so always pass 0 for last id
511+
let field_ident = self.read_field_begin(0)?;
506512
if field_ident.field_type == FieldType::Stop {
507513
break;
508514
}
509515
self.skip_till_depth(field_ident.field_type, depth - 1)?;
510-
last_field_id = field_ident.id;
511516
}
512517
Ok(())
513518
}
514-
FieldType::List => {
519+
// lists and sets are encoded the same
520+
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
521+
FieldType::List | FieldType::Set => {
515522
let list_ident = self.read_list_begin()?;
523+
let element_type = FieldType::from(list_ident.element_type);
516524
for _ in 0..list_ident.size {
517-
let element_type = FieldType::try_from(list_ident.element_type)?;
518525
self.skip_till_depth(element_type, depth - 1)?;
519526
}
520527
Ok(())
521528
}
522-
// no list or map types in parquet format
529+
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#map
530+
FieldType::Map => {
531+
let size = i32::try_from(self.read_vlq()?)?;
532+
if size > 0 {
533+
let kv = self.read_byte()?;
534+
let key_type = FieldType::from(ElementType::try_from(kv >> 4)?);
535+
let val_type = FieldType::from(ElementType::try_from(kv & 0xf)?);
536+
for _ in 0..size {
537+
self.skip_till_depth(key_type, depth - 1)?;
538+
self.skip_till_depth(val_type, depth - 1)?;
539+
}
540+
}
541+
Ok(())
542+
}
543+
// see https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#universal-unique-identifier-encoding
544+
FieldType::Uuid => self.skip_bytes(16).map(|_| ()),
523545
_ => Err(ThriftProtocolError::SkipUnsupportedType(field_type)),
524546
}
525547
}

parquet/tests/arrow_reader/bad_data.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use arrow::util::test_util::parquet_test_data;
2121
use bytes::Bytes;
2222
use parquet::arrow::arrow_reader::ArrowReaderBuilder;
2323
use parquet::errors::ParquetError;
24+
use parquet::file::metadata::ParquetMetaDataReader;
2425
use std::collections::HashSet;
2526
use std::path::PathBuf;
2627

@@ -175,6 +176,16 @@ fn non_standard_delta_blocks() {
175176
}
176177
}
177178

179+
#[test]
180+
fn skip_unknown_types() {
181+
// test file contains a FileMetaData with unknown fields with
182+
// types not currently used by Parquet (uuid, set, map). The
183+
// parser should be able to skip these unknown fields without
184+
// erroring.
185+
let file = Bytes::from_static(include_bytes!("new_types.bin"));
186+
ParquetMetaDataReader::decode_metadata(&file).unwrap();
187+
}
188+
178189
#[cfg(feature = "async")]
179190
#[tokio::test]
180191
#[allow(deprecated)]
56 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)