@@ -225,11 +225,6 @@ LogicalType ParquetReader::DeriveLogicalType(const SchemaElement &s_ele, Parquet
225225 return LogicalType::TIME_TZ;
226226 }
227227 return LogicalType::TIME;
228- } else if (s_ele.logicalType .__isset .GEOMETRY ) {
229- // TODO: Set CRS too
230- return LogicalType::GEOMETRY ();
231- } else if (s_ele.logicalType .__isset .GEOGRAPHY ) {
232- return LogicalType::GEOMETRY ();
233228 }
234229 }
235230 if (s_ele.__isset .converted_type ) {
@@ -409,6 +404,9 @@ unique_ptr<ColumnReader> ParquetReader::CreateReaderRecursive(ClientContext &con
409404 switch (schema.schema_type ) {
410405 case ParquetColumnSchemaType::FILE_ROW_NUMBER:
411406 return make_uniq<RowNumberColumnReader>(*this , schema);
407+ case ParquetColumnSchemaType::GEOMETRY: {
408+ return GeometryColumnReader::Create (*this , schema, context);
409+ }
412410 case ParquetColumnSchemaType::COLUMN: {
413411 if (schema.children .empty ()) {
414412 // leaf reader
@@ -486,11 +484,11 @@ ParquetColumnSchema::ParquetColumnSchema(string name_p, LogicalType type_p, idx_
486484 max_repeat (max_repeat), schema_index(schema_index), column_index(column_index) {
487485}
488486
489- ParquetColumnSchema::ParquetColumnSchema (ParquetColumnSchema parent , LogicalType result_type,
487+ ParquetColumnSchema::ParquetColumnSchema (ParquetColumnSchema child , LogicalType result_type,
490488 ParquetColumnSchemaType schema_type)
491- : schema_type(schema_type), name(parent .name), type(std::move(result_type)), max_define(parent .max_define),
492- max_repeat(parent .max_repeat), schema_index(parent .schema_index), column_index(parent .column_index) {
493- children.push_back (std::move (parent ));
489+ : schema_type(schema_type), name(child .name), type(std::move(result_type)), max_define(child .max_define),
490+ max_repeat(child .max_repeat), schema_index(child .schema_index), column_index(child .column_index) {
491+ children.push_back (std::move (child ));
494492}
495493
496494unique_ptr<BaseStatistics> ParquetColumnSchema::Stats (const FileMetaData &file_meta_data,
@@ -517,6 +515,32 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(const FileMetaData &file_m
517515 return ParquetStatisticsUtils::TransformColumnStatistics (*this , columns, parquet_options.can_have_nan );
518516}
519517
518+ static bool IsGeometryType (const SchemaElement &s_ele, const ParquetFileMetadataCache &metadata, idx_t depth) {
519+ const auto is_blob = s_ele.__isset .type && s_ele.type == Type::BYTE_ARRAY;
520+ if (!is_blob) {
521+ return false ;
522+ }
523+
524+ // TODO: Handle CRS in the future
525+ const auto is_native_geom = s_ele.__isset .logicalType && s_ele.logicalType .__isset .GEOMETRY ;
526+ const auto is_native_geog = s_ele.__isset .logicalType && s_ele.logicalType .__isset .GEOGRAPHY ;
527+ if (is_native_geom || is_native_geog) {
528+ return true ;
529+ }
530+
531+ // GeoParquet geometry columns must be leaf columns at the root of the schema and must be listed in the kv metadata.
532+ const auto is_at_root = depth == 1 ;
533+ const auto is_in_gpq_metadata = metadata.geo_metadata && metadata.geo_metadata ->IsGeometryColumn (s_ele.name );
534+ const auto is_leaf = s_ele.num_children == 0 ;
535+ const auto is_geoparquet_geom = is_at_root && is_in_gpq_metadata && is_leaf;
536+
537+ if (is_geoparquet_geom) {
538+ return true ;
539+ }
540+
541+ return false ;
542+ }
543+
520544ParquetColumnSchema ParquetReader::ParseSchemaRecursive (idx_t depth, idx_t max_define, idx_t max_repeat,
521545 idx_t &next_schema_idx, idx_t &next_file_idx,
522546 ClientContext &context) {
@@ -540,16 +564,26 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
540564 max_repeat++;
541565 }
542566
543- // Check for geoparquet spatial types
544- if (depth == 1 ) {
545- // geoparquet types have to be at the root of the schema, and have to be present in the kv metadata.
546- // geoarrow types, although geometry columns, are structs and have children and are handled below.
547- if (metadata->geo_metadata && metadata->geo_metadata ->IsGeometryColumn (s_ele.name ) && s_ele.num_children == 0 ) {
548- auto geom_schema = ParseColumnSchema (s_ele, max_define, max_repeat, this_idx, next_file_idx++);
549- // overwrite the derived type with GEOMETRY
550- geom_schema.type = LogicalType::GEOMETRY ();
551- return geom_schema;
552- }
567+ // Check for geometry type
568+ if (IsGeometryType (s_ele, *metadata, depth)) {
569+ // Geometries in both GeoParquet and native parquet are stored as a WKB-encoded BLOB.
570+ // Because we don't just want to validate that the WKB encoding is correct, but also transform it into
571+ // little-endian if necessary, we can't just make use of the StringColumnReader without heavily modifying it.
572+ // Therefore, we create a dedicated GEOMETRY parquet column schema type, which wraps the underlying BLOB column.
573+ // This schema type gets instantiated as an ExpressionColumnReader on top of the standard Blob/String reader,
574+ // which performs the WKB validation/transformation using the `ST_GeomFromWKB` function of DuckDB.
575+ // This enables us to also support other geometry encodings (such as GeoArrow geometries) more easily in the future.
576+
577+ // Inner BLOB schema
578+ ParquetColumnSchema blob_schema (max_define, max_repeat, this_idx, next_file_idx++,
579+ ParquetColumnSchemaType::COLUMN);
580+ blob_schema.name = s_ele.name ;
581+ blob_schema.type = LogicalType::BLOB;
582+
583+ // Wrap in geometry schema
584+ ParquetColumnSchema geom_schema (std::move (blob_schema), LogicalType::GEOMETRY (),
585+ ParquetColumnSchemaType::GEOMETRY);
586+ return geom_schema;
553587 }
554588
555589 if (s_ele.__isset .num_children && s_ele.num_children > 0 ) { // inner node
0 commit comments