Skip to content

Commit 713214c

Browse files
committed
fix for logical schema
1 parent d74b8ac commit 713214c

2 files changed

Lines changed: 48 additions & 40 deletions

File tree

datafusion/datasource-parquet/src/opener.rs

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -180,16 +180,16 @@ impl FileOpener for ParquetOpener {
180180
}
181181
}
182182

183-
let ordering = reader_metadata
184-
.metadata()
185-
.file_metadata()
186-
.column_orders()
187-
.map_or_else(
188-
|| vec![ColumnOrdering::Unknown; physical_file_schema.fields().len()],
189-
|column_orders| {
190-
column_orders
191-
.iter()
192-
.map(|order| match order {
183+
// Map column ordering from physical schema to logical
184+
let ordering: Vec<ColumnOrdering> = if let Some(column_orders) =
185+
reader_metadata.metadata().file_metadata().column_orders()
186+
{
187+
logical_file_schema
188+
.fields()
189+
.iter()
190+
.map(|field| {
191+
match physical_file_schema.index_of(field.name()) {
192+
Ok(idx) => match column_orders[idx] {
193193
ColumnOrder::TYPE_DEFINED_ORDER(sort_order) => {
194194
match sort_order {
195195
SortOrder::SIGNED => ColumnOrdering::Signed,
@@ -202,10 +202,14 @@ impl FileOpener for ParquetOpener {
202202
ColumnOrdering::TotalOrder
203203
}*/
204204
ColumnOrder::UNDEFINED => ColumnOrdering::Unknown,
205-
})
206-
.collect::<Vec<_>>()
207-
},
208-
);
205+
},
206+
_ => ColumnOrdering::Unknown,
207+
}
208+
})
209+
.collect::<Vec<_>>()
210+
} else {
211+
vec![ColumnOrdering::Unknown; logical_file_schema.fields().len()]
212+
};
209213

210214
// Build predicates for this specific file
211215
let (pruning_predicate, page_pruning_predicate) = build_pruning_predicates(

datafusion/physical-optimizer/src/pruning.rs

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1511,35 +1511,39 @@ fn build_predicate_expression(
15111511

15121512
// TODO(ets): this is rather parquet specific...still wonder if this should be done
15131513
// at the datasource level
1514-
let colidx = column_index_for_expr(&left, schema)
1515-
.or_else(|| column_index_for_expr(&right, schema));
1516-
if let Some(colidx) = colidx {
1517-
let col_order = column_orderings[colidx];
1518-
1519-
// If the ColumnOrder is undefined (as opposed to unknown), we shouldn't be pruning
1520-
// since min/max are invalid.
1521-
if col_order == ColumnOrdering::Undefined {
1522-
dbg!("Cannot prune because column order is undefined");
1523-
return unhandled_hook.handle(expr);
1524-
}
15251514

1526-
// left and right should have the same type by now, so only check left
1527-
if left.data_type(schema).is_ok_and(|t| t.is_floating()) {
1528-
// By the time we've reached this code, we've narrowed down the possible expressions
1529-
// to binary expressions. Of those allowed by `build_statistics_expr`, we only need
1530-
// to worry about greater/less than expressions and not equal.
1531-
match op {
1532-
Operator::Gt
1533-
| Operator::GtEq
1534-
| Operator::Lt
1535-
| Operator::LtEq
1536-
| Operator::NotEq => {
1537-
if col_order != ColumnOrdering::TotalOrder {
1538-
dbg!("Cannot prune floating point column because NaN may be present");
1539-
return unhandled_hook.handle(expr);
1515+
// Sanity check that column orderings match schema
1516+
if column_orderings.len() == schema.fields().len() {
1517+
let colidx = column_index_for_expr(&left, schema)
1518+
.or_else(|| column_index_for_expr(&right, schema));
1519+
if let Some(colidx) = colidx {
1520+
let col_order = column_orderings[colidx];
1521+
1522+
// If the ColumnOrder is undefined (as opposed to unknown), we shouldn't be pruning
1523+
// since min/max are invalid.
1524+
if col_order == ColumnOrdering::Undefined {
1525+
dbg!("Cannot prune because column order is undefined");
1526+
return unhandled_hook.handle(expr);
1527+
}
1528+
1529+
// left and right should have the same type by now, so only check left
1530+
if left.data_type(schema).is_ok_and(|t| t.is_floating()) {
1531+
// By the time we've reached this code, we've narrowed down the possible expressions
1532+
// to binary expressions. Of those allowed by `build_statistics_expr`, we only need
1533+
// to worry about greater/less than expressions and not equal.
1534+
match op {
1535+
Operator::Gt
1536+
| Operator::GtEq
1537+
| Operator::Lt
1538+
| Operator::LtEq
1539+
| Operator::NotEq => {
1540+
if col_order != ColumnOrdering::TotalOrder {
1541+
dbg!("Cannot prune floating point column because NaN may be present");
1542+
return unhandled_hook.handle(expr);
1543+
}
15401544
}
1545+
_ => (),
15411546
}
1542-
_ => (),
15431547
}
15441548
}
15451549
}

0 commit comments

Comments
 (0)