Skip to content

Commit 88222ac

Browse files
authored
perf(duckdb): push down list length expressions (#8544)
Pushes DuckDB's list-length scalar function into the Vortex scan as the `list_length` expression, so lengths are computed from list offsets/sizes without materializing element values.

Pushdowns supported:

- **Projection** (`SELECT len(list)` / `length(list)` / `array_length(list)`)
- **Filter** (`WHERE array_length(list) >= k`, also `len`/`length`)

Each maps to `cast(list_length(col), i64)` — DuckDB's `len`/`array_length` return `BIGINT` while `list_length` returns `u64`.

`len`/`length` are overloaded with strings/bits, so the filter path needs the argument type to disambiguate. Added a small FFI accessor `duckdb_vx_expr_get_return_type` plus `ExpressionRef::return_type()`, and gate `len`/`length`/`array_length` on the bound child being `LIST`/`ARRAY`.

Does not currently support `array_length(expr, dim)`.

Stacked on #8495.

---------

Signed-off-by: Matt Katz <mhkatz97@gmail.com>
1 parent 3be9427 commit 88222ac

6 files changed

Lines changed: 352 additions & 0 deletions

File tree

vortex-duckdb/cpp/expr.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ extern "C" duckdb_vx_expr_class duckdb_vx_expr_get_class(duckdb_vx_expr ffi_expr
4747
return static_cast<duckdb_vx_expr_class>(expr->GetExpressionClass());
4848
}
4949

50+
/// Expose the bound return type of `ffi_expr` as a borrowed logical-type handle.
///
/// The handle is the address of the expression's own `return_type` member: no copy is made,
/// so the caller must not destroy it.
extern "C" duckdb_logical_type duckdb_vx_expr_get_return_type(duckdb_vx_expr ffi_expr) {
    D_ASSERT(ffi_expr);
    auto &bound_expr = *reinterpret_cast<Expression *>(ffi_expr);
    return reinterpret_cast<duckdb_logical_type>(&bound_expr.return_type);
}
55+
5056
extern "C" const char *duckdb_vx_expr_get_bound_column_ref_get_name(duckdb_vx_expr ffi_expr) {
5157
if (!ffi_expr) {
5258
return nullptr;

vortex-duckdb/cpp/include/expr.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,10 @@ typedef enum DUCKDB_VX_EXPR_TYPE {
213213

214214
duckdb_vx_expr_class duckdb_vx_expr_get_class(duckdb_vx_expr expr);
215215

216+
/// Return the (bound) return type of the expression. The logical type is borrowed from the
217+
/// expression and must not be freed.
218+
duckdb_logical_type duckdb_vx_expr_get_return_type(duckdb_vx_expr expr);
219+
216220
const char *duckdb_vx_expr_get_bound_column_ref_get_name(duckdb_vx_expr expr);
217221

218222
duckdb_value duckdb_vx_expr_bound_constant_get_value(duckdb_vx_expr expr);

vortex-duckdb/src/convert/expr.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use vortex::expr::get_item;
2222
use vortex::expr::is_not_null;
2323
use vortex::expr::is_null;
2424
use vortex::expr::list_contains;
25+
use vortex::expr::list_length;
2526
use vortex::expr::lit;
2627
use vortex::expr::not;
2728
use vortex::expr::or_collect;
@@ -37,6 +38,7 @@ use vortex::scalar_fn::fns::like::LikeOptions;
3738
use vortex::scalar_fn::fns::literal::Literal;
3839
use vortex::scalar_fn::fns::operators::Operator;
3940

41+
use crate::cpp::DUCKDB_TYPE;
4042
use crate::cpp::DUCKDB_VX_EXPR_TYPE;
4143
use crate::duckdb;
4244
use crate::duckdb::BoundFunction;
@@ -57,6 +59,20 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult<String> {
5759
}
5860
}
5961

62+
/// Whether the expression's return type is a `LIST` or fixed-size `ARRAY`.
63+
fn returns_a_list(expr: &duckdb::ExpressionRef) -> bool {
64+
matches!(
65+
expr.return_type().as_type_id(),
66+
DUCKDB_TYPE::DUCKDB_TYPE_LIST | DUCKDB_TYPE::DUCKDB_TYPE_ARRAY
67+
)
68+
}
69+
70+
/// Wrap `expr` in `list_length`. Since vortex `list_length` returns u64 but duckdb equivalents
71+
/// return i64, we must cast as well.
72+
fn build_list_length(expr: Expression, nullability: Nullability) -> Expression {
73+
cast(list_length(expr), DType::Primitive(PType::I64, nullability))
74+
}
75+
6076
fn try_from_bound_function(
6177
func: &BoundFunction,
6278
col_sub: Option<&Expression>,
@@ -115,6 +131,37 @@ fn try_from_bound_function(
115131
};
116132
Like.new_expr(LikeOptions::default(), [value, lit(pattern)])
117133
}
134+
"array_length" => {
135+
let children = func.children().collect::<Vec<_>>();
136+
// Only accept array_length(expr) rather than array_length(expr, dim).
137+
if children.len() != 1 {
138+
return Ok(None);
139+
}
140+
let Some(col) = try_from_expression_inner(children[0], col_sub)? else {
141+
return Ok(None);
142+
};
143+
144+
// We don't know the column's nullability here, so we set it to nullable.
145+
build_list_length(col, Nullability::Nullable)
146+
}
147+
// len/length semantics depend on the return type of underlying expr.
148+
"len" | "length" => {
149+
let children: Vec<_> = func.children().collect();
150+
vortex_ensure!(children.len() == 1);
151+
let child = children[0];
152+
153+
if returns_a_list(child) {
154+
let Some(col) = try_from_expression_inner(child, col_sub)? else {
155+
return Ok(None);
156+
};
157+
158+
// Same nullability rationale as in "array_length" branch.
159+
let list_len_expr = build_list_length(col, Nullability::Nullable);
160+
return Ok(Some(list_len_expr));
161+
} else {
162+
return Ok(None);
163+
}
164+
}
118165
_ => {
119166
debug!("bound function {}", func.scalar_function.name());
120167
return Ok(None);
@@ -137,6 +184,11 @@ pub(super) fn try_from_bound_expression_with_col_sub(
137184
try_from_expression_inner(value, Some(col_sub))
138185
}
139186

187+
fn is_supported_length_alias(func: &BoundFunction) -> bool {
188+
let children: Vec<_> = func.children().collect();
189+
children.len() == 1 && returns_a_list(children[0])
190+
}
191+
140192
// Called before pushdown_complex_filter or a table filter expression call.
141193
// As we support complex filter pushdown, Duckdb pushes expressions to Vortex.
142194
// However, it doesn't know what type of expressions we can handle. Here we list
@@ -173,6 +225,8 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool {
173225
|| name == "~~"
174226
|| name == "!~~"
175227
|| name == "strlen"
228+
|| name == "array_length"
229+
|| (matches!(name, "len" | "length") && is_supported_length_alias(&func))
176230
}
177231
ExpressionClass::BoundOperator(op) => {
178232
if !matches!(
@@ -190,6 +244,13 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool {
190244
}
191245
}
192246

247+
/// Applies `list_length` expression to a duckdb field
248+
fn list_length_on_field(field: &DuckdbField) -> Expression {
249+
let col = get_item(field.name.as_str(), root());
250+
251+
build_list_length(col, field.dtype.nullability())
252+
}
253+
193254
pub fn try_from_projection_expression(
194255
value: &duckdb::ExpressionRef,
195256
field: &DuckdbField,
@@ -208,6 +269,13 @@ pub fn try_from_projection_expression(
208269
let col = cast(col, dtype);
209270
Some(col)
210271
}
272+
"array_length" => {
273+
// Only accept array_length(expr) rather than array_length(expr, dim).
274+
(func.children().count() == 1).then(|| list_length_on_field(field))
275+
}
276+
// len/length have different semantics depending on field dtype.
277+
"len" | "length" => matches!(field.dtype, DType::List(..) | DType::FixedSizeList(..))
278+
.then(|| list_length_on_field(field)),
211279
_ => None,
212280
})
213281
}

vortex-duckdb/src/duckdb/expr.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ use std::ptr;
1010
use crate::cpp;
1111
use crate::cpp::duckdb_vx_expr_class;
1212
use crate::duckdb::DDBString;
13+
use crate::duckdb::LogicalType;
14+
use crate::duckdb::LogicalTypeRef;
1315
use crate::duckdb::ScalarFunction;
1416
use crate::duckdb::ScalarFunctionRef;
1517
use crate::duckdb::Value;
@@ -33,6 +35,11 @@ impl ExpressionRef {
3335
unsafe { cpp::duckdb_vx_expr_get_class(self.as_ptr()) }
3436
}
3537

38+
/// The return type of this expression.
39+
pub fn return_type(&self) -> &LogicalTypeRef {
40+
unsafe { LogicalType::borrow(cpp::duckdb_vx_expr_get_return_type(self.as_ptr())) }
41+
}
42+
3643
/// Match the subclass of the expression.
3744
pub fn as_class(&self) -> Option<ExpressionClass<'_>> {
3845
Some(

vortex-duckdb/src/e2e_test/vortex_scan_test.rs

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,3 +1014,101 @@ fn test_geometry() {
10141014
let area = vec.as_slice_with_len::<f64>(chunk.len().as_())[0];
10151015
assert_eq!(area, 1000.0);
10161016
}
1017+
1018+
/// Projecting `array_length(list)`, `len(list)` or `length(list)` over a variable-length list
/// column must return each row's element count. The intent is for the length computation to be
/// pushed into the Vortex scan (derived from offsets, without materializing list elements).
#[test]
fn test_vortex_scan_list_length_projection() {
    let file = RUNTIME.block_on(async {
        // Fifteen elements (10, 20, ..., 150), split by the offsets below into lists of
        // 3, 4, 1, 5 and 2 elements respectively.
        let elements = PrimitiveArray::from_iter((1..=15i32).map(|step| step * 10));
        let list_offsets = buffer![0i32, 3, 7, 8, 13, 15];
        let lists = ListArray::try_new(
            elements.into_array(),
            list_offsets.into_array(),
            Validity::AllValid,
        )
        .unwrap();

        write_single_column_vortex_file("int_list", lists).await
    });

    let conn = database_connection();
    let file_path = file.path().to_string_lossy();
    let expected: Vec<i64> = vec![3, 4, 1, 5, 2];

    // For list arguments, `len`/`length` bind to the same DuckDB function set as `array_length`.
    for func in ["array_length", "len", "length"] {
        let sql = format!("SELECT {func}(int_list) FROM '{file_path}'");
        let result = conn.query(&sql).unwrap();

        // Concatenate the single output column across every result chunk.
        let mut lengths = Vec::new();
        for chunk in result {
            let vec = chunk.get_vector(0);
            lengths.extend_from_slice(vec.as_slice_with_len::<i64>(chunk.len().as_()));
        }

        assert_eq!(lengths, expected, "{func}(int_list) mismatch");
    }
}
1058+
1059+
/// A `WHERE array_length(list) >= k` predicate should be pushed down as a complex filter and
/// keep only the rows whose list is long enough.
#[test]
fn test_vortex_scan_list_length_filter() {
    let file = RUNTIME.block_on(async {
        // Fifteen elements (10, 20, ..., 150), split by the offsets below into lists of
        // 3, 4, 1, 5 and 2 elements respectively.
        let elements = PrimitiveArray::from_iter((1..=15i32).map(|step| step * 10));
        let list_offsets = buffer![0i32, 3, 7, 8, 13, 15];
        let lists = ListArray::try_new(
            elements.into_array(),
            list_offsets.into_array(),
            Validity::AllValid,
        )
        .unwrap();

        write_single_column_vortex_file("int_list", lists).await
    });

    // Only the 4-element and 5-element lists have length >= 4, so exactly two rows match.
    let sql = "SELECT COUNT(*) FROM ? WHERE array_length(int_list) >= 4";
    let matching_rows = scan_vortex_file_single_row::<i64, i64>(file, sql, 0);
    assert_eq!(matching_rows, 2);
}
1086+
1087+
/// `array_length`/`len`/`length` over a FixedSizeList column: every row's length equals the
/// fixed list size.
#[test]
fn test_vortex_scan_fixed_size_list_length_projection() {
    let file = RUNTIME.block_on(async {
        // 24 i32 elements arranged as 6 fixed-size lists of 4 elements each.
        let elements = PrimitiveArray::from_iter(0..24i32);
        let fixed_lists =
            FixedSizeListArray::new(elements.into_array(), 4, Validity::AllValid, 6);
        write_single_column_vortex_file("int_lists", fixed_lists).await
    });

    let conn = database_connection();
    let file_path = file.path().to_string_lossy();
    let expected = vec![4i64; 6];

    for func in ["array_length", "len", "length"] {
        let sql = format!("SELECT {func}(int_lists) FROM '{file_path}'");
        let result = conn.query(&sql).unwrap();

        // Concatenate the single output column across every result chunk.
        let mut lengths = Vec::new();
        for chunk in result {
            let vec = chunk.get_vector(0);
            lengths.extend_from_slice(vec.as_slice_with_len::<i64>(chunk.len().as_()));
        }

        assert_eq!(lengths, expected, "{func}(int_lists) mismatch");
    }
}

0 commit comments

Comments
 (0)