Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions src/functions/ducklake_add_data_files.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,13 +410,21 @@ FROM parquet_full_metadata(%s)
}

if (stats_null_count_validity.RowIsValid(metadata_idx)) {
stats.has_null_count = true;
stats.null_count = stats_null_count_data[metadata_idx];
auto null_count = stats_null_count_data[metadata_idx];
// Guard against negative values (indicates an underflow in parquet reader)
if (null_count >= 0) {
stats.has_null_count = true;
stats.null_count = static_cast<idx_t>(null_count);
}
}

if (stats_num_values_validity.RowIsValid(metadata_idx)) {
stats.has_num_values = true;
stats.num_values = stats_num_values_data[metadata_idx];
auto num_values = stats_num_values_data[metadata_idx];
// Guard against negative values (indicates an underflow in parquet reader)
if (num_values >= 0) {
stats.has_num_values = true;
stats.num_values = static_cast<idx_t>(num_values);
}
}

if (total_compressed_size_validity.RowIsValid(metadata_idx)) {
Expand Down
13 changes: 11 additions & 2 deletions src/storage/ducklake_transaction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,17 @@ DuckLakeFileInfo DuckLakeTransaction::GetNewDataFile(DuckLakeDataFile &file, Duc
column_stats.column_size_bytes = to_string(stats.column_size_bytes);
if (stats.has_null_count) {
if (stats.has_num_values) {
column_stats.value_count = to_string(stats.num_values);
column_stats.null_count = to_string(stats.null_count);
// value_count should be the count of non-null values: num_values - null_count
// Validate that null_count doesn't exceed num_values to prevent underflow
if (stats.null_count > stats.num_values) {
// Invalid stats - null_count can't exceed total values
// This can happen with nested columns in some parquet files
column_stats.value_count = "NULL";
column_stats.null_count = "NULL";
} else {
column_stats.value_count = to_string(stats.num_values - stats.null_count);
column_stats.null_count = to_string(stats.null_count);
}
} else {
if (stats.null_count > file.row_count) {
// Something went wrong with the stats, make them NULL
Expand Down
213 changes: 213 additions & 0 deletions test/sql/add_files/add_files_nested_list_struct_nulls.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# name: test/sql/add_files/add_files_nested_list_struct_nulls.test
# description: Test that ducklake_add_data_files correctly computes value_count for nested columns
# group: [add_files]
#
# This test validates the fix for integer underflow in value_count computation.
# The fix ensures: value_count = num_values - null_count (not just num_values)
#
# Note: The original bug was triggered by PyArrow-generated parquet files with
# List(Struct(...)) columns containing NULLs at various nesting levels.
# To fully reproduce, run the Python script from the bug report and test manually.

# Both extensions are required: ducklake for the catalog, parquet for
# writing/reading the data files registered via ducklake_add_data_files.
require ducklake

require parquet

# Per-run unique catalog database path (UUID avoids collisions between runs).
test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db

# Directory where both the DuckLake-managed data and the externally written
# parquet files for this test are placed.
test-env DATA_PATH __TEST_DIR__

# METADATA_CATALOG 'metadata' exposes the DuckLake catalog tables
# (e.g. metadata.ducklake_file_column_stats) so the stats written by
# ducklake_add_data_files can be inspected directly below.
statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/nested_nulls_data', METADATA_CATALOG 'metadata');

# ============================================================================
# Test 1: Validate value_count = num_values - null_count with simple columns
# ============================================================================
statement ok
CREATE TABLE ducklake.simple_test(a VARCHAR, b VARCHAR);

# Create a parquet file with known data:
# - 5 rows total
# - Column 'a': 3 non-null values, 2 null values
# - Column 'b': 4 non-null values, 1 null value
statement ok
COPY (
SELECT * FROM (VALUES
('x', 'p'),
('y', 'q'),
('z', 'r'),
(NULL, 's'),
(NULL, NULL)
) AS t(a, b)
) TO '${DATA_PATH}/simple_test.parquet';

# First, let's see what parquet reports for num_values and null_count.
# In parquet, num_values counts all entries including nulls, so the
# non-null count DuckLake should store is num_values - stats_null_count.
query IIII
SELECT
column_id,
num_values,
stats_null_count,
num_values - stats_null_count as expected_value_count
FROM parquet_metadata('${DATA_PATH}/simple_test.parquet')
ORDER BY column_id
----
0 5 2 3
1 5 1 4

statement ok
CALL ducklake_add_data_files('ducklake', 'simple_test', '${DATA_PATH}/simple_test.parquet')

# Verify value_count equals num_values - null_count (the fix we implemented)
# Column 'a' (column_id 1): num_values=5, null_count=2 -> value_count should be 3
# Column 'b' (column_id 2): num_values=5, null_count=1 -> value_count should be 4
# NOTE(review): column_id here is the DuckLake catalog column id (1-based),
# not the 0-based parquet column_id queried above — hence the offset of one.
query III
SELECT column_id, value_count, null_count
FROM metadata.ducklake_file_column_stats s
JOIN metadata.ducklake_data_file f ON s.data_file_id = f.data_file_id
WHERE f.path LIKE '%simple_test%'
ORDER BY column_id
----
1 3 2
2 4 1

# Verify data can be read correctly.
# NULLS LAST on both sort keys keeps the expected row order deterministic.
query II
SELECT * FROM ducklake.simple_test ORDER BY a NULLS LAST, b NULLS LAST
----
x p
y q
z r
NULL s
NULL NULL

# ============================================================================
# Test 2: Nested List(Struct) with NULLs - validates no underflow occurs
# ============================================================================
# Nested columns are where the original underflow occurred: for List/Struct
# leaves, the parquet num_values/null_count bookkeeping differs per nesting
# level, which previously produced null_count > num_values in some files.
statement ok
CREATE TABLE ducklake.nested_test(
items STRUCT(
id VARCHAR,
metadata STRUCT(
source VARCHAR,
"type" VARCHAR,
extra VARCHAR
),
attributes STRUCT(
"name" VARCHAR,
code VARCHAR,
"value" DOUBLE
),
tags VARCHAR[]
)[]
);

# Create a parquet file with nested structures and various NULL patterns:
# NULL list, empty list, structs with NULL fields, NULL structs inside a
# list, and a fully-NULL leaf row — one branch per known-tricky shape.
statement ok
COPY (
SELECT items FROM (
-- NULL list
SELECT NULL::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[] as items
UNION ALL
-- Empty list
SELECT []::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[]
UNION ALL
-- List with struct containing many NULL fields
SELECT [{
'id': 'item1',
'metadata': {'source': 'A', 'type': NULL, 'extra': NULL},
'attributes': {'name': NULL, 'code': NULL, 'value': NULL},
'tags': NULL
}]
UNION ALL
-- List with multiple elements
SELECT [{
'id': 'item2',
'metadata': {'source': 'B', 'type': 'T1', 'extra': NULL},
'attributes': {'name': 'Test', 'code': NULL, 'value': NULL},
'tags': []
}, {
'id': NULL,
'metadata': NULL,
'attributes': NULL,
'tags': NULL
}]
UNION ALL
-- More patterns
SELECT NULL::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[]
UNION ALL
SELECT [{
'id': NULL,
'metadata': {'source': NULL, 'type': NULL, 'extra': NULL},
'attributes': {'name': NULL, 'code': NULL, 'value': NULL},
'tags': NULL
}]
)
) TO '${DATA_PATH}/nested_test.parquet';

statement ok
CALL ducklake_add_data_files('ducklake', 'nested_test', '${DATA_PATH}/nested_test.parquet')

# Verify data can be read (6 source rows: 2 NULL lists, 1 empty, 3 populated)
query I
SELECT COUNT(*) FROM ducklake.nested_test
----
6

# Verify no stats have underflowed values (would be close to 2^64)
# NOTE(review): if value_count/null_count are stored as signed BIGINT in the
# metadata catalog, this predicate can never match (BIGINT cannot exceed
# INT64_MAX = 9223372036854775807) and the real underflow symptom is the
# negative-value check below — confirm the catalog column type; if it is
# signed, these two queries are vacuous and could be dropped.
query I
SELECT COUNT(*) FROM metadata.ducklake_file_column_stats
WHERE value_count IS NOT NULL AND value_count > 9223372036854775807
----
0

query I
SELECT COUNT(*) FROM metadata.ducklake_file_column_stats
WHERE null_count IS NOT NULL AND null_count > 9223372036854775807
----
0

# Verify stats are reasonable (no negative values) — this is the check that
# catches a uint64 underflow surfacing as a negative signed value.
query I
SELECT COUNT(*) FROM metadata.ducklake_file_column_stats
WHERE value_count IS NOT NULL AND value_count < 0
----
0

# ============================================================================
# Test 3: Larger nested dataset to stress test
# ============================================================================
# 100 generated rows cycling through the three shapes from Test 2:
# every 5th row a NULL list, every remaining 4th an empty list, the rest a
# single-element list with NULL-heavy struct fields.
statement ok
COPY (
SELECT items FROM (
SELECT
CASE
WHEN (row_number() OVER ()) % 5 = 0 THEN NULL
WHEN (row_number() OVER ()) % 4 = 0 THEN []::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[]
ELSE [{
'id': 'id_' || (row_number() OVER ())::VARCHAR,
'metadata': {'source': 'src', 'type': NULL, 'extra': NULL},
'attributes': {'name': NULL, 'code': NULL, 'value': NULL},
'tags': NULL
}]
END as items
FROM range(100)
)
) TO '${DATA_PATH}/nested_large.parquet';

statement ok
CALL ducklake_add_data_files('ducklake', 'nested_test', '${DATA_PATH}/nested_large.parquet')

# Verify all data can be read (6 rows from Test 2 + 100 rows from this file)
query I
SELECT COUNT(*) FROM ducklake.nested_test
----
106

# Final check: no underflowed values anywhere
# NOTE(review): as in Test 2, the `> 9223372036854775807` arms are vacuous if
# the stats columns are signed BIGINT; the `< 0` arms carry the assertion.
query I
SELECT COUNT(*) FROM metadata.ducklake_file_column_stats
WHERE value_count < 0 OR null_count < 0
OR value_count > 9223372036854775807
OR null_count > 9223372036854775807
----
0
Loading