diff --git a/src/functions/ducklake_add_data_files.cpp b/src/functions/ducklake_add_data_files.cpp index c3e5fffcb36..4224758c951 100644 --- a/src/functions/ducklake_add_data_files.cpp +++ b/src/functions/ducklake_add_data_files.cpp @@ -410,13 +410,21 @@ FROM parquet_full_metadata(%s) } if (stats_null_count_validity.RowIsValid(metadata_idx)) { - stats.has_null_count = true; - stats.null_count = stats_null_count_data[metadata_idx]; + auto null_count = stats_null_count_data[metadata_idx]; + // Guard against negative values (indicates an underflow in parquet reader) + if (null_count >= 0) { + stats.has_null_count = true; + stats.null_count = static_cast<idx_t>(null_count); + } } if (stats_num_values_validity.RowIsValid(metadata_idx)) { - stats.has_num_values = true; - stats.num_values = stats_num_values_data[metadata_idx]; + auto num_values = stats_num_values_data[metadata_idx]; + // Guard against negative values (indicates an underflow in parquet reader) + if (num_values >= 0) { + stats.has_num_values = true; + stats.num_values = static_cast<idx_t>(num_values); + } } if (total_compressed_size_validity.RowIsValid(metadata_idx)) { diff --git a/src/storage/ducklake_transaction.cpp b/src/storage/ducklake_transaction.cpp index 9e295aea5d7..6fb648319c3 100644 --- a/src/storage/ducklake_transaction.cpp +++ b/src/storage/ducklake_transaction.cpp @@ -1181,8 +1181,17 @@ DuckLakeFileInfo DuckLakeTransaction::GetNewDataFile(DuckLakeDataFile &file, Duc column_stats.column_size_bytes = to_string(stats.column_size_bytes); if (stats.has_null_count) { if (stats.has_num_values) { - column_stats.value_count = to_string(stats.num_values); - column_stats.null_count = to_string(stats.null_count); + // value_count should be the count of non-null values: num_values - null_count + // Validate that null_count doesn't exceed num_values to prevent underflow + if (stats.null_count > stats.num_values) { + // Invalid stats - null_count can't exceed total values + // This can happen with nested columns in some 
parquet files + column_stats.value_count = "NULL"; + column_stats.null_count = "NULL"; + } else { + column_stats.value_count = to_string(stats.num_values - stats.null_count); + column_stats.null_count = to_string(stats.null_count); + } } else { if (stats.null_count > file.row_count) { // Something went wrong with the stats, make them NULL diff --git a/test/sql/add_files/add_files_nested_list_struct_nulls.test b/test/sql/add_files/add_files_nested_list_struct_nulls.test new file mode 100644 index 00000000000..5db9655de04 --- /dev/null +++ b/test/sql/add_files/add_files_nested_list_struct_nulls.test @@ -0,0 +1,213 @@ +# name: test/sql/add_files/add_files_nested_list_struct_nulls.test +# description: Test that ducklake_add_data_files correctly computes value_count for nested columns +# group: [add_files] +# +# This test validates the fix for integer underflow in value_count computation. +# The fix ensures: value_count = num_values - null_count (not just num_values) +# +# Note: The original bug was triggered by PyArrow-generated parquet files with +# List(Struct(...)) columns containing NULLs at various nesting levels. +# To fully reproduce, run the Python script from the bug report and test manually. 
+ +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +test-env DATA_PATH __TEST_DIR__ + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/nested_nulls_data', METADATA_CATALOG 'metadata'); + +# ============================================================================ +# Test 1: Validate value_count = num_values - null_count with simple columns +# ============================================================================ +statement ok +CREATE TABLE ducklake.simple_test(a VARCHAR, b VARCHAR); + +# Create a parquet file with known data: +# - 5 rows total +# - Column 'a': 3 non-null values, 2 null values +# - Column 'b': 4 non-null values, 1 null value +statement ok +COPY ( + SELECT * FROM (VALUES + ('x', 'p'), + ('y', 'q'), + ('z', 'r'), + (NULL, 's'), + (NULL, NULL) + ) AS t(a, b) +) TO '${DATA_PATH}/simple_test.parquet'; + +# First, let's see what parquet reports for num_values and null_count +query IIII +SELECT + column_id, + num_values, + stats_null_count, + num_values - stats_null_count as expected_value_count +FROM parquet_metadata('${DATA_PATH}/simple_test.parquet') +ORDER BY column_id +---- +0 5 2 3 +1 5 1 4 + +statement ok +CALL ducklake_add_data_files('ducklake', 'simple_test', '${DATA_PATH}/simple_test.parquet') + +# Verify value_count equals num_values - null_count (the fix we implemented) +# Column 'a' (column_id 1): num_values=5, null_count=2 -> value_count should be 3 +# Column 'b' (column_id 2): num_values=5, null_count=1 -> value_count should be 4 +query III +SELECT column_id, value_count, null_count +FROM metadata.ducklake_file_column_stats s +JOIN metadata.ducklake_data_file f ON s.data_file_id = f.data_file_id +WHERE f.path LIKE '%simple_test%' +ORDER BY column_id +---- +1 3 2 +2 4 1 + +# Verify data can be read correctly +query II +SELECT * FROM ducklake.simple_test ORDER BY a NULLS LAST, b NULLS LAST +---- +x p +y q +z r +NULL s +NULL NULL + +# 
============================================================================ +# Test 2: Nested List(Struct) with NULLs - validates no underflow occurs +# ============================================================================ +statement ok +CREATE TABLE ducklake.nested_test( + items STRUCT( + id VARCHAR, + metadata STRUCT( + source VARCHAR, + "type" VARCHAR, + extra VARCHAR + ), + attributes STRUCT( + "name" VARCHAR, + code VARCHAR, + "value" DOUBLE + ), + tags VARCHAR[] + )[] +); + +# Create a parquet file with nested structures and various NULL patterns +statement ok +COPY ( + SELECT items FROM ( + -- NULL list + SELECT NULL::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[] as items + UNION ALL + -- Empty list + SELECT []::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[] + UNION ALL + -- List with struct containing many NULL fields + SELECT [{ + 'id': 'item1', + 'metadata': {'source': 'A', 'type': NULL, 'extra': NULL}, + 'attributes': {'name': NULL, 'code': NULL, 'value': NULL}, + 'tags': NULL + }] + UNION ALL + -- List with multiple elements + SELECT [{ + 'id': 'item2', + 'metadata': {'source': 'B', 'type': 'T1', 'extra': NULL}, + 'attributes': {'name': 'Test', 'code': NULL, 'value': NULL}, + 'tags': [] + }, { + 'id': NULL, + 'metadata': NULL, + 'attributes': NULL, + 'tags': NULL + }] + UNION ALL + -- More patterns + SELECT NULL::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[] + UNION ALL + SELECT [{ + 'id': NULL, + 'metadata': {'source': NULL, 'type': NULL, 'extra': NULL}, + 'attributes': {'name': NULL, 'code': NULL, 'value': NULL}, + 'tags': NULL + }] + ) +) TO '${DATA_PATH}/nested_test.parquet'; + +statement 
ok +CALL ducklake_add_data_files('ducklake', 'nested_test', '${DATA_PATH}/nested_test.parquet') + +# Verify data can be read +query I +SELECT COUNT(*) FROM ducklake.nested_test +---- +6 + +# Verify no stats have underflowed values (would be close to 2^64) +query I +SELECT COUNT(*) FROM metadata.ducklake_file_column_stats +WHERE value_count IS NOT NULL AND value_count > 9223372036854775807 +---- +0 + +query I +SELECT COUNT(*) FROM metadata.ducklake_file_column_stats +WHERE null_count IS NOT NULL AND null_count > 9223372036854775807 +---- +0 + +# Verify stats are reasonable (no negative values) +query I +SELECT COUNT(*) FROM metadata.ducklake_file_column_stats +WHERE value_count IS NOT NULL AND value_count < 0 +---- +0 + +# ============================================================================ +# Test 3: Larger nested dataset to stress test +# ============================================================================ +statement ok +COPY ( + SELECT items FROM ( + SELECT + CASE + WHEN (row_number() OVER ()) % 5 = 0 THEN NULL + WHEN (row_number() OVER ()) % 4 = 0 THEN []::STRUCT(id VARCHAR, metadata STRUCT(source VARCHAR, "type" VARCHAR, extra VARCHAR), attributes STRUCT("name" VARCHAR, code VARCHAR, "value" DOUBLE), tags VARCHAR[])[] + ELSE [{ + 'id': 'id_' || (row_number() OVER ())::VARCHAR, + 'metadata': {'source': 'src', 'type': NULL, 'extra': NULL}, + 'attributes': {'name': NULL, 'code': NULL, 'value': NULL}, + 'tags': NULL + }] + END as items + FROM range(100) + ) +) TO '${DATA_PATH}/nested_large.parquet'; + +statement ok +CALL ducklake_add_data_files('ducklake', 'nested_test', '${DATA_PATH}/nested_large.parquet') + +# Verify all data can be read +query I +SELECT COUNT(*) FROM ducklake.nested_test +---- +106 + +# Final check: no underflowed values anywhere +query I +SELECT COUNT(*) FROM metadata.ducklake_file_column_stats +WHERE value_count < 0 OR null_count < 0 + OR value_count > 9223372036854775807 + OR null_count > 9223372036854775807 +---- +0