Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/MinIO.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
PIP_BREAK_SYSTEM_PACKAGES: 1
BUILD_EXTENSION_TEST_DEPS: full
S3_TEST_SERVER_AVAILABLE: 1

steps:
- name: Install required ubuntu packages
Expand Down
57 changes: 52 additions & 5 deletions src/storage/ducklake_delete_filter.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,53 @@
#include "storage/ducklake_delete_filter.hpp"
#include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp"
#include "duckdb/common/multi_file/multi_file_list.hpp"
#include "duckdb/common/multi_file/multi_file_reader.hpp"
#include "duckdb/common/multi_file/multi_file_states.hpp"
#include "duckdb/parser/tableref/table_function_ref.hpp"
#include "duckdb/parallel/thread_context.hpp"
#include "duckdb/main/database.hpp"

namespace duckdb {

//! FunctionInfo to pass delete file metadata to the MultiFileReader
struct DeleteFileFunctionInfo : public TableFunctionInfo {
DuckLakeFileData file_data;
};

//! Custom MultiFileReader that creates a SimpleMultiFileList with extended info
struct DeleteFileMultiFileReader : public MultiFileReader {
static unique_ptr<MultiFileReader> CreateInstance(const TableFunction &table_function) {
return make_uniq<DeleteFileMultiFileReader>(table_function);
}

explicit DeleteFileMultiFileReader(const TableFunction &table_function) {
auto &info = table_function.function_info->Cast<DeleteFileFunctionInfo>();
auto &delete_file = info.file_data;

OpenFileInfo file_info(delete_file.path);
auto extended_info = make_shared_ptr<ExtendedOpenFileInfo>();
extended_info->options["file_size"] = Value::UBIGINT(delete_file.file_size_bytes);
extended_info->options["etag"] = Value("");
extended_info->options["last_modified"] = Value::TIMESTAMP(timestamp_t(0));
Comment on lines +30 to +31
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@samansmink since you are more aware than I am of the effects of S3 files in the multifile reader, can this have any ill effects?

if (!delete_file.encryption_key.empty()) {
extended_info->options["encryption_key"] = Value::BLOB_RAW(delete_file.encryption_key);
}
file_info.extended_info = std::move(extended_info);

vector<OpenFileInfo> files;
files.push_back(std::move(file_info));
file_list = make_shared_ptr<SimpleMultiFileList>(std::move(files));
}

shared_ptr<MultiFileList> CreateFileList(ClientContext &context, const vector<string> &paths,
const FileGlobInput &options) override {
return file_list;
}

private:
shared_ptr<MultiFileList> file_list;
};

DuckLakeDeleteFilter::DuckLakeDeleteFilter() : delete_data(make_shared_ptr<DuckLakeDeleteData>()) {
}

Expand Down Expand Up @@ -52,7 +94,14 @@ vector<idx_t> DuckLakeDeleteFilter::ScanDeleteFile(ClientContext &context, const
auto &instance = DatabaseInstance::GetDatabase(context);
ExtensionLoader loader(instance, "ducklake");
auto &parquet_scan_entry = loader.GetTableFunction("parquet_scan");
auto &parquet_scan = parquet_scan_entry.functions.functions[0];
auto parquet_scan = parquet_scan_entry.functions.functions[0];

// Set up function_info with delete file metadata and custom MultiFileReader
// This allows the bind to use our file list with extended info (file_size, etag, last_modified)
auto function_info = make_shared_ptr<DeleteFileFunctionInfo>();
function_info->file_data = delete_file;
parquet_scan.function_info = std::move(function_info);
parquet_scan.get_multi_file_reader = DeleteFileMultiFileReader::CreateInstance;

// Prepare the inputs for the bind
vector<Value> children;
Expand All @@ -68,10 +117,8 @@ vector<idx_t> DuckLakeDeleteFilter::ScanDeleteFile(ClientContext &context, const
}

TableFunctionRef empty;
TableFunction dummy_table_function;
dummy_table_function.name = "DuckLakeDeleteScan";
TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr,
dummy_table_function, empty);
TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr, parquet_scan,
empty);
vector<LogicalType> return_types;
vector<string> return_names;

Expand Down
43 changes: 43 additions & 0 deletions test/sql/delete/delete_metadata.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# name: test/sql/delete/delete_metadata.test
# description: Test ducklake deletes
# group: [delete]

require httpfs

require-env S3_TEST_SERVER_AVAILABLE 1

test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db

require ducklake

require parquet

statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH 's3://mybucket')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other small nit

test-env DATA_PATH __TEST_DIR__

statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/delete_metadata}')

This test will still only be executed with minio due to the require-env but could eventually be executed in dfferent scenarios in the future

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes actually we cannot do this because of duckdb/duckdb#20396 - once the 1.4 branch updates its dependency to a more recent duckdb 1.4 tag - we'll be able to do this.


# Clean up any existing table from previous runs
statement ok
DROP TABLE IF EXISTS ducklake.test;

statement ok
CREATE TABLE ducklake.test AS SELECT i id FROM range(1000) t(i);

statement ok
DELETE FROM ducklake.test WHERE id IN (100, 200, 300);

# Verify there is 1 delete file
query I
SELECT sum(delete_file_count) FROM ducklake_table_info('ducklake') WHERE table_name = 'test';
----
1

query II
EXPLAIN ANALYZE SELECT * FROM ducklake.test
----
analyzed_plan <REGEX>:.*#HEAD: 0.*

# we can time travel to see the state of the table before deletes
query II
EXPLAIN ANALYZE SELECT * FROM ducklake.test AT (VERSION => 1)
----
analyzed_plan <REGEX>:.*#HEAD: 0.*
Loading