diff --git a/src/functions/ducklake_compaction_functions.cpp b/src/functions/ducklake_compaction_functions.cpp
index 66c64c55509..73ef92ba583 100644
--- a/src/functions/ducklake_compaction_functions.cpp
+++ b/src/functions/ducklake_compaction_functions.cpp
@@ -16,6 +16,14 @@
 #include "duckdb/planner/operator/logical_empty_result.hpp"
 #include "fmt/format.h"
 
+#include "functions/ducklake_compaction_functions.hpp"
+#include "duckdb/planner/operator/logical_order.hpp"
+#include "duckdb/planner/operator/logical_projection.hpp"
+#include "duckdb/parser/parser.hpp"
+#include "duckdb/parser/query_node/select_node.hpp"
+#include "duckdb/planner/expression_binder/order_binder.hpp"
+#include "duckdb/planner/expression_binder/select_bind_state.hpp"
+
 namespace duckdb {
 
 //===--------------------------------------------------------------------===//
@@ -91,74 +99,6 @@ string DuckLakeCompaction::GetName() const {
 	return "DUCKLAKE_COMPACTION";
 }
 
-//===--------------------------------------------------------------------===//
-// Logical Operator
-//===--------------------------------------------------------------------===//
-class DuckLakeLogicalCompaction : public LogicalExtensionOperator {
-public:
-	DuckLakeLogicalCompaction(idx_t table_index, DuckLakeTableEntry &table,
-	                          vector<DuckLakeCompactionFileEntry> source_files_p, string encryption_key_p,
-	                          optional_idx partition_id, vector<string> partition_values_p, optional_idx row_id_start,
-	                          CompactionType type)
-	    : table_index(table_index), table(table), source_files(std::move(source_files_p)),
-	      encryption_key(std::move(encryption_key_p)), partition_id(partition_id),
-	      partition_values(std::move(partition_values_p)), row_id_start(row_id_start), type(type) {
-	}
-
-	idx_t table_index;
-	DuckLakeTableEntry &table;
-	vector<DuckLakeCompactionFileEntry> source_files;
-	string encryption_key;
-	optional_idx partition_id;
-	vector<string> partition_values;
-	optional_idx row_id_start;
-	CompactionType type;
-
-public:
-	PhysicalOperator &CreatePlan(ClientContext &context, PhysicalPlanGenerator &planner) override {
-		auto &child = planner.CreatePlan(*children[0]);
-		return planner.Make<DuckLakeCompaction>(types, table, std::move(source_files), std::move(encryption_key),
-		                                        partition_id, std::move(partition_values), row_id_start, child, type);
-	}
-
-	string GetExtensionName() const override {
-		return "ducklake";
-	}
-	vector<ColumnBinding> GetColumnBindings() override {
-		vector<ColumnBinding> result;
-		result.emplace_back(table_index, 0);
-		return result;
-	}
-
-	void ResolveTypes() override {
-		types = {LogicalType::BOOLEAN};
-	}
-};
-
-//===--------------------------------------------------------------------===//
-// Compaction Command Generator
-//===--------------------------------------------------------------------===//
-class DuckLakeCompactor {
-public:
-	DuckLakeCompactor(ClientContext &context, DuckLakeCatalog &catalog, DuckLakeTransaction &transaction,
-	                  Binder &binder, TableIndex table_id, DuckLakeMergeAdjacentOptions options);
-	DuckLakeCompactor(ClientContext &context, DuckLakeCatalog &catalog, DuckLakeTransaction &transaction,
-	                  Binder &binder, TableIndex table_id, double delete_threshold);
-	void GenerateCompactions(DuckLakeTableEntry &table, vector<unique_ptr<LogicalOperator>> &compactions);
-	unique_ptr<LogicalOperator> GenerateCompactionCommand(vector<DuckLakeCompactionFileEntry> source_files);
-
-private:
-	ClientContext &context;
-	DuckLakeCatalog &catalog;
-	DuckLakeTransaction &transaction;
-	Binder &binder;
-	TableIndex table_id;
-	double delete_threshold = 0.95;
-	DuckLakeMergeAdjacentOptions options;
-
-	CompactionType type;
-};
-
 DuckLakeCompactor::DuckLakeCompactor(ClientContext &context, DuckLakeCatalog &catalog,
                                      DuckLakeTransaction &transaction, Binder &binder, TableIndex table_id,
                                      DuckLakeMergeAdjacentOptions options)
     : context(context), catalog(catalog), transaction(transaction), binder(binder), table_id(table_id),
@@ -221,6 +161,8 @@ void DuckLakeCompactor::GenerateCompactions(DuckLakeTableEntry &table,
 	filter_options.min_file_size = options.min_file_size;
 	filter_options.max_file_size = options.max_file_size;
 	filter_options.target_file_size = target_file_size;
+	// FIXME: pass in the sort_data so that the list of files is approximately sorted in the same way
+	// (sorted by the min/max metadata)
 	auto files = metadata_manager.GetFilesForCompaction(table, type, delete_threshold, snapshot, filter_options);
 
 	// iterate over the files and split into separate compaction groups
@@ -303,6 +245,91 @@ void DuckLakeCompactor::GenerateCompactions(DuckLakeTableEntry &table,
 	}
 }
 
+unique_ptr<LogicalOperator> DuckLakeCompactor::InsertSort(Binder &binder, unique_ptr<LogicalOperator> &plan,
+                                                          DuckLakeTableEntry &table,
+                                                          optional_ptr<DuckLakeSort> sort_data) {
+	auto bindings = plan->GetColumnBindings();
+
+	vector<BoundOrderByNode> orders;
+
+	vector<OrderByNode> pre_bound_orders;
+
+	for (auto &pre_bound_order : sort_data->fields) {
+		if (pre_bound_order.dialect != "duckdb") {
+			continue;
+		}
+		auto parsed_expression = Parser::ParseExpressionList(pre_bound_order.expression);
+		OrderByNode order_node(pre_bound_order.sort_direction, pre_bound_order.null_order,
+		                       std::move(parsed_expression[0]));
+		pre_bound_orders.emplace_back(std::move(order_node));
+	}
+
+	if (pre_bound_orders.empty()) {
+		// Then the sorts were not in the DuckDB dialect and we return the original plan
+		return std::move(plan);
+	}
+	auto root_get = unique_ptr_cast<LogicalOperator, LogicalGet>(std::move(plan));
+
+	// FIXME: Allow arbitrary expressions instead of just column references. (Need a dynamic bind)
+	// Build a map of the names of columns in the query plan so we can bind to them
+	case_insensitive_map_t<idx_t> alias_map;
+	auto current_columns = table.GetColumns().GetColumnNames();
+	for (idx_t col_idx = 0; col_idx < current_columns.size(); col_idx++) {
+		alias_map[current_columns[col_idx]] = col_idx;
+	}
+
+	root_get->ResolveOperatorTypes();
+	auto &root_types = root_get->types;
+
+	vector<string> unmatching_names;
+	for (auto &pre_bound_order : pre_bound_orders) {
+		auto name = pre_bound_order.expression->GetName();
+		auto order_idx_check = alias_map.find(name);
+		if (order_idx_check != alias_map.end()) {
+			auto order_idx = order_idx_check->second;
+			auto expr = make_uniq<BoundColumnRefExpression>(root_types[order_idx], bindings[order_idx], 0);
+			orders.emplace_back(pre_bound_order.type, pre_bound_order.null_order, std::move(expr));
+		} else {
+			// Then we did not find the column in the table
+			// We want to record all of the ones that we do not find and then throw a more informative error that
+			// includes all incorrect columns.
+			unmatching_names.push_back(name);
+		}
+	}
+
+	if (!unmatching_names.empty()) {
+		string error_string =
+		    "Columns in the SET SORTED BY statement were not found in the DuckLake table. Unmatched columns were: ";
+		for (auto &unmatching_name : unmatching_names) {
+			error_string += unmatching_name + ", ";
+		}
+		error_string.resize(error_string.length() - 2); // Remove trailing ", "
+		throw BinderException(error_string);
+	}
+
+	auto order = make_uniq<LogicalOrder>(std::move(orders));
+
+	order->children.push_back(std::move(root_get));
+
+	vector<unique_ptr<Expression>> cast_expressions;
+	order->ResolveOperatorTypes();
+
+	auto &types = order->types;
+	auto order_bindings = order->GetColumnBindings();
+
+	for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) {
+		auto &type = types[col_idx];
+		auto &binding = order_bindings[col_idx];
+		auto ref_expr = make_uniq<BoundColumnRefExpression>(type, binding);
+		cast_expressions.push_back(std::move(ref_expr));
+	}
+
+	auto projected = make_uniq<LogicalProjection>(binder.GenerateTableIndex(), std::move(cast_expressions));
+	projected->children.push_back(std::move(order));
+
+	return std::move(projected);
+}
+
 unique_ptr<LogicalOperator>
 DuckLakeCompactor::GenerateCompactionCommand(vector<DuckLakeCompactionFileEntry> source_files) {
 	// get the table entry at the specified snapshot
@@ -452,6 +479,26 @@ DuckLakeCompactor::GenerateCompactionCommand(vector<DuckLakeCompactionFileEntry>
 		root = DuckLakeInsert::InsertCasts(binder, root);
 	}
 
+	// If compaction should be ordered, add Order By (and projection) to logical plan
+	// Do not pull the sort setting at the time of the creation of the files being compacted,
+	// and instead pull the latest sort setting
+	// First, see if there are transaction local changes to the table
+	// Then fall back to latest snapshot if no local changes
+	auto latest_entry = transaction.GetTransactionLocalEntry(CatalogType::TABLE_ENTRY, table.schema.name, table.name);
+	if (!latest_entry) {
+		auto latest_snapshot = transaction.GetSnapshot();
+		latest_entry = catalog.GetEntryById(transaction, latest_snapshot, table_id);
+		if (!latest_entry) {
+			throw InternalException("DuckLakeCompactor: failed to find local table entry");
+		}
+	}
+	auto &latest_table = latest_entry->Cast<DuckLakeTableEntry>();
+
+	auto sort_data = latest_table.GetSortData();
+	if (sort_data) {
+		root = DuckLakeCompactor::InsertSort(binder, root, latest_table, sort_data);
+	}
+
 	// generate the LogicalCopyToFile
 	auto copy = make_uniq<LogicalCopyToFile>(std::move(copy_options.copy_function), std::move(copy_options.bind_data),
 	                                         std::move(copy_options.info));
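Taken together, the hunks above make compaction re-sort rows according to the table's latest sort setting. A minimal sketch of the intended flow, assuming a DuckLake catalog attached as ducklake and DuckLake's existing merge_adjacent_files compaction entry point (the table and column names are hypothetical):

ALTER TABLE ducklake.tbl SET SORTED BY (a);   -- record the sort key in the metadata
CALL ducklake.merge_adjacent_files();         -- compaction rewrites now emit files ordered on a
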
InternalException("DuckLakeDataFlusher: failed to find latest table entry for latest snapshot id"); + } + } + auto &latest_table = latest_entry->Cast(); + + auto sort_data = latest_table.GetSortData(); + if (sort_data) { + root = DuckLakeCompactor::InsertSort(binder, root, latest_table, sort_data); + } + // generate the LogicalCopyToFile auto copy = make_uniq(std::move(copy_options.copy_function), std::move(copy_options.bind_data), std::move(copy_options.info)); diff --git a/src/include/common/local_change.hpp b/src/include/common/local_change.hpp index 1fd07a948a7..4ba5b919e94 100644 --- a/src/include/common/local_change.hpp +++ b/src/include/common/local_change.hpp @@ -26,7 +26,8 @@ enum class LocalChangeType { ADD_COLUMN, REMOVE_COLUMN, CHANGE_COLUMN_TYPE, - SET_DEFAULT + SET_DEFAULT, + SET_SORT_KEY }; struct LocalChange { diff --git a/src/include/functions/ducklake_compaction_functions.hpp b/src/include/functions/ducklake_compaction_functions.hpp new file mode 100644 index 00000000000..ff3942601f1 --- /dev/null +++ b/src/include/functions/ducklake_compaction_functions.hpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// functions/ducklake_compaction_functions.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "functions/ducklake_table_functions.hpp" +#include "storage/ducklake_transaction.hpp" +#include "storage/ducklake_catalog.hpp" +#include "storage/ducklake_schema_entry.hpp" +#include "storage/ducklake_table_entry.hpp" +#include "storage/ducklake_insert.hpp" +#include "storage/ducklake_multi_file_reader.hpp" +#include "duckdb/planner/operator/logical_get.hpp" +#include "duckdb/planner/operator/logical_copy_to_file.hpp" +#include "duckdb/planner/operator/logical_extension_operator.hpp" +#include "duckdb/planner/operator/logical_set_operation.hpp" +#include "storage/ducklake_compaction.hpp" +#include "duckdb/common/multi_file/multi_file_function.hpp" +#include "storage/ducklake_multi_file_list.hpp" +#include "duckdb/planner/tableref/bound_at_clause.hpp" +#include "duckdb/planner/operator/logical_empty_result.hpp" + +namespace duckdb { +//===--------------------------------------------------------------------===// +// Logical Operator +//===--------------------------------------------------------------------===// +class DuckLakeLogicalCompaction : public LogicalExtensionOperator { +public: + DuckLakeLogicalCompaction(idx_t table_index, DuckLakeTableEntry &table, + vector source_files_p, string encryption_key_p, + optional_idx partition_id, vector partition_values_p, optional_idx row_id_start, + CompactionType type) + : table_index(table_index), table(table), source_files(std::move(source_files_p)), + encryption_key(std::move(encryption_key_p)), partition_id(partition_id), + partition_values(std::move(partition_values_p)), row_id_start(row_id_start), type(type) { + } + + idx_t table_index; + DuckLakeTableEntry &table; + vector source_files; + string encryption_key; + optional_idx partition_id; + vector partition_values; + optional_idx row_id_start; + CompactionType type; + +public: + PhysicalOperator &CreatePlan(ClientContext &context, PhysicalPlanGenerator &planner) override { + auto &child = planner.CreatePlan(*children[0]); + return planner.Make(types, table, std::move(source_files), std::move(encryption_key), + partition_id, std::move(partition_values), row_id_start, child, type); + } + + string GetExtensionName() const override { + return 
"ducklake"; + } + vector GetColumnBindings() override { + vector result; + result.emplace_back(table_index, 0); + return result; + } + + void ResolveTypes() override { + types = {LogicalType::BOOLEAN}; + } +}; + +//===--------------------------------------------------------------------===// +// Compaction Command Generator +//===--------------------------------------------------------------------===// +class DuckLakeCompactor { +public: + DuckLakeCompactor(ClientContext &context, DuckLakeCatalog &catalog, DuckLakeTransaction &transaction, + Binder &binder, TableIndex table_id, DuckLakeMergeAdjacentOptions options); + DuckLakeCompactor(ClientContext &context, DuckLakeCatalog &catalog, DuckLakeTransaction &transaction, + Binder &binder, TableIndex table_id, double delete_threshold); + void GenerateCompactions(DuckLakeTableEntry &table, vector> &compactions); + unique_ptr GenerateCompactionCommand(vector source_files); + static unique_ptr InsertSort(Binder &binder, unique_ptr &plan, + DuckLakeTableEntry &table, optional_ptr sort_data); + +private: + ClientContext &context; + DuckLakeCatalog &catalog; + DuckLakeTransaction &transaction; + Binder &binder; + TableIndex table_id; + double delete_threshold = 0.95; + DuckLakeMergeAdjacentOptions options; + + CompactionType type; +}; + +} // namespace duckdb diff --git a/src/include/storage/ducklake_catalog.hpp b/src/include/storage/ducklake_catalog.hpp index 98e5033f1ec..63fb7df0385 100644 --- a/src/include/storage/ducklake_catalog.hpp +++ b/src/include/storage/ducklake_catalog.hpp @@ -24,6 +24,7 @@ struct DuckLakeFileListEntry; struct DuckLakeConfigOption; struct DeleteFileMap; class LogicalGet; +struct DuckLakeSort; class DuckLakeCatalog : public Catalog { public: @@ -165,6 +166,15 @@ class DuckLakeCatalog : public Catalog { DuckLakeCatalogSet &schema); //! Return the schema for the given snapshot - loading it if it is not yet loaded DuckLakeCatalogSet &GetSchemaForSnapshot(DuckLakeTransaction &transaction, DuckLakeSnapshot snapshot); + //! Update the sort data for a table in the cached schema (used when SET_SORT_KEY commits without schema change) + void UpdateSortDataInCache(idx_t schema_version, TableIndex table_id, unique_ptr sort_data); + //! Update the table comment in the cached schema (used when SET_COMMENT commits without schema change) + void UpdateTableCommentInCache(idx_t schema_version, TableIndex table_id, const Value &new_comment); + //! Update a column comment in the cached schema (used when SET_COLUMN_COMMENT commits without schema change) + void UpdateColumnCommentInCache(idx_t schema_version, TableIndex table_id, FieldIndex field_index, + const Value &new_comment); + //! 
+	//! Update a view comment in the cached schema (used when SET_COMMENT on view commits without schema change)
+	void UpdateViewCommentInCache(idx_t schema_version, TableIndex view_id, const Value &new_comment);
 
 private:
 	void DropSchema(ClientContext &context, DropInfo &info) override;
diff --git a/src/include/storage/ducklake_metadata_info.hpp b/src/include/storage/ducklake_metadata_info.hpp
index 25a0a4c8528..7abbb5e9d1b 100644
--- a/src/include/storage/ducklake_metadata_info.hpp
+++ b/src/include/storage/ducklake_metadata_info.hpp
@@ -19,6 +19,7 @@
 #include "common/ducklake_name_map.hpp"
 #include "storage/ducklake_inlined_data.hpp"
 #include "duckdb/parser/parsed_expression.hpp"
+#include "duckdb/common/enums/order_type.hpp"
 
 namespace duckdb {
 
@@ -208,6 +209,39 @@ struct DuckLakePartitionInfo {
 	}
 };
 
+struct DuckLakeSortFieldInfo {
+	idx_t sort_key_index = 0;
+	// TODO: Validate that expression is case insensitive when stored
+	string expression;
+	string dialect;
+	OrderType sort_direction;
+	OrderByNullType null_order;
+	bool operator!=(const DuckLakeSortFieldInfo &new_field) const {
+		return expression != new_field.expression || dialect != new_field.dialect ||
+		       sort_direction != new_field.sort_direction || null_order != new_field.null_order;
+	}
+};
+
+struct DuckLakeSortInfo {
+	optional_idx id;
+	TableIndex table_id;
+	vector<DuckLakeSortFieldInfo> fields;
+	bool operator==(const DuckLakeSortInfo &new_sort) const {
+		if (table_id != new_sort.table_id || fields.size() != new_sort.fields.size()) {
+			return false;
+		}
+		for (idx_t i = 0; i < fields.size(); i++) {
+			if (fields[i] != new_sort.fields[i]) {
+				return false;
+			}
+		}
+		return true;
+	}
+	bool operator!=(vector<DuckLakeSortInfo>::const_reference value) const {
+		return !(*this == value);
+	}
+};
+
 struct DuckLakeGlobalColumnStatsInfo {
 	FieldIndex column_id;
 
@@ -295,6 +329,7 @@ struct DuckLakeCatalogInfo {
 	vector<DuckLakeViewInfo> views;
 	vector<DuckLakeMacroInfo> macros;
 	vector<DuckLakePartitionInfo> partitions;
+	vector<DuckLakeSortInfo> sorts;
 };
 
 struct DuckLakeFileData {
diff --git a/src/include/storage/ducklake_metadata_manager.hpp b/src/include/storage/ducklake_metadata_manager.hpp
index 3aa1fbe95b7..426560bff8e 100644
--- a/src/include/storage/ducklake_metadata_manager.hpp
+++ b/src/include/storage/ducklake_metadata_manager.hpp
@@ -146,6 +146,7 @@ class DuckLakeMetadataManager {
 	virtual string WriteNewViews(const vector<DuckLakeViewInfo> &new_views);
 	virtual string WriteNewPartitionKeys(DuckLakeSnapshot commit_snapshot,
 	                                     const vector<DuckLakePartitionInfo> &new_partitions);
+	virtual string WriteNewSortKeys(DuckLakeSnapshot commit_snapshot, const vector<DuckLakeSortInfo> &new_sorts);
 	virtual string WriteDroppedColumns(const vector<DuckLakeDroppedColumn> &dropped_columns);
 	virtual string WriteNewColumns(const vector<DuckLakeNewColumn> &new_columns);
 	virtual string WriteNewTags(const vector<DuckLakeTagInfo> &new_tags);
diff --git a/src/include/storage/ducklake_sort_data.hpp b/src/include/storage/ducklake_sort_data.hpp
new file mode 100644
index 00000000000..03594e5ead2
--- /dev/null
+++ b/src/include/storage/ducklake_sort_data.hpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//                         DuckDB
+//
+// storage/ducklake_sort_data.hpp
+//
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "duckdb/common/common.hpp"
+#include "common/index.hpp"
+#include "duckdb/common/enums/order_type.hpp"
+
+namespace duckdb {
+class BaseStatistics;
+
+struct DuckLakeSortField {
+	idx_t sort_key_index = 0;
+	string expression;
+	string dialect;
+	OrderType sort_direction;
+	OrderByNullType null_order;
+};
+
+struct DuckLakeSort {
+	idx_t sort_id = 0;
+	vector<DuckLakeSortField> fields;
+};
+
+} // namespace duckdb
diff --git a/src/include/storage/ducklake_table_entry.hpp b/src/include/storage/ducklake_table_entry.hpp
index ae2f9ead127..8eab0305192 100644
--- a/src/include/storage/ducklake_table_entry.hpp
+++ b/src/include/storage/ducklake_table_entry.hpp
@@ -9,8 +9,10 @@
 #pragma once
 
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
+#include "duckdb/parser/parsed_data/alter_table_info.hpp"
 #include "storage/ducklake_stats.hpp"
 #include "storage/ducklake_partition_data.hpp"
+#include "storage/ducklake_sort_data.hpp"
 #include "common/index.hpp"
 #include "storage/ducklake_field_data.hpp"
 #include "common/local_change.hpp"
@@ -53,6 +55,9 @@ class DuckLakeTableEntry : public TableCatalogEntry {
 	optional_ptr<DuckLakePartitionData> GetPartitionData() {
 		return partition_data.get();
 	}
+	optional_ptr<DuckLakeSort> GetSortData() {
+		return sort_data.get();
+	}
 	DuckLakeFieldData &GetFieldData() {
 		return *field_data;
 	}
@@ -72,6 +77,8 @@ class DuckLakeTableEntry : public TableCatalogEntry {
 	//! Returns the field id of a column by a field index
 	optional_ptr<const DuckLakeFieldId> GetFieldId(FieldIndex field_index) const;
 	void SetPartitionData(unique_ptr<DuckLakePartitionData> partition_data);
+	void SetSortData(unique_ptr<DuckLakeSort> sort_data);
+	void SetColumnComment(FieldIndex field_index, const Value &new_comment);
 
 	optional_ptr<DuckLakeTableStats> GetTableStats(ClientContext &context);
 	optional_ptr<DuckLakeTableStats> GetTableStats(DuckLakeTransaction &transaction);
@@ -121,6 +128,7 @@ class DuckLakeTableEntry : public TableCatalogEntry {
 	unique_ptr<CatalogEntry> AlterTable(DuckLakeTransaction &transaction, RemoveFieldInfo &info);
 	unique_ptr<CatalogEntry> AlterTable(DuckLakeTransaction &transaction, RenameFieldInfo &info);
 	unique_ptr<CatalogEntry> AlterTable(DuckLakeTransaction &transaction, SetDefaultInfo &info);
+	unique_ptr<CatalogEntry> AlterTable(DuckLakeTransaction &transaction, SetSortedByInfo &info);
 
 	unique_ptr<DuckLakeFieldId> GetNestedEvolution(const DuckLakeFieldId &source_id, const LogicalType &target,
 	                                               ColumnChangeInfo &result, optional_idx parent_idx);
@@ -145,6 +153,8 @@ class DuckLakeTableEntry : public TableCatalogEntry {
 	                   unique_ptr<ColumnChangeInfo> changed_fields, shared_ptr<DuckLakeFieldData> new_field_data);
 	// ! Create a DuckLakeTableEntry from a SET PARTITION KEY
 	DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableInfo &info, unique_ptr<DuckLakePartitionData> partition_data);
+	// ! Create a DuckLakeTableEntry from a SET SORT KEY
+	DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableInfo &info, unique_ptr<DuckLakeSort> sort_data);
 
 private:
 	TableIndex table_id;
@@ -155,6 +165,7 @@ class DuckLakeTableEntry : public TableCatalogEntry {
 	vector<DuckLakeInlinedTableInfo> inlined_data_tables;
 	LocalChange local_change;
 	unique_ptr<DuckLakePartitionData> partition_data;
+	unique_ptr<DuckLakeSort> sort_data;
 	// only set for REMOVED_COLUMN
 	unique_ptr<ColumnChangeInfo> changed_fields;
 };
diff --git a/src/include/storage/ducklake_transaction.hpp b/src/include/storage/ducklake_transaction.hpp
index c326157d598..a844d183b3a 100644
--- a/src/include/storage/ducklake_transaction.hpp
+++ b/src/include/storage/ducklake_transaction.hpp
@@ -10,6 +10,7 @@
 #include "ducklake_macro_entry.hpp"
 #include "common/ducklake_data_file.hpp"
+#include "common/local_change.hpp"
 #include "common/ducklake_snapshot.hpp"
 #include "duckdb/common/case_insensitive_map.hpp"
 #include "duckdb/common/types/value_map.hpp"
@@ -91,6 +92,7 @@ class DuckLakeTransaction : public Transaction, public enable_shared_from_this<D
 	void AlterEntryInternal(DuckLakeViewEntry &view, unique_ptr<CatalogEntry> new_entry);
 	DuckLakeCatalogSet &GetOrCreateTransactionLocalEntries(CatalogEntry &entry);
+	DuckLakeCatalogSet &GetOrCreateTransactionLocalEntriesAlter(CatalogEntry &entry, LocalChangeType change_type);
 	optional_ptr<DuckLakeCatalogSet> GetTransactionLocalSchemas();
 	optional_ptr<DuckLakeCatalogSet> GetTransactionLocalEntries(CatalogType type, const string &schema_name);
 	optional_ptr<CatalogEntry> GetTransactionLocalEntry(CatalogType catalog_type, const string &schema_name,
@@ -184,6 +186,7 @@ class DuckLakeTransaction : public Transaction, public enable_shared_from_this<D
 	case_insensitive_map_t<unique_ptr<DuckLakeCatalogSet>> new_tables;
+	//! Tables altered with non-schema-changing operations
+	case_insensitive_map_t<unique_ptr<DuckLakeCatalogSet>> altered_tables_same_schema;
 	set<TableIndex> dropped_tables;
 
 	//! New macros added by this transaction
diff --git a/src/storage/ducklake_catalog.cpp b/src/storage/ducklake_catalog.cpp
index 37cd4b04af4..65692ff73a3 100644
--- a/src/storage/ducklake_catalog.cpp
+++ b/src/storage/ducklake_catalog.cpp
@@ -1,6 +1,7 @@
 #include "storage/ducklake_catalog.hpp"
 
 #include "common/ducklake_types.hpp"
+#include "storage/ducklake_sort_data.hpp"
 #include "duckdb/catalog/catalog_entry/macro_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
@@ -190,6 +191,66 @@ DuckLakeCatalogSet &DuckLakeCatalog::GetSchemaForSnapshot(DuckLakeTransaction &t
 	return result;
 }
 
+void DuckLakeCatalog::UpdateSortDataInCache(idx_t schema_version, TableIndex table_id,
+                                            unique_ptr<DuckLakeSort> sort_data) {
+	lock_guard<mutex> guard(schemas_lock);
+	auto entry = schemas.find(schema_version);
+	if (entry == schemas.end()) {
+		// Schema not cached yet, nothing to update
+		return;
+	}
+	auto table_entry = entry->second->GetEntryById(table_id);
+	if (!table_entry || table_entry->type != CatalogType::TABLE_ENTRY) {
+		return;
+	}
+	auto &table = table_entry->Cast<DuckLakeTableEntry>();
+	table.SetSortData(std::move(sort_data));
+}
+
+void DuckLakeCatalog::UpdateTableCommentInCache(idx_t schema_version, TableIndex table_id, const Value &new_comment) {
+	lock_guard<mutex> guard(schemas_lock);
+	auto entry = schemas.find(schema_version);
+	if (entry == schemas.end()) {
+		// Schema not cached yet, nothing to update
+		return;
+	}
+	auto table_entry = entry->second->GetEntryById(table_id);
+	if (!table_entry || table_entry->type != CatalogType::TABLE_ENTRY) {
+		return;
+	}
+	table_entry->comment = new_comment;
+}
+
+void DuckLakeCatalog::UpdateColumnCommentInCache(idx_t schema_version, TableIndex table_id, FieldIndex field_index,
+                                                 const Value &new_comment) {
+	lock_guard<mutex> guard(schemas_lock);
+	auto entry = schemas.find(schema_version);
+	if (entry == schemas.end()) {
+		// Schema not cached yet, nothing to update
+		return;
+	}
+	auto table_entry = entry->second->GetEntryById(table_id);
+	if (!table_entry || table_entry->type != CatalogType::TABLE_ENTRY) {
+		return;
+	}
+	auto &table = table_entry->Cast<DuckLakeTableEntry>();
+	table.SetColumnComment(field_index, new_comment);
+}
+
+void DuckLakeCatalog::UpdateViewCommentInCache(idx_t schema_version, TableIndex view_id, const Value &new_comment) {
+	lock_guard<mutex> guard(schemas_lock);
+	auto entry = schemas.find(schema_version);
+	if (entry == schemas.end()) {
+		// Schema not cached yet, nothing to update
+		return;
+	}
+	auto view_entry = entry->second->GetEntryById(view_id);
+	if (!view_entry || view_entry->type != CatalogType::VIEW_ENTRY) {
+		return;
+	}
+	view_entry->comment = new_comment;
+}
+
 static unique_ptr<DuckLakeFieldId> TransformColumnType(DuckLakeColumnInfo &col) {
 	DuckLakeColumnData col_data;
 	col_data.id = col.id;
@@ -452,6 +513,29 @@ unique_ptr<DuckLakeCatalogSet> DuckLakeCatalog::LoadSchemaForSnapshot(DuckLakeTr
 		auto &ducklake_table = table->Cast<DuckLakeTableEntry>();
 		ducklake_table.SetPartitionData(std::move(partition));
 	}
+
+	// load the sort entries
+	for (auto &entry : catalog.sorts) {
+		auto table = schema_set->GetEntryById(entry.table_id);
+		if (!table || table->type != CatalogType::TABLE_ENTRY) {
+			throw InvalidInputException("Could not find matching table for sort entry");
+		}
+		auto sort = make_uniq<DuckLakeSort>();
+		sort->sort_id = entry.id.GetIndex();
+		for (auto &field : entry.fields) {
+			DuckLakeSortField sort_field;
+			sort_field.sort_key_index = field.sort_key_index;
+			sort_field.expression = field.expression;
+			sort_field.dialect = field.dialect;
+			sort_field.sort_direction = field.sort_direction;
+			sort_field.null_order = field.null_order;
+
+			sort->fields.push_back(sort_field);
+		}
+		auto &ducklake_table = table->Cast<DuckLakeTableEntry>();
+		ducklake_table.SetSortData(std::move(sort));
+	}
+
 	return schema_set;
 }
diff --git a/src/storage/ducklake_catalog_set.cpp b/src/storage/ducklake_catalog_set.cpp
index 851c128d726..53a981f933e 100644
--- a/src/storage/ducklake_catalog_set.cpp
+++ b/src/storage/ducklake_catalog_set.cpp
@@ -53,7 +53,9 @@ optional_ptr<CatalogEntry> DuckLakeCatalogSet::GetEntryById(TableIndex index) {
 	if (entry == table_entry_map.end()) {
 		return nullptr;
 	}
-	D_ASSERT(entry->second.get().type == CatalogType::TABLE_ENTRY);
+	// Both tables and views are stored in the table_entry_map
+	D_ASSERT(entry->second.get().type == CatalogType::TABLE_ENTRY ||
+	         entry->second.get().type == CatalogType::VIEW_ENTRY);
 	return entry->second.get();
 }
diff --git a/src/storage/ducklake_metadata_manager.cpp b/src/storage/ducklake_metadata_manager.cpp
index 2e11cb36c48..bdf98cacff2 100644
--- a/src/storage/ducklake_metadata_manager.cpp
+++ b/src/storage/ducklake_metadata_manager.cpp
@@ -88,6 +88,8 @@ CREATE TABLE {METADATA_CATALOG}.ducklake_schema_versions(begin_snapshot BIGINT,
 CREATE TABLE {METADATA_CATALOG}.ducklake_macro(schema_id BIGINT, macro_id BIGINT, macro_name VARCHAR, begin_snapshot BIGINT, end_snapshot BIGINT);
 CREATE TABLE {METADATA_CATALOG}.ducklake_macro_impl(macro_id BIGINT, impl_id BIGINT, dialect VARCHAR, sql VARCHAR, type VARCHAR);
 CREATE TABLE {METADATA_CATALOG}.ducklake_macro_parameters(macro_id BIGINT, impl_id BIGINT,column_id BIGINT, parameter_name VARCHAR, parameter_type VARCHAR, default_value VARCHAR, default_value_type VARCHAR);
+CREATE TABLE {METADATA_CATALOG}.ducklake_sort_info(sort_id BIGINT, table_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT);
+CREATE TABLE {METADATA_CATALOG}.ducklake_sort_expression(sort_id BIGINT, table_id BIGINT, sort_key_index BIGINT, expression VARCHAR, dialect VARCHAR, sort_direction VARCHAR, null_order VARCHAR);
 INSERT INTO {METADATA_CATALOG}.ducklake_snapshot VALUES (0, NOW(), 0, 1, 0);
 INSERT INTO {METADATA_CATALOG}.ducklake_snapshot_changes VALUES (0, 'created_schema:"main"', NULL, NULL, NULL);
 INSERT INTO {METADATA_CATALOG}.ducklake_metadata (key, value) VALUES ('version', '0.4-dev1'), ('created_by', 'DuckDB %s'), ('data_path', %s), ('encrypted', '%s');
@@ -166,6 +168,8 @@ CREATE TABLE {IF_NOT_EXISTS} {METADATA_CATALOG}.ducklake_macro_impl(macro_id BIG
 CREATE TABLE {IF_NOT_EXISTS} {METADATA_CATALOG}.ducklake_macro_parameters(macro_id BIGINT, impl_id BIGINT,column_id BIGINT, parameter_name VARCHAR, parameter_type VARCHAR, default_value VARCHAR, default_value_type VARCHAR);
 ALTER TABLE {METADATA_CATALOG}.ducklake_column ADD COLUMN {IF_NOT_EXISTS} default_value_type VARCHAR DEFAULT 'literal';
 ALTER TABLE {METADATA_CATALOG}.ducklake_column ADD COLUMN {IF_NOT_EXISTS} default_value_dialect VARCHAR DEFAULT NULL;
+CREATE TABLE {IF_NOT_EXISTS} {METADATA_CATALOG}.ducklake_sort_info(sort_id BIGINT, table_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT);
+CREATE TABLE {IF_NOT_EXISTS} {METADATA_CATALOG}.ducklake_sort_expression(sort_id BIGINT, table_id BIGINT, sort_key_index BIGINT, expression VARCHAR, dialect VARCHAR, sort_direction VARCHAR, null_order VARCHAR);
 ALTER TABLE {METADATA_CATALOG}.ducklake_schema_versions ADD COLUMN {IF_NOT_EXISTS} table_id BIGINT;
 UPDATE {METADATA_CATALOG}.ducklake_metadata SET value = '0.4-dev1' WHERE key = 'version';
 )";
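For reference, a sort key of (a DESC) on a single table would be represented in the two new tables roughly as follows (all ids and the snapshot value are illustrative):

-- ducklake_sort_info
-- sort_id | table_id | begin_snapshot | end_snapshot
--       5 |        1 |              3 |         NULL
-- ducklake_sort_expression
-- sort_id | table_id | sort_key_index | expression | dialect | sort_direction | null_order
--       5 |        1 |              0 | a          | duckdb  | DESC           | NULLS_LAST
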
@@ -560,6 +564,46 @@ ORDER BY part.table_id, partition_id, partition_key_index
 		partition_field.transform = row.GetValue<string>(4);
 		partition_entry.fields.push_back(std::move(partition_field));
 	}
+
+	// load sort information
+	result = transaction.Query(snapshot, R"(
+SELECT sort.sort_id, sort.table_id, sort_expr.sort_key_index, sort_expr.expression, sort_expr.dialect, sort_expr.sort_direction, sort_expr.null_order
+FROM {METADATA_CATALOG}.ducklake_sort_info sort
+JOIN {METADATA_CATALOG}.ducklake_sort_expression sort_expr USING (sort_id)
+WHERE {SNAPSHOT_ID} >= sort.begin_snapshot AND ({SNAPSHOT_ID} < sort.end_snapshot OR sort.end_snapshot IS NULL)
+ORDER BY sort.table_id, sort.sort_id, sort_expr.sort_key_index
+)");
+	if (result->HasError()) {
+		result->GetErrorObject().Throw("Failed to get sort information from DuckLake: ");
+	}
+	auto &sorts = catalog.sorts;
+	for (auto &row : *result) {
+		auto sort_id = row.GetValue<idx_t>(0);
+		auto table_id = TableIndex(row.GetValue<idx_t>(1));
+
+		if (sorts.empty() || sorts.back().table_id != table_id) {
+			DuckLakeSortInfo sort_info;
+			sort_info.id = sort_id;
+			sort_info.table_id = table_id;
+			sorts.push_back(std::move(sort_info));
+		}
+		auto &sort_entry = sorts.back();
+
+		DuckLakeSortFieldInfo sort_field;
+		sort_field.sort_key_index = row.GetValue<idx_t>(2);
+		sort_field.expression = row.GetValue<string>(3);
+		sort_field.dialect = row.GetValue<string>(4);
+
+		auto sort_direction_str = row.GetValue<string>(5);
+		sort_field.sort_direction =
+		    (StringUtil::CIEquals(sort_direction_str, "DESC") ? OrderType::DESCENDING : OrderType::ASCENDING);
+
+		auto null_order_str = row.GetValue<string>(6);
+		sort_field.null_order = (StringUtil::CIEquals(null_order_str, "NULLS_FIRST") ? OrderByNullType::NULLS_FIRST
+		                                                                             : OrderByNullType::NULLS_LAST);
+		sort_entry.fields.push_back(std::move(sort_field));
+	}
+
 	return catalog;
 }
@@ -1534,6 +1578,7 @@ string DuckLakeMetadataManager::DropTables(const set<TableIndex> &ids, bool rena
 		batch_query += FlushDrop("ducklake_data_file", "table_id", ids);
 		batch_query += FlushDrop("ducklake_delete_file", "table_id", ids);
 		batch_query += FlushDrop("ducklake_tag", "object_id", ids);
+		batch_query += FlushDrop("ducklake_sort_info", "table_id", ids);
 	}
 	return batch_query;
 }
@@ -2724,6 +2769,117 @@ WHERE table_id IN (%s) AND end_snapshot IS NULL
 	return batch_query;
 }
 
+void CheckTableSortEqual(const vector<DuckLakeSortInfo> &old_sorts,
+                         unordered_map<idx_t, DuckLakeSortInfo> &new_sort_map) {
+	for (auto &sort : old_sorts) {
+		if (new_sort_map.find(sort.table_id.index) != new_sort_map.end()) {
+			if (new_sort_map[sort.table_id.index] == sort) {
+				// If a new sort is identical to an existing old sort, it's a no-op, we can remove it
+				new_sort_map.erase(sort.table_id.index);
+			}
+		}
+	}
+}
+
+void CheckTableSortReset(const unordered_set<idx_t> &old_sort_set, const vector<DuckLakeSortInfo> &new_sorts,
+                         unordered_map<idx_t, DuckLakeSortInfo> &new_sort_map) {
+	vector<idx_t> sort_ids_to_erase;
+	for (auto &sort : new_sorts) {
+		if (old_sort_set.find(sort.table_id.index) == old_sort_set.end() && sort.fields.empty()) {
+			// If the table has no old sort entry and the new sort has no fields, this is a reset over
+			// an empty sort definition, hence also a no-op
+			sort_ids_to_erase.push_back(sort.table_id.index);
+		}
+	}
+	for (auto &id : sort_ids_to_erase) {
+		new_sort_map.erase(id);
+	}
+}
+
+static unordered_map<idx_t, DuckLakeSortInfo> GetNewSorts(const vector<DuckLakeSortInfo> &old_sorts,
+                                                          const vector<DuckLakeSortInfo> &new_sorts) {
+	unordered_map<idx_t, DuckLakeSortInfo> new_sort_map;
+	for (auto &sort : new_sorts) {
+		new_sort_map[sort.table_id.index] = sort;
+	}
+	unordered_set<idx_t> old_sort_set;
+	for (auto &sort : old_sorts) {
+		old_sort_set.insert(sort.table_id.index);
+	}
+	CheckTableSortEqual(old_sorts, new_sort_map);
+	CheckTableSortReset(old_sort_set, new_sorts, new_sort_map);
+
+	return new_sort_map;
+}
+
+string DuckLakeMetadataManager::WriteNewSortKeys(DuckLakeSnapshot commit_snapshot,
+                                                 const vector<DuckLakeSortInfo> &new_sorts) {
+	if (new_sorts.empty()) {
+		return {};
+	}
+	auto catalog = GetCatalogForSnapshot(commit_snapshot);
+
+	string old_sort_table_ids;
+	string new_sort_values;
+	string new_sort_expressions;
+
+	// Do not update if they are the same
+	auto new_sort_map = GetNewSorts(catalog.sorts, new_sorts);
+	if (new_sort_map.empty()) {
+		return {};
+	}
+	for (auto &new_sort : new_sort_map) {
+		// set the old sort data as no longer valid
+		if (!old_sort_table_ids.empty()) {
+			old_sort_table_ids += ", ";
+		}
+		old_sort_table_ids += to_string(new_sort.second.table_id.index);
+
+		if (!new_sort.second.id.IsValid()) {
+			// dropping sort data - skip adding new values but continue to set end_snapshot on old sort
+			continue;
+		}
+		auto sort_id = new_sort.second.id.GetIndex();
+
+		if (!new_sort_values.empty()) {
+			new_sort_values += ", ";
+		}
+		new_sort_values +=
+		    StringUtil::Format(R"((%d, %d, {SNAPSHOT_ID}, NULL))", sort_id, new_sort.second.table_id.index);
+
+		for (auto &field : new_sort.second.fields) {
+			if (!new_sort_expressions.empty()) {
+				new_sort_expressions += ", ";
+			}
+			string sort_direction = (field.sort_direction == OrderType::DESCENDING ? "DESC" : "ASC");
"NULLS_FIRST" : "NULLS_LAST"); + new_sort_expressions += + StringUtil::Format("(%d, %d, %d, %s, %s, %s, %s)", sort_id, new_sort.second.table_id.index, + field.sort_key_index, SQLString(field.expression), SQLString(field.dialect), + SQLString(sort_direction), SQLString(null_order)); + } + } + // update old sort information for any tables that have been altered + auto update_sort_query = StringUtil::Format(R"( +UPDATE {METADATA_CATALOG}.ducklake_sort_info +SET end_snapshot = {SNAPSHOT_ID} +WHERE table_id IN (%s) AND end_snapshot IS NULL +;)", + old_sort_table_ids); + string batch_query = update_sort_query; + if (!new_sort_values.empty()) { + new_sort_values = "INSERT INTO {METADATA_CATALOG}.ducklake_sort_info VALUES " + new_sort_values + ";"; + batch_query += new_sort_values; + } + if (!new_sort_expressions.empty()) { + new_sort_expressions = + "INSERT INTO {METADATA_CATALOG}.ducklake_sort_expression VALUES " + new_sort_expressions + ";"; + batch_query += new_sort_expressions; + } + + return batch_query; +} + string DuckLakeMetadataManager::WriteNewTags(const vector &new_tags) { if (new_tags.empty()) { return {}; @@ -3336,7 +3492,7 @@ VALUES %s; if (!deleted_table_ids.empty()) { tables_to_delete_from = {"ducklake_table", "ducklake_table_stats", "ducklake_table_column_stats", "ducklake_partition_info", "ducklake_partition_column", "ducklake_column", - "ducklake_column_tag"}; + "ducklake_column_tag", "ducklake_sort_info", "ducklake_sort_expression"}; for (auto &delete_tbl : tables_to_delete_from) { auto result = transaction.Query(StringUtil::Format(R"( DELETE FROM {METADATA_CATALOG}.%s diff --git a/src/storage/ducklake_table_entry.cpp b/src/storage/ducklake_table_entry.cpp index 5c6ae264167..97ad06bf41b 100644 --- a/src/storage/ducklake_table_entry.cpp +++ b/src/storage/ducklake_table_entry.cpp @@ -69,6 +69,9 @@ DuckLakeTableEntry::DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableIn if (parent.partition_data) { partition_data = make_uniq(*parent.partition_data); } + if (parent.sort_data) { + sort_data = make_uniq(*parent.sort_data); + } CheckSupportedTypes(); if (local_change.type == LocalChangeType::ADD_COLUMN) { LogicalIndex new_col_idx(columns.LogicalColumnCount() - 1); @@ -131,6 +134,13 @@ DuckLakeTableEntry::DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableIn partition_data = std::move(partition_data_p); } +// ALTER TABLE SET SORT KEY +DuckLakeTableEntry::DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableInfo &info, + unique_ptr sort_data_p) + : DuckLakeTableEntry(parent, info, LocalChangeType::SET_SORT_KEY) { + sort_data = std::move(sort_data_p); +} + const DuckLakeFieldId &DuckLakeTableEntry::GetFieldId(PhysicalIndex column_index) const { return field_data->GetByRootIndex(column_index); } @@ -306,6 +316,19 @@ void DuckLakeTableEntry::SetPartitionData(unique_ptr partitio partition_data = std::move(partition_data_p); } +void DuckLakeTableEntry::SetSortData(unique_ptr sort_data_p) { + sort_data = std::move(sort_data_p); +} + +void DuckLakeTableEntry::SetColumnComment(FieldIndex field_index, const Value &new_comment) { + auto field_id = GetFieldId(field_index); + if (!field_id) { + return; + } + auto &col = columns.GetColumnMutable(field_id->Name()); + col.SetComment(new_comment); +} + const string &DuckLakeTableEntry::DataPath() const { return data_path; } @@ -343,6 +366,26 @@ string GetPartitionColumnName(ColumnRefExpression &colref) { return colref.GetColumnName(); } +string GetSortColumnName(DuckLakeTableEntry &table, ParsedExpression &expr) { + // Only allow 
@@ -3336,7 +3492,7 @@ VALUES %s;
 	if (!deleted_table_ids.empty()) {
 		tables_to_delete_from = {"ducklake_table", "ducklake_table_stats", "ducklake_table_column_stats",
 		                         "ducklake_partition_info", "ducklake_partition_column", "ducklake_column",
-		                         "ducklake_column_tag"};
+		                         "ducklake_column_tag", "ducklake_sort_info", "ducklake_sort_expression"};
 		for (auto &delete_tbl : tables_to_delete_from) {
 			auto result = transaction.Query(StringUtil::Format(R"(
DELETE FROM {METADATA_CATALOG}.%s
diff --git a/src/storage/ducklake_table_entry.cpp b/src/storage/ducklake_table_entry.cpp
index 5c6ae264167..97ad06bf41b 100644
--- a/src/storage/ducklake_table_entry.cpp
+++ b/src/storage/ducklake_table_entry.cpp
@@ -69,6 +69,9 @@ DuckLakeTableEntry::DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableIn
 	if (parent.partition_data) {
 		partition_data = make_uniq<DuckLakePartitionData>(*parent.partition_data);
 	}
+	if (parent.sort_data) {
+		sort_data = make_uniq<DuckLakeSort>(*parent.sort_data);
+	}
 	CheckSupportedTypes();
 	if (local_change.type == LocalChangeType::ADD_COLUMN) {
 		LogicalIndex new_col_idx(columns.LogicalColumnCount() - 1);
@@ -131,6 +134,13 @@ DuckLakeTableEntry::DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableIn
 	partition_data = std::move(partition_data_p);
 }
 
+// ALTER TABLE SET SORT KEY
+DuckLakeTableEntry::DuckLakeTableEntry(DuckLakeTableEntry &parent, CreateTableInfo &info,
+                                       unique_ptr<DuckLakeSort> sort_data_p)
+    : DuckLakeTableEntry(parent, info, LocalChangeType::SET_SORT_KEY) {
+	sort_data = std::move(sort_data_p);
+}
+
 const DuckLakeFieldId &DuckLakeTableEntry::GetFieldId(PhysicalIndex column_index) const {
 	return field_data->GetByRootIndex(column_index);
 }
@@ -306,6 +316,19 @@ void DuckLakeTableEntry::SetPartitionData(unique_ptr<DuckLakePartitionData> partitio
 	partition_data = std::move(partition_data_p);
 }
 
+void DuckLakeTableEntry::SetSortData(unique_ptr<DuckLakeSort> sort_data_p) {
+	sort_data = std::move(sort_data_p);
+}
+
+void DuckLakeTableEntry::SetColumnComment(FieldIndex field_index, const Value &new_comment) {
+	auto field_id = GetFieldId(field_index);
+	if (!field_id) {
+		return;
+	}
+	auto &col = columns.GetColumnMutable(field_id->Name());
+	col.SetComment(new_comment);
+}
+
 const string &DuckLakeTableEntry::DataPath() const {
 	return data_path;
 }
@@ -343,6 +366,26 @@ string GetPartitionColumnName(ColumnRefExpression &colref) {
 	return colref.GetColumnName();
 }
 
+string GetSortColumnName(DuckLakeTableEntry &table, ParsedExpression &expr) {
+	// Only allow column references, reject expressions
+	if (expr.type != ExpressionType::COLUMN_REF) {
+		throw NotImplementedException("SET SORTED BY only supports column references, not expressions: %s",
+		                              expr.ToString());
+	}
+	auto &colref = expr.Cast<ColumnRefExpression>();
+	if (colref.IsQualified()) {
+		throw InvalidInputException("Unexpected qualified column reference - only unqualified columns are supported");
+	}
+	string column_name = colref.GetColumnName();
+
+	// Validate that the column exists
+	if (!table.ColumnExists(column_name)) {
+		throw BinderException("Column \"%s\" does not exist in table \"%s\"", column_name, table.name);
+	}
+
+	return column_name;
+}
+
 DuckLakePartitionField GetPartitionField(DuckLakeTableEntry &table, ParsedExpression &expr) {
 	string column_name;
 	DuckLakeTransformType transform_type;
@@ -1062,6 +1105,38 @@ unique_ptr<CatalogEntry> DuckLakeTableEntry::AlterTable(DuckLakeTransaction &tra
 	return std::move(new_entry);
 }
 
+unique_ptr<CatalogEntry> DuckLakeTableEntry::AlterTable(DuckLakeTransaction &transaction, SetSortedByInfo &info) {
+	auto create_info = GetInfo();
+	auto &table_info = create_info->Cast<CreateTableInfo>();
+
+	if (info.orders.empty()) {
+		// RESET SORTED BY - clear sort data
+		auto new_entry = make_uniq<DuckLakeTableEntry>(*this, table_info, unique_ptr<DuckLakeSort>());
+		return std::move(new_entry);
+	}
+
+	auto sort_data = make_uniq<DuckLakeSort>();
+	sort_data->sort_id = transaction.GetLocalCatalogId();
+	for (idx_t order_node_idx = 0; order_node_idx < info.orders.size(); order_node_idx++) {
+		auto &order_node = info.orders[order_node_idx];
+
+		// FIXME: Currently this must be a column reference and the column must exist. Want it to be an expression.
+		string column_name = GetSortColumnName(*this, *order_node.expression);
+
+		DuckLakeSortField sort_field;
+		sort_field.sort_key_index = order_node_idx;
+		// FIXME: convert to order_node.expression->ToString(); once expressions are supported
+		sort_field.expression = column_name;
+		sort_field.dialect = "duckdb";
+		sort_field.sort_direction = order_node.type;
+		sort_field.null_order = order_node.null_order;
+		sort_data->fields.push_back(sort_field);
+	}
+
+	auto new_entry = make_uniq<DuckLakeTableEntry>(*this, table_info, std::move(sort_data));
+	return std::move(new_entry);
+}
+
 unique_ptr<CatalogEntry> DuckLakeTableEntry::Alter(DuckLakeTransaction &transaction, AlterTableInfo &info) {
 	if (transaction.HasTransactionInlinedData(GetTableId())) {
 		throw NotImplementedException("ALTER on a table with transaction-local inlined data is not supported");
@@ -1091,6 +1166,8 @@ unique_ptr<CatalogEntry> DuckLakeTableEntry::Alter(DuckLakeTransaction &transact
 	case AlterTableType::SET_DEFAULT:
 		return AlterTable(transaction, info.Cast<SetDefaultInfo>());
+	case AlterTableType::SET_SORTED_BY:
+		return AlterTable(transaction, info.Cast<SetSortedByInfo>());
 	default:
 		throw BinderException("Unsupported ALTER TABLE type in DuckLake");
 	}
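The user-facing surface of the new AlterTable overload, sketched in SQL (illustrative statements against a hypothetical ducklake.test table; the accepted/rejected behavior follows GetSortColumnName above):

ALTER TABLE ducklake.test SET SORTED BY (a DESC NULLS FIRST, b);  -- accepted: plain column references
ALTER TABLE ducklake.test SET SORTED BY (a + 1);                  -- rejected: expressions not yet supported (NotImplementedException)
ALTER TABLE ducklake.test SET SORTED BY (no_such_col);            -- rejected: BinderException, column does not exist
ALTER TABLE ducklake.test RESET SORTED BY;                        -- info.orders is empty: clears the sort data
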
diff --git a/src/storage/ducklake_transaction.cpp b/src/storage/ducklake_transaction.cpp
index 4acb0e3e19f..c2e02df09ce 100644
--- a/src/storage/ducklake_transaction.cpp
+++ b/src/storage/ducklake_transaction.cpp
@@ -21,6 +21,17 @@
 
 namespace duckdb {
 
+static bool IsSchemaAlteringChange(LocalChangeType type) {
+	switch (type) {
+	case LocalChangeType::SET_SORT_KEY:
+	case LocalChangeType::SET_COMMENT:
+	case LocalChangeType::SET_COLUMN_COMMENT:
+		return false;
+	default:
+		return true;
+	}
+}
+
 bool LocalTableDataChanges::IsEmpty() const {
 	if (!new_data_files.empty()) {
 		return false;
@@ -99,8 +110,8 @@ bool DuckLakeTransaction::SchemaChangesMade() {
 }
 
 bool DuckLakeTransaction::ChangesMade() {
-	return SchemaChangesMade() || !table_data_changes.empty() || !dropped_files.empty() ||
-	       !new_name_maps.name_maps.empty();
+	return SchemaChangesMade() || !altered_tables_same_schema.empty() || !table_data_changes.empty() ||
+	       !dropped_files.empty() || !new_name_maps.name_maps.empty();
 }
 
 struct TransactionChangeInformation {
@@ -137,7 +148,8 @@ void GetTransactionTableChanges(reference<CatalogEntry> table_entry, Transaction
 	case LocalChangeType::ADD_COLUMN:
 	case LocalChangeType::REMOVE_COLUMN:
 	case LocalChangeType::CHANGE_COLUMN_TYPE:
-	case LocalChangeType::SET_DEFAULT: {
+	case LocalChangeType::SET_DEFAULT:
+	case LocalChangeType::SET_SORT_KEY: {
 		// this table was altered
 		auto table_id = table.GetTableId();
 		// don't report transaction-local tables yet - these will get added later on
@@ -253,6 +265,21 @@ TransactionChangeInformation DuckLakeTransaction::GetTransactionChanges() {
 			}
 		}
 	}
+	// Also process altered_tables_same_schema
+	for (auto &schema_entry : altered_tables_same_schema) {
+		for (auto &entry : schema_entry.second->GetEntries()) {
+			switch (entry.second->type) {
+			case CatalogType::TABLE_ENTRY:
+				GetTransactionTableChanges(*entry.second, changes);
+				break;
+			case CatalogType::VIEW_ENTRY:
+				GetTransactionViewChanges(*entry.second, changes);
+				break;
+			default:
+				throw InternalException("Unsupported type found in altered_tables_same_schema");
+			}
+		}
+	}
 	changes.tables_deleted_from = tables_deleted_from;
 	for (auto &entry : table_data_changes) {
 		auto table_id = entry.first;
@@ -751,6 +778,37 @@ DuckLakePartitionInfo DuckLakeTransaction::GetNewPartitionKey(DuckLakeCommitStat
 	return partition_key;
 }
 
+DuckLakeSortInfo DuckLakeTransaction::GetNewSortKey(DuckLakeCommitState &commit_state, DuckLakeTableEntry &table) {
+	DuckLakeSortInfo sort_key;
+	sort_key.table_id = commit_state.GetTableId(table);
+	if (sort_key.table_id.IsTransactionLocal()) {
+		throw InternalException("Trying to write sort with transaction local table-id");
+	}
+
+	// insert the new sort data
+	auto sort_data = table.GetSortData();
+	if (!sort_data) {
+		// dropping sort data - insert the empty sort key data for this table
+		return sort_key;
+	}
+
+	auto sort_id = commit_state.commit_snapshot.next_catalog_id++;
+	sort_key.id = sort_id;
+	sort_data->sort_id = sort_id;
+	for (auto &field : sort_data->fields) {
+		DuckLakeSortFieldInfo sort_field;
+		sort_field.sort_key_index = field.sort_key_index;
+		sort_field.expression = field.expression;
+		sort_field.dialect = field.dialect;
+		sort_field.sort_direction = field.sort_direction;
+		sort_field.null_order = field.null_order;
+
+		sort_key.fields.push_back(std::move(sort_field));
+	}
+
+	return sort_key;
+}
+
 vector<DuckLakeColumnInfo> DuckLakeTableEntry::GetTableColumns() const {
 	vector<DuckLakeColumnInfo> result;
 	auto not_null_fields = GetNotNullFields();
@@ -806,6 +864,7 @@ struct NewTableInfo {
 	vector<DuckLakeDroppedColumn> dropped_columns;
 	vector<DuckLakeNewColumn> new_columns;
 	vector<DuckLakeInlinedTableInfo> new_inlined_data_tables;
+	vector<DuckLakeSortInfo> new_sort_keys;
 };
 
 struct NewMacroInfo {
@@ -857,6 +916,12 @@ void DuckLakeTransaction::GetNewTableInfo(DuckLakeCommitState &commit_state, Duc
 			column_schema_change = true;
 			break;
 		}
+		case LocalChangeType::SET_SORT_KEY: {
+			auto sort_key = GetNewSortKey(commit_state, table);
+			result.new_sort_keys.push_back(std::move(sort_key));
+			transaction_changes.altered_tables.insert(table_id);
+			break;
+		}
 		case LocalChangeType::SET_COMMENT: {
 			DuckLakeTagInfo comment_info;
 			comment_info.id = commit_state.GetTableId(table).index;
@@ -1074,6 +1139,7 @@ void DuckLakeTransaction::GetNewViewInfo(DuckLakeCommitState &commit_state, Duck
 NewTableInfo DuckLakeTransaction::GetNewTables(DuckLakeCommitState &commit_state,
                                                TransactionChangeInformation &transaction_changes) {
 	NewTableInfo result;
+	// Process new_tables
 	for (auto &schema_entry : new_tables) {
 		for (auto &entry : schema_entry.second->GetEntries()) {
 			switch (entry.second->type) {
@@ -1088,6 +1154,21 @@ NewTableInfo DuckLakeTransaction::GetNewTables(DuckLakeCommitState &commit_state
 			}
 		}
 	}
+	// Also process altered_tables_same_schema
+	for (auto &schema_entry : altered_tables_same_schema) {
+		for (auto &entry : schema_entry.second->GetEntries()) {
+			switch (entry.second->type) {
+			case CatalogType::TABLE_ENTRY:
+				GetNewTableInfo(commit_state, *schema_entry.second, *entry.second, result, transaction_changes);
+				break;
+			case CatalogType::VIEW_ENTRY:
+				GetNewViewInfo(commit_state, *schema_entry.second, *entry.second, result, transaction_changes);
+				break;
+			default:
+				throw InternalException("Unknown type in altered_tables_same_schema");
+			}
+		}
+	}
 	return result;
 }
 
@@ -1483,7 +1564,7 @@ string DuckLakeTransaction::CommitChanges(DuckLakeCommitState &commit_state,
 	// write new tables
 	vector<DuckLakeTableInfo> new_tables_result;
 	vector<DuckLakeInlinedTableInfo> new_inlined_data_tables_result;
-	if (!new_tables.empty()) {
+	if (!new_tables.empty() || !altered_tables_same_schema.empty()) {
 		auto result = GetNewTables(commit_state, transaction_changes);
 		batch_queries += metadata_manager->WriteNewTables(commit_snapshot, result.new_tables, new_schemas_result);
 		batch_queries += metadata_manager->WriteNewPartitionKeys(commit_snapshot, result.new_partition_keys);
@@ -1493,6 +1574,7 @@ string DuckLakeTransaction::CommitChanges(DuckLakeCommitState &commit_state,
 		batch_queries += metadata_manager->WriteDroppedColumns(result.dropped_columns);
 		batch_queries += metadata_manager->WriteNewColumns(result.new_columns);
 		batch_queries += metadata_manager->WriteNewInlinedTables(commit_snapshot, result.new_inlined_data_tables);
+		batch_queries += metadata_manager->WriteNewSortKeys(commit_snapshot, result.new_sort_keys);
 		new_tables_result = result.new_tables;
 		new_inlined_data_tables_result = result.new_inlined_data_tables;
 	}
@@ -1715,6 +1797,60 @@ void DuckLakeTransaction::FlushChanges() {
 			connection->Commit();
 			catalog_version = commit_snapshot.schema_version;
 
+			// If there were non-schema-altering changes but schema_version didn't change,
+			// update the cached schema with the new data (e.g., sort info, comments)
+			if (!altered_tables_same_schema.empty() && !SchemaChangesMade()) {
+				for (auto &schema_entry : altered_tables_same_schema) {
+					for (auto &entry : schema_entry.second->GetEntries()) {
+						if (entry.second->type == CatalogType::TABLE_ENTRY) {
+							auto &table = entry.second->Cast<DuckLakeTableEntry>();
+							auto table_id = table.GetTableId();
+							auto local_change = table.GetLocalChange();
+
+							switch (local_change.type) {
+							case LocalChangeType::SET_SORT_KEY: {
+								auto sort_data = table.GetSortData();
+								if (sort_data) {
+									// Copy the sort data to update the cache
+									auto sort_copy = make_uniq<DuckLakeSort>(*sort_data);
+									ducklake_catalog.UpdateSortDataInCache(commit_snapshot.schema_version, table_id,
+									                                       std::move(sort_copy));
+								} else {
+									// Sort data was cleared
+									ducklake_catalog.UpdateSortDataInCache(commit_snapshot.schema_version, table_id,
+									                                       nullptr);
+								}
+								break;
+							}
+							case LocalChangeType::SET_COMMENT: {
+								ducklake_catalog.UpdateTableCommentInCache(commit_snapshot.schema_version, table_id,
+								                                           table.comment);
+								break;
+							}
+							case LocalChangeType::SET_COLUMN_COMMENT: {
+								auto field_index = local_change.field_index;
+								auto &col = table.GetColumnByFieldId(field_index);
+								ducklake_catalog.UpdateColumnCommentInCache(commit_snapshot.schema_version,
+								                                            table_id, field_index, col.Comment());
+								break;
+							}
+							default:
+								break;
+							}
+						} else if (entry.second->type == CatalogType::VIEW_ENTRY) {
+							auto &view = entry.second->Cast<DuckLakeViewEntry>();
+							auto view_id = view.GetViewId();
+							auto local_change = view.GetLocalChange();
+
+							if (local_change.type == LocalChangeType::SET_COMMENT) {
+								ducklake_catalog.UpdateViewCommentInCache(commit_snapshot.schema_version, view_id,
+								                                          view.comment);
+							}
+						}
+					}
+				}
+			}
+
 			// finished writing
 			break;
 		} catch (std::exception &ex) {
@@ -2298,9 +2434,10 @@ void DuckLakeTransaction::AlterEntry(CatalogEntry &entry, unique_ptr<CatalogEntr
 void DuckLakeTransaction::AlterEntryInternal(DuckLakeTableEntry &table, unique_ptr<CatalogEntry> new_entry) {
 	auto &new_table = new_entry->Cast<DuckLakeTableEntry>();
-	auto &entries = GetOrCreateTransactionLocalEntries(table);
+	auto change_type = new_table.GetLocalChange().type;
+	auto &entries = GetOrCreateTransactionLocalEntriesAlter(table, change_type);
 	entries.CreateEntry(std::move(new_entry));
-	switch (new_table.GetLocalChange().type) {
+	switch (change_type) {
 	case LocalChangeType::RENAMED: {
 		// rename - take care of the old table
 		if (table.IsTransactionLocal()) {
@@ -2323,6 +2460,7 @@ void DuckLakeTransaction::AlterEntryInternal(DuckLakeTableEntry &table, unique_p
 	case LocalChangeType::REMOVE_COLUMN:
 	case LocalChangeType::CHANGE_COLUMN_TYPE:
 	case LocalChangeType::SET_DEFAULT:
+	case LocalChangeType::SET_SORT_KEY:
 		break;
 	default:
 		throw NotImplementedException("Alter type not supported in DuckLakeTransaction::AlterEntry");
@@ -2331,9 +2469,10 @@ void DuckLakeTransaction::AlterEntryInternal(DuckLakeTableEntry &table, unique_p
 
 void DuckLakeTransaction::AlterEntryInternal(DuckLakeViewEntry &view, unique_ptr<CatalogEntry> new_entry) {
 	auto &new_view = new_entry->Cast<DuckLakeViewEntry>();
-	auto &entries = GetOrCreateTransactionLocalEntries(view);
+	auto change_type = new_view.GetLocalChange().type;
+	auto &entries = GetOrCreateTransactionLocalEntriesAlter(view, change_type);
 	entries.CreateEntry(std::move(new_entry));
-	switch (new_view.GetLocalChange().type) {
+	switch (change_type) {
 	case LocalChangeType::RENAMED: {
 		// rename - take care of the old table
 		if (view.IsTransactionLocal()) {
@@ -2386,6 +2525,67 @@ DuckLakeCatalogSet &DuckLakeTransaction::GetOrCreateTransactionLocalEntries(Cata
 	}
 }
 
+DuckLakeCatalogSet &DuckLakeTransaction::GetOrCreateTransactionLocalEntriesAlter(CatalogEntry &entry,
+                                                                                 LocalChangeType change_type) {
+	auto catalog_type = entry.type;
+	if (catalog_type == CatalogType::SCHEMA_ENTRY) {
+		// Schema entries always use new_schemas
+		if (!new_schemas) {
+			new_schemas = make_uniq<DuckLakeCatalogSet>();
+		}
+		return *new_schemas;
+	}
+
+	auto &schema_name = entry.ParentSchema().name;
+
+	// For schema-altering changes, always use new_tables
+	if (IsSchemaAlteringChange(change_type)) {
+		// If there's an existing entry in altered_tables_same_schema for this table,
+		// move it to new_tables first so the chain is maintained
+		auto altered_entry = altered_tables_same_schema.find(schema_name);
+		if (altered_entry != altered_tables_same_schema.end()) {
+			auto existing = altered_entry->second->DropEntry(entry.name);
+			if (existing) {
+				// Move to new_tables
+				auto entry_it = new_tables.find(schema_name);
+				if (entry_it == new_tables.end()) {
+					auto new_set = make_uniq<DuckLakeCatalogSet>();
+					entry_it = new_tables.insert(make_pair(schema_name, std::move(new_set))).first;
+				}
+				entry_it->second->CreateEntry(std::move(existing));
+			}
+			// Clean up empty schema entry
+			if (altered_entry->second->GetEntries().empty()) {
+				altered_tables_same_schema.erase(altered_entry);
+			}
+		}
+
+		auto entry_it = new_tables.find(schema_name);
+		if (entry_it == new_tables.end()) {
+			auto new_set = make_uniq<DuckLakeCatalogSet>();
+			entry_it = new_tables.insert(make_pair(schema_name, std::move(new_set))).first;
+		}
+		return *entry_it->second;
+	}
+
+	// For non-schema-altering changes:
+	// 1. Check if entry already exists in new_tables (schema-altering change was already made)
+	auto new_tables_entry = new_tables.find(schema_name);
+	if (new_tables_entry != new_tables.end()) {
+		if (new_tables_entry->second->GetEntry(entry.name)) {
+			return *new_tables_entry->second;
+		}
+	}
+
+	// 2. Use altered_tables_same_schema
+	auto altered_entry = altered_tables_same_schema.find(schema_name);
+	if (altered_entry == altered_tables_same_schema.end()) {
+		auto new_set = make_uniq<DuckLakeCatalogSet>();
+		altered_entry = altered_tables_same_schema.insert(make_pair(schema_name, std::move(new_set))).first;
+	}
+	return *altered_entry->second;
+}
+
 optional_ptr<DuckLakeCatalogSet> DuckLakeTransaction::GetTransactionLocalSchemas() {
 	return new_schemas;
 }
@@ -2405,11 +2605,17 @@ optional_ptr<DuckLakeCatalogSet> DuckLakeTransaction::GetTransactionLocalEntries
 	switch (catalog_type) {
 	case CatalogType::TABLE_ENTRY:
 	case CatalogType::VIEW_ENTRY: {
+		// Check new_tables first
 		auto entry = new_tables.find(schema_name);
-		if (entry == new_tables.end()) {
-			return nullptr;
+		if (entry != new_tables.end()) {
+			return entry->second;
 		}
-		return entry->second;
+		// Then check altered_tables_same_schema
+		auto altered_entry = altered_tables_same_schema.find(schema_name);
+		if (altered_entry != altered_tables_same_schema.end()) {
+			return altered_entry->second;
+		}
+		return nullptr;
 	}
 	case CatalogType::MACRO_ENTRY:
 	case CatalogType::TABLE_MACRO_ENTRY:
@@ -2440,6 +2646,13 @@ optional_ptr<CatalogEntry> DuckLakeTransaction::GetLocalEntryById(TableIndex tab
 			return entry;
 		}
 	}
+	// Also check altered_tables_same_schema
+	for (auto &schema_entry : altered_tables_same_schema) {
+		auto entry = schema_entry.second->GetEntryById(table_id);
+		if (entry) {
+			return entry;
+		}
+	}
 	return nullptr;
 }
diff --git a/src/storage/ducklake_view_entry.cpp b/src/storage/ducklake_view_entry.cpp
index db841350faf..ada46e7a2ed 100644
--- a/src/storage/ducklake_view_entry.cpp
+++ b/src/storage/ducklake_view_entry.cpp
@@ -98,7 +98,7 @@ bool DuckLakeViewEntry::IsBound() const {
 void DuckLakeViewEntry::Bind(ClientContext &context) {
 	D_ASSERT(!is_bound);
 	is_bound = true;
-	std::string create_view_sql = "CREATE VIEW mock_view_name_lake";
+	string create_view_sql = "CREATE VIEW mock_view_name_lake";
 	if (!aliases.empty()) {
 		create_view_sql += "(";
 		for (const auto &alias : aliases) {
diff --git a/test/sql/comments/comment_mixed_operations.test b/test/sql/comments/comment_mixed_operations.test
new file mode 100644
index 00000000000..be2273403de
--- /dev/null
+++ b/test/sql/comments/comment_mixed_operations.test
@@ -0,0 +1,144 @@
+# name: test/sql/comments/comment_mixed_operations.test
+# description: Verify COMMENT ON mixed with schema-altering operations
+# group: [comments]
+
+require ducklake
+
+require parquet
+
+test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db
+
+statement ok
+ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/comment_mixed/')
+
+statement ok
+CREATE TABLE ducklake.test(a INTEGER, b VARCHAR);
+
+query I
+SELECT max(schema_version) FROM ducklake.snapshots()
+----
+1
+
+# COMMENT ON should NOT increment
+statement ok
+COMMENT ON TABLE ducklake.test IS 'first comment'
+
+query I
+SELECT max(schema_version) FROM ducklake.snapshots()
+----
+1
+
+# ADD COLUMN SHOULD increment
+statement ok
+ALTER TABLE ducklake.test ADD COLUMN c BIGINT
+
+query I
+SELECT max(schema_version) FROM ducklake.snapshots()
ducklake.snapshots() +---- +2 + +# COMMENT ON after ADD COLUMN should NOT increment +statement ok +COMMENT ON COLUMN ducklake.test.c IS 'c column comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# Update table comment should NOT increment +statement ok +COMMENT ON TABLE ducklake.test IS 'updated comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# RENAME COLUMN SHOULD increment +statement ok +ALTER TABLE ducklake.test RENAME COLUMN b TO b_renamed + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +3 + +# Verify final state +query I +SELECT comment FROM duckdb_tables() WHERE table_name = 'test' +---- +updated comment + +# Clear the table comment (set to NULL) should NOT increment +statement ok +COMMENT ON TABLE ducklake.test IS NULL + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +3 + +# Verify comment was cleared +query I +SELECT comment FROM duckdb_tables() WHERE table_name = 'test' +---- +NULL + +# ============================================ +# VIEW COMMENT MIXED WITH OTHER OPERATIONS +# ============================================ + +# CREATE VIEW SHOULD increment +statement ok +CREATE VIEW ducklake.test_view AS SELECT * FROM ducklake.test + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +4 + +# COMMENT ON VIEW should NOT increment +statement ok +COMMENT ON VIEW ducklake.test_view IS 'view comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +4 + +# Verify view comment was written +query I +SELECT comment FROM duckdb_views() WHERE view_name = 'test_view' +---- +view comment + +# Update view comment should NOT increment +statement ok +COMMENT ON VIEW ducklake.test_view IS 'updated view comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +4 + +# Verify view comment was updated +query I +SELECT comment FROM duckdb_views() WHERE view_name = 'test_view' +---- +updated view comment + +# Clear view comment should NOT increment +statement ok +COMMENT ON VIEW ducklake.test_view IS NULL + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +4 + +# Verify view comment was cleared +query I +SELECT comment FROM duckdb_views() WHERE view_name = 'test_view' +---- +NULL diff --git a/test/sql/comments/comment_same_transaction.test b/test/sql/comments/comment_same_transaction.test new file mode 100644 index 00000000000..a5da1882482 --- /dev/null +++ b/test/sql/comments/comment_same_transaction.test @@ -0,0 +1,83 @@ +# name: test/sql/comments/comment_same_transaction.test +# description: Verify COMMENT ON with other ALTER operations in same transaction +# group: [comments] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/comment_same_txn/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# COMMENT + ADD COLUMN in same transaction +statement ok +BEGIN + +statement ok +COMMENT ON TABLE ducklake.test IS 'my table comment' + +statement ok +ALTER TABLE ducklake.test ADD COLUMN c BIGINT + +statement ok +COMMENT ON COLUMN ducklake.test.c IS 'new column comment' + +statement ok +COMMIT + +# Should increment schema_version once (for ADD COLUMN) +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# Verify all changes were committed +query I +SELECT 
comment FROM duckdb_tables() WHERE table_name = 'test' +---- +my table comment + +query II +SELECT column_name, comment FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a NULL +b NULL +c new column comment + +# ============================================ +# VIEW COMMENT + CREATE VIEW IN SAME TRANSACTION +# ============================================ + +statement ok +BEGIN + +statement ok +CREATE VIEW ducklake.test_view AS SELECT * FROM ducklake.test + +statement ok +COMMENT ON VIEW ducklake.test_view IS 'new view comment' + +statement ok +COMMIT + +# Should increment schema_version once (for CREATE VIEW) +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +3 + +# Verify view comment was committed +query I +SELECT comment FROM duckdb_views() WHERE view_name = 'test_view' +---- +new view comment diff --git a/test/sql/comments/comment_schema_version.test b/test/sql/comments/comment_schema_version.test new file mode 100644 index 00000000000..6e37327818e --- /dev/null +++ b/test/sql/comments/comment_schema_version.test @@ -0,0 +1,119 @@ +# name: test/sql/comments/comment_schema_version.test +# description: Verify COMMENT ON does not increment schema_version +# group: [comments] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/comment_schema_test/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# COMMENT ON TABLE should NOT increment schema_version +statement ok +COMMENT ON TABLE ducklake.test IS 'table comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# COMMENT ON COLUMN should NOT increment schema_version +statement ok +COMMENT ON COLUMN ducklake.test.a IS 'column a comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# Verify comments were written +query I +SELECT comment FROM duckdb_tables() WHERE table_name = 'test' +---- +table comment + +query I +SELECT comment FROM duckdb_columns() WHERE table_name = 'test' AND column_name = 'a' +---- +column a comment + +# Setting comment to NULL should also NOT increment schema_version +statement ok +COMMENT ON TABLE ducklake.test IS NULL + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +statement ok +COMMENT ON COLUMN ducklake.test.a IS NULL + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# Verify comments were cleared +query I +SELECT comment FROM duckdb_tables() WHERE table_name = 'test' +---- +NULL + +query I +SELECT comment FROM duckdb_columns() WHERE table_name = 'test' AND column_name = 'a' +---- +NULL + +# ============================================ +# VIEW COMMENT TESTS +# ============================================ + +statement ok +CREATE VIEW ducklake.test_view AS SELECT * FROM ducklake.test + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# COMMENT ON VIEW should NOT increment schema_version +statement ok +COMMENT ON VIEW ducklake.test_view IS 'view comment' + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# Verify view comment was written +query I +SELECT comment FROM duckdb_views() WHERE view_name = 'test_view' +---- +view comment + +# Setting view comment to NULL should also NOT increment schema_version +statement ok +COMMENT ON VIEW ducklake.test_view IS NULL + +query I +SELECT 
max(schema_version) FROM ducklake.snapshots() +---- +2 + +# Verify view comment was cleared +query I +SELECT comment FROM duckdb_views() WHERE view_name = 'test_view' +---- +NULL diff --git a/test/sql/functions/ducklake_snapshots.test b/test/sql/functions/ducklake_snapshots.test index 8a83917dc6c..c92ea72c24f 100644 --- a/test/sql/functions/ducklake_snapshots.test +++ b/test/sql/functions/ducklake_snapshots.test @@ -148,7 +148,7 @@ COMMENT ON VIEW ducklake.comment_view IS 'con1' query III SELECT snapshot_id, schema_version, changes FROM ducklake.snapshots() WHERE snapshot_id=12 ---- -12 11 {views_altered=[9]} +12 10 {views_altered=[9]} # deletes statement ok @@ -157,4 +157,4 @@ DELETE FROM ducklake.s1.tbl query III SELECT snapshot_id, schema_version, changes FROM ducklake_snapshots('ducklake') WHERE snapshot_id=13 ---- -13 11 {tables_deleted_from=[4]} +13 10 {tables_deleted_from=[4]} diff --git a/test/sql/sorted_table/data_inlining_flush_sorted_alter_table.test b/test/sql/sorted_table/data_inlining_flush_sorted_alter_table.test new file mode 100644 index 00000000000..f5463207184 --- /dev/null +++ b/test/sql/sorted_table/data_inlining_flush_sorted_alter_table.test @@ -0,0 +1,150 @@ +# name: test/sql/sorted_table/data_inlining_flush_sorted_alter_table.test +# description: alter table commands should not interfere with sorted inline flush +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +test-env DATA_PATH __TEST_DIR__ + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/ducklake_inlining_flush_data', DATA_INLINING_ROW_LIMIT 10) + +# Test with an alter table set sorted within the same transaction +statement ok +CREATE TABLE ducklake.test2(i INTEGER); + +query I +SELECT COUNT(*) FROM ducklake.test2 +---- +0 + +loop i 0 10 + +statement ok +INSERT INTO ducklake.test2 VALUES (${i}) + +endloop + +statement ok +BEGIN + +# Ensure that we will respect a sort set within the same transaction +statement ok +ALTER TABLE ducklake.test2 SET SORTED BY (i DESC); + +# flush inlined data +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'test2') + +query I +SELECT COUNT(*) FROM ducklake.test2 +---- +10 + +# Ordering should reflect the 'i DESC' +query III +SELECT rowid, snapshot_id, * FROM ducklake.test2 +---- +9 11 9 +8 10 8 +7 9 7 +6 8 6 +5 7 5 +4 6 4 +3 5 3 +2 4 2 +1 3 1 +0 2 0 + +statement ok +COMMIT + +# Ordering should reflect the 'i DESC' post commit also +query III +SELECT rowid, snapshot_id, * FROM ducklake.test2 +---- +9 11 9 +8 10 8 +7 9 7 +6 8 6 +5 7 5 +4 6 4 +3 5 3 +2 4 2 +1 3 1 +0 2 0 + + +# Ensure that other transaction local changes have no impact +statement ok +CREATE TABLE ducklake.test3(i INTEGER); + +query I +SELECT COUNT(*) FROM ducklake.test3 +---- +0 + +loop i 0 10 + +statement ok +INSERT INTO ducklake.test3 VALUES (${i}) + +endloop + +statement ok +BEGIN; + +statement ok +ALTER TABLE ducklake.test3 SET SORTED BY (i DESC); + +# Do some other kind of alter table on that table to make sure that it does not interfere +statement ok +ALTER TABLE ducklake.test3 ADD COLUMN new_column INTEGER; + +# flush inlined data +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'test3') + +query I +SELECT COUNT(*) FROM ducklake.test3 +---- +10 + +# Ordering should reflect the 'i DESC' +query IIII +SELECT rowid, snapshot_id, * FROM ducklake.test3 +---- +9 23 9 NULL +8 22 8 NULL +7 21 7 NULL +6 20 6 NULL +5 19 5 NULL +4 18 4 NULL +3 17 3 NULL +2 16 
2 NULL +1 15 1 NULL +0 14 0 NULL + +statement ok +COMMIT; + +# Ordering should reflect the 'i DESC' post commit also +query IIII +SELECT rowid, snapshot_id, * FROM ducklake.test3 +---- +9 23 9 NULL +8 22 8 NULL +7 21 7 NULL +6 20 6 NULL +5 19 5 NULL +4 18 4 NULL +3 17 3 NULL +2 16 2 NULL +1 15 1 NULL +0 14 0 NULL + diff --git a/test/sql/sorted_table/data_inlining_flush_sorted_basic.test b/test/sql/sorted_table/data_inlining_flush_sorted_basic.test new file mode 100644 index 00000000000..eb114f9898a --- /dev/null +++ b/test/sql/sorted_table/data_inlining_flush_sorted_basic.test @@ -0,0 +1,169 @@ +# name: test/sql/sorted_table/data_inlining_flush_sorted_basic.test +# description: test flushing inlined data to disk in order when SET SORTED BY is in use +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +test-env DATA_PATH __TEST_DIR__ + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/ducklake_inlining_flush_data', DATA_INLINING_ROW_LIMIT 10) + +statement ok +CREATE TABLE ducklake.test(i INTEGER); + +query I +SELECT COUNT(*) FROM ducklake.test +---- +0 + +loop i 0 10 + +statement ok +INSERT INTO ducklake.test VALUES (${i}) + +endloop + +# all data is inlined +query I +SELECT COUNT(*) FROM GLOB('${DATA_PATH}/ducklake_inlining_files/**') +---- +0 + +query I +SELECT COUNT(*) FROM ducklake.test +---- +10 + +query III +SELECT rowid, snapshot_id, * FROM ducklake.test ORDER BY ALL +---- +0 2 0 +1 3 1 +2 4 2 +3 5 3 +4 6 4 +5 7 5 +6 8 6 +7 9 7 +8 10 8 +9 11 9 + +query III +SELECT rowid, snapshot_id, * FROM ducklake.test AT (version => 4) ORDER BY ALL +---- +0 2 0 +1 3 1 +2 4 2 + +query IIII +FROM ducklake.table_changes('test', 3, 5) ORDER BY ALL +---- +3 1 insert 1 +4 2 insert 2 +5 3 insert 3 + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (i DESC); + +statement ok +BEGIN + +# flush inlined data +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'test') + +query I +SELECT COUNT(*) FROM ducklake.test +---- +10 + +# Ordering should reflect the 'i DESC' +query III +SELECT rowid, snapshot_id, * FROM ducklake.test +---- +9 11 9 +8 10 8 +7 9 7 +6 8 6 +5 7 5 +4 6 4 +3 5 3 +2 4 2 +1 3 1 +0 2 0 + +statement ok +COMMIT + +query II +SELECT snapshot_id, changes FROM ducklake.snapshots() WHERE snapshot_id IN (2, 13) ORDER BY snapshot_id +---- +2 {inlined_insert=[1]} +13 {flushed_inlined=[1]} + +# we now have one file +query I +SELECT COUNT(*) FROM GLOB('${DATA_PATH}/ducklake_inlining_flush_data/**') +---- +1 + +# flushing inlined data has no changes +query IIII +FROM ducklake.table_changes('test', 11, 13) ORDER BY ALL +---- + +# Ordering should reflect the 'i DESC' +query III +SELECT rowid, snapshot_id, * FROM ducklake.test +---- +9 11 9 +8 10 8 +7 9 7 +6 8 6 +5 7 5 +4 6 4 +3 5 3 +2 4 2 +1 3 1 +0 2 0 + +query I +SELECT * FROM ducklake.test AT (version => 4) ORDER BY ALL +---- +0 +1 +2 + +# we can still access other change feeds +query IIII +FROM ducklake.table_changes('test', 2, 5) ORDER BY ALL +---- +2 0 insert 0 +3 1 insert 1 +4 2 insert 2 +5 3 insert 3 + +# FIXME: this does not work correctly +mode skip + +query IIII +FROM ducklake.table_changes('test', 3, 5) ORDER BY ALL +---- +3 1 insert 1 +4 2 insert 2 +5 3 insert 3 + +mode unskip + +query III +SELECT rowid, snapshot_id, * FROM ducklake.test AT (version => 4) ORDER BY ALL +---- +0 2 0 +1 3 1 +2 4 2 diff --git a/test/sql/sorted_table/data_inlining_flush_sorted_renamed.test 
b/test/sql/sorted_table/data_inlining_flush_sorted_renamed.test new file mode 100644 index 00000000000..a2fea5ca147 --- /dev/null +++ b/test/sql/sorted_table/data_inlining_flush_sorted_renamed.test @@ -0,0 +1,52 @@ +# name: test/sql/sorted_table/data_inlining_flush_sorted_renamed.test +# description: if column names have changed, inlining flush should fail until sorted by valid column names +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +test-env DATA_PATH __TEST_DIR__ + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/ducklake_inlining_flush_data', DATA_INLINING_ROW_LIMIT 10) + +# SET SORTED BY then rename columns - flush should error on missing columns +statement ok +CREATE TABLE ducklake.renamed_columns_test (unique_id INTEGER, sort_key_1 INTEGER, sort_key_2 VARCHAR); + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Rename the sort columns to completely different names +statement ok +ALTER TABLE ducklake.renamed_columns_test RENAME COLUMN sort_key_1 TO sort_key_1_changed; + +statement ok +ALTER TABLE ducklake.renamed_columns_test RENAME COLUMN sort_key_2 TO sort_key_2_changed; + +# Insert data (will be inlined initially) +statement ok +INSERT INTO ducklake.renamed_columns_test (unique_id, sort_key_1_changed, sort_key_2_changed) +FROM range(5) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1_changed, + 'woot' || i AS sort_key_2_changed +; + +# Flush should fail because the sort columns no longer exist +statement error +CALL ducklake_flush_inlined_data('ducklake', table_name => 'renamed_columns_test'); +---- +Binder Error: Columns in the SET SORTED BY statement were not found in the DuckLake table. 
Unmatched columns were: sort_key_1, sort_key_2 + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1_changed ASC NULLS LAST, sort_key_2_changed ASC NULLS LAST); + +# Once the sort key references the new column names, the flush succeeds +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'renamed_columns_test'); \ No newline at end of file diff --git a/test/sql/sorted_table/data_inlining_flush_sorted_reset.test b/test/sql/sorted_table/data_inlining_flush_sorted_reset.test new file mode 100644 index 00000000000..8c22a20d42f --- /dev/null +++ b/test/sql/sorted_table/data_inlining_flush_sorted_reset.test @@ -0,0 +1,82 @@ +# name: test/sql/sorted_table/data_inlining_flush_sorted_reset.test +# description: RESET SORTED BY should flush data in the original order +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +test-env DATA_PATH __TEST_DIR__ + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/ducklake_inlining_flush_data', DATA_INLINING_ROW_LIMIT 10) + +# SET SORTED BY then RESET SORTED BY within same transaction +statement ok +CREATE TABLE ducklake.reset_test(i INTEGER); + +query I +SELECT COUNT(*) FROM ducklake.reset_test +---- +0 + +loop i 0 10 + +statement ok +INSERT INTO ducklake.reset_test VALUES (${i}) + +endloop + +statement ok +BEGIN; + +statement ok +ALTER TABLE ducklake.reset_test SET SORTED BY (i DESC); + +statement ok +ALTER TABLE ducklake.reset_test RESET SORTED BY; + +# flush inlined data +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'reset_test') + +query I +SELECT COUNT(*) FROM ducklake.reset_test +---- +10 + +# Ordering should retain the original order +query III +SELECT rowid, snapshot_id, * FROM ducklake.reset_test +---- +0 2 0 +1 3 1 +2 4 2 +3 5 3 +4 6 4 +5 7 5 +6 8 6 +7 9 7 +8 10 8 +9 11 9 + +statement ok +COMMIT; + +# Ordering should retain the original order post commit also +query III +SELECT rowid, snapshot_id, * FROM ducklake.reset_test +---- +0 2 0 +1 3 1 +2 4 2 +3 5 3 +4 6 4 +5 7 5 +6 8 6 +7 9 7 +8 10 8 +9 11 9 diff --git a/test/sql/sorted_table/data_inlining_flush_sorted_transaction_renamed.test b/test/sql/sorted_table/data_inlining_flush_sorted_transaction_renamed.test new file mode 100644 index 00000000000..f1988ff689d --- /dev/null +++ b/test/sql/sorted_table/data_inlining_flush_sorted_transaction_renamed.test @@ -0,0 +1,154 @@ +# name: test/sql/sorted_table/data_inlining_flush_sorted_transaction_renamed.test +# description: if column names have changed, inlining flush should fail until sorted by valid column names +# group: [sorted_table] + +# This test covers: +# - SET SORTED BY combined with an unrelated ALTER TABLE in the same transaction +# - an ALTER TABLE RENAME that leaves the sort key stale, so the inline flush fails and the transaction is rolled back 
+# - an ALTER TABLE RENAME followed by SET SORTED BY on the new name, after which the flush succeeds + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +test-env DATA_PATH __TEST_DIR__ + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/ducklake_inlining_flush_data', DATA_INLINING_ROW_LIMIT 10) + +# Create the table shared by all three scenarios +statement ok +CREATE TABLE ducklake.renamed_columns_test (unique_id INTEGER, sort_key_1 INTEGER, sort_key_2 VARCHAR); + +# Insert data (will be inlined initially) +statement ok +INSERT INTO ducklake.renamed_columns_test (unique_id, sort_key_1, sort_key_2) +FROM range(5) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +; + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Do an irrelevant ALTER TABLE +statement ok +ALTER TABLE ducklake.renamed_columns_test ADD COLUMN bonus_column VARCHAR; + +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'renamed_columns_test'); + +statement ok +COMMIT + +query IIII +FROM ducklake.renamed_columns_test +---- +0 0 woot0 NULL +2 0 woot2 NULL +4 0 woot4 NULL +1 1 woot1 NULL +3 1 woot3 NULL + +statement ok +INSERT INTO ducklake.renamed_columns_test (unique_id, sort_key_1, sort_key_2) +FROM range(5) t(i) +SELECT + i + 5 AS unique_id, + (i + 5) % 2 AS sort_key_1, + 'woot' || (i + 5) AS sort_key_2 +ORDER BY + i DESC +; + +# First 5 rows should reflect the sort order, second 5 rows should reflect insertion order +query IIII +FROM ducklake.renamed_columns_test +---- +0 0 woot0 NULL +2 0 woot2 NULL +4 0 woot4 NULL +1 1 woot1 NULL +3 1 woot3 NULL +9 1 woot9 NULL +8 0 woot8 NULL +7 1 woot7 NULL +6 0 woot6 NULL +5 1 woot5 NULL + + +statement ok +ALTER TABLE ducklake.renamed_columns_test RESET SORTED BY; + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Rename a sort column so the stored sort key becomes stale +statement ok +ALTER TABLE ducklake.renamed_columns_test RENAME COLUMN sort_key_1 TO sort_key_1_renamed; + +statement error +CALL ducklake_flush_inlined_data('ducklake', table_name => 'renamed_columns_test'); +---- +Binder Error: Columns in the SET SORTED BY statement were not found in the DuckLake table. Unmatched columns were: sort_key_1 + +statement ok +ROLLBACK + +# Now it should all succeed! 
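+# For reference, the sort key that a flush will apply can be inspected directly from the +# metadata catalog (the same query shape the schema_version tests later in this PR assert on): +# SELECT se.expression, se.sort_direction, se.null_order +# FROM __ducklake_metadata_ducklake.ducklake_sort_expression se +# JOIN __ducklake_metadata_ducklake.ducklake_sort_info si USING (sort_id) +# WHERE si.end_snapshot IS NULL +# ORDER BY se.sort_key_index; 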
+statement ok +BEGIN + +# This time rename the sort column first, then point the sort key at the new name +statement ok +ALTER TABLE ducklake.renamed_columns_test RENAME COLUMN sort_key_1 TO sort_key_1_renamed; + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1_renamed ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +statement ok +CALL ducklake_flush_inlined_data('ducklake', table_name => 'renamed_columns_test'); + +# The new order is already visible before the COMMIT +query IIII +FROM ducklake.renamed_columns_test +---- +0 0 woot0 NULL +2 0 woot2 NULL +4 0 woot4 NULL +1 1 woot1 NULL +3 1 woot3 NULL +6 0 woot6 NULL +8 0 woot8 NULL +5 1 woot5 NULL +7 1 woot7 NULL +9 1 woot9 NULL + + +statement ok +COMMIT + +query IIII +FROM ducklake.renamed_columns_test +---- +0 0 woot0 NULL +2 0 woot2 NULL +4 0 woot4 NULL +1 1 woot1 NULL +3 1 woot3 NULL +6 0 woot6 NULL +8 0 woot8 NULL +5 1 woot5 NULL +7 1 woot7 NULL +9 1 woot9 NULL diff --git a/test/sql/sorted_table/merge_adjacent_sorted_basic.test b/test/sql/sorted_table/merge_adjacent_sorted_basic.test new file mode 100644 index 00000000000..6657956185c --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_basic.test @@ -0,0 +1,219 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_basic.test +# description: test ducklake merge adjacent files with SET SORTED BY metadata +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +# Create one table with two files +statement ok +CREATE TABLE sort_on_compaction (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO sort_on_compaction (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + +statement ok +INSERT INTO sort_on_compaction (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM sort_on_compaction +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +0 1 +1 1 + +statement ok +ALTER TABLE ducklake.sort_on_compaction SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Validate snapshot changes - SET SORTED BY should NOT increment schema_version +query I +SELECT max(schema_version) +FROM ducklake.snapshots() +---- +1 + +# Ensure that the snapshot changes match expectations +query III +SELECT snapshot_id, schema_version, changes FROM ducklake_snapshots('ducklake') +---- +0 0 {schemas_created=[main]} +1 1 {tables_created=[main.sort_on_compaction]} +2 1 {tables_inserted_into=[1]} +3 1 {tables_inserted_into=[1]} +4 1 {tables_altered=[1]} + +# Even multiple changes back to back should not change the schema_version +statement ok +ALTER TABLE ducklake.sort_on_compaction SET SORTED BY (sort_key_1 DESC, sort_key_2 DESC); + +statement ok +ALTER TABLE ducklake.sort_on_compaction SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Validate snapshot changes - SET SORTED BY should NOT increment schema_version 
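+# (the sort key lives in the ducklake_sort_info / ducklake_sort_expression metadata tables rather +# than in the table schema itself, so the snapshot records only tables_altered)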
+query I +SELECT max(schema_version) +FROM ducklake.snapshots() +---- +1 + +# Ensure that the snapshot changes match expectations +query III +SELECT snapshot_id, schema_version, changes FROM ducklake_snapshots('ducklake') +---- +0 0 {schemas_created=[main]} +1 1 {tables_created=[main.sort_on_compaction]} +2 1 {tables_inserted_into=[1]} +3 1 {tables_inserted_into=[1]} +4 1 {tables_altered=[1]} +5 1 {tables_altered=[1]} +6 1 {tables_altered=[1]} + +# Do some other kind of alter table on that table to make sure that it does not interfere +statement ok +ALTER TABLE sort_on_compaction ADD COLUMN new_column INTEGER; + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'sort_on_compaction'); + +query IIII +FROM sort_on_compaction +---- +0 0 woot0 NULL +2 0 woot2 NULL +4 0 woot4 NULL +6 0 woot6 NULL +1 1 woot1 NULL +3 1 woot3 NULL +5 1 woot5 NULL +7 1 woot7 NULL + +# We write a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +2 1 + + +# Column does not exist in the table +statement error +ALTER TABLE sort_on_compaction SET SORTED BY (WOOOOT ASC , BLAH ASC); +---- +Column "WOOOOT" does not exist in table "sort_on_compaction" + +# Expressions are not supported, only column refs +statement error +ALTER TABLE sort_on_compaction SET SORTED BY (concat(sort_key_1, '_', sort_key_2) ASC); +---- +SET SORTED BY only supports column references, not expressions + +# Drop the table, then expire the snapshots, then validate that the catalog data has been removed + +# Pre-expiration catalog status: +query IIIIIII +SELECT +si.table_id, si.begin_snapshot, si.end_snapshot, se.sort_key_index, se.expression, se.sort_direction, se.null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_info si +JOIN __ducklake_metadata_ducklake.ducklake_sort_expression se USING (sort_id) +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON si.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +1 4 5 0 sort_key_1 ASC NULLS_LAST +1 4 5 1 sort_key_2 ASC NULLS_LAST +1 5 6 0 sort_key_1 DESC NULLS_LAST +1 5 6 1 sort_key_2 DESC NULLS_LAST +1 6 NULL 0 sort_key_1 ASC NULLS_LAST +1 6 NULL 1 sort_key_2 ASC NULLS_LAST + + +statement ok +drop table sort_on_compaction; + +# History should be present as long as the snapshots are not yet expired +query IIIIIII +SELECT +si.table_id, si.begin_snapshot, si.end_snapshot, se.sort_key_index, se.expression, se.sort_direction, se.null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_info si +JOIN __ducklake_metadata_ducklake.ducklake_sort_expression se USING (sort_id) +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON si.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +1 4 5 0 sort_key_1 ASC NULLS_LAST +1 4 5 1 sort_key_2 ASC NULLS_LAST +1 5 6 0 sort_key_1 DESC NULLS_LAST +1 5 6 1 sort_key_2 DESC NULLS_LAST +1 6 9 0 sort_key_1 ASC NULLS_LAST +1 6 9 1 sort_key_2 ASC NULLS_LAST + +statement ok +CALL ducklake_expire_snapshots('ducklake', older_than => now()); + +# This should be empty now that there are no snapshots remaining that rely on this table +query IIII +SELECT +si.sort_id, si.table_id, si.begin_snapshot, si.end_snapshot +FROM __ducklake_metadata_ducklake.ducklake_sort_info si +WHERE +si.table_id = 1 +---- + +# This should be empty now that there are no snapshots remaining that rely on this table +query IIIIIII +SELECT 
+se.sort_id, se.table_id, se.sort_key_index, se.expression, se.dialect, se.sort_direction, se.null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_expression se +WHERE +se.table_id = 1 +---- + diff --git a/test/sql/sorted_table/merge_adjacent_sorted_drop_recreate.test b/test/sql/sorted_table/merge_adjacent_sorted_drop_recreate.test new file mode 100644 index 00000000000..ca0e8d499eb --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_drop_recreate.test @@ -0,0 +1,105 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_drop_recreate.test +# description: test ducklake merge adjacent files dropping and recreating a table +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +statement ok +CREATE TABLE table_to_recreate (unique_id_original BIGINT, sort_key_1_original BIGINT, sort_key_2_original VARCHAR); + +# SET SORTED BY on the original table copy +statement ok +ALTER TABLE ducklake.table_to_recreate SET SORTED BY (sort_key_1_original ASC NULLS LAST, sort_key_2_original ASC NULLS LAST); + +# DROP and re-CREATE the table, then ensure the old SET SORTED BY does not apply +statement ok +DROP TABLE ducklake.table_to_recreate; + +statement ok +CREATE TABLE table_to_recreate (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO table_to_recreate (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + +statement ok +INSERT INTO table_to_recreate (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM table_to_recreate +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'table_to_recreate' +---- +0 3 +1 3 + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'table_to_recreate'); + + +# The sort order should match the original insert order, not the SET SORTED BY from the old table version +query III +FROM table_to_recreate +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +# We write a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'table_to_recreate' +---- +2 3 + diff --git a/test/sql/sorted_table/merge_adjacent_sorted_renamed.test b/test/sql/sorted_table/merge_adjacent_sorted_renamed.test new file mode 100644 index 00000000000..8c5fe5a64d5 --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_renamed.test @@ -0,0 +1,66 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_renamed.test +# description: if column names have changed, merging adjacent files should fail until sorted by valid column names +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + 
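+ +# Test plan: set a sort key, rename the sort columns so the key goes stale, verify that the +# compaction fails with a binder error, then point the sort key at the new names and retry. + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH 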
'__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +# SET SORTED BY then rename columns - compaction should error on missing columns +statement ok +CREATE TABLE renamed_columns_test (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Rename the sort columns to completely different names +statement ok +ALTER TABLE renamed_columns_test RENAME COLUMN sort_key_1 TO sort_key_1_changed; + +statement ok +ALTER TABLE renamed_columns_test RENAME COLUMN sort_key_2 TO sort_key_2_changed; + +# Insert data twice to create two files +statement ok +INSERT INTO renamed_columns_test (unique_id, sort_key_1_changed, sort_key_2_changed) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1_changed, + 'woot' || i AS sort_key_2_changed +ORDER BY + i DESC +; + +statement ok +INSERT INTO renamed_columns_test (unique_id, sort_key_1_changed, sort_key_2_changed) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1_changed, + 'woot' || (i + 4) AS sort_key_2_changed +ORDER BY + i DESC +; + +# Compaction should fail because the sort columns no longer exist +statement error +CALL ducklake_merge_adjacent_files('ducklake', 'renamed_columns_test'); +---- +Binder Error: Columns in the SET SORTED BY statement were not found in the DuckLake table. Unmatched columns were: sort_key_1, sort_key_2 + +statement ok +ALTER TABLE ducklake.renamed_columns_test SET SORTED BY (sort_key_1_changed ASC NULLS LAST, sort_key_2_changed ASC NULLS LAST); + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'renamed_columns_test'); \ No newline at end of file diff --git a/test/sql/sorted_table/merge_adjacent_sorted_repeated.test b/test/sql/sorted_table/merge_adjacent_sorted_repeated.test new file mode 100644 index 00000000000..b7fef67ffc8 --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_repeated.test @@ -0,0 +1,208 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_repeated.test +# description: ensure that setting identical sort orders does not create extra catalog entries +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +# Multiple SET SORTED BY calls with the same sort should not cause duplicate entries in the catalog +statement ok +CREATE TABLE prevent_duplicates (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO prevent_duplicates (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + +statement ok +INSERT INTO prevent_duplicates (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM prevent_duplicates +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'prevent_duplicates' +---- +0 1 +1 1 + +statement ok +ALTER TABLE ducklake.prevent_duplicates SET SORTED BY 
(sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +statement ok +ALTER TABLE ducklake.prevent_duplicates SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'prevent_duplicates'); + +query III +FROM prevent_duplicates +---- +0 0 woot0 +2 0 woot2 +4 0 woot4 +6 0 woot6 +1 1 woot1 +3 1 woot3 +5 1 woot5 +7 1 woot7 + +# We write a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'prevent_duplicates' +---- +2 1 + +# We should only have 2 rows in the metadata table for this DuckLake table +query IIII +SELECT +si.table_id, si.begin_snapshot, si.end_snapshot, se.sort_key_index +FROM __ducklake_metadata_ducklake.ducklake_sort_info si +JOIN __ducklake_metadata_ducklake.ducklake_sort_expression se USING (sort_id) +WHERE +si.table_id = 1 +---- +1 4 NULL 0 +1 4 NULL 1 + + +# Multiple SET SORTED BY calls with different sort orders should have separate catalog entries +statement ok +CREATE TABLE multiple_set_sorted (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO multiple_set_sorted (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + +statement ok +INSERT INTO multiple_set_sorted (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM multiple_set_sorted +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'multiple_set_sorted' +---- +3 4 +4 4 + +statement ok +ALTER TABLE ducklake.multiple_set_sorted SET SORTED BY (sort_key_1 DESC NULLS FIRST, sort_key_2 DESC NULLS FIRST); + +statement ok +ALTER TABLE ducklake.multiple_set_sorted SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'multiple_set_sorted'); + +query III +FROM multiple_set_sorted +---- +0 0 woot0 +2 0 woot2 +4 0 woot4 +6 0 woot6 +1 1 woot1 +3 1 woot3 +5 1 woot5 +7 1 woot7 + +# We write a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'multiple_set_sorted' +---- +5 4 + +# We should have 4 rows in the metadata table for this DuckLake table +query IIIIIIII +SELECT +si.table_id, si.begin_snapshot, si.end_snapshot, se.sort_key_index, +se.expression, se.dialect, se.sort_direction, se.null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_info si +JOIN __ducklake_metadata_ducklake.ducklake_sort_expression se USING (sort_id) +WHERE +si.table_id = 4 +ORDER BY si.table_id, si.begin_snapshot, si.end_snapshot, se.sort_key_index +---- +4 10 11 0 sort_key_1 duckdb DESC NULLS_FIRST +4 10 11 1 sort_key_2 duckdb DESC NULLS_FIRST +4 11 NULL 0 sort_key_1 duckdb ASC NULLS_LAST +4 11 NULL 1 sort_key_2 duckdb ASC NULLS_LAST diff --git a/test/sql/sorted_table/merge_adjacent_sorted_reset.test 
b/test/sql/sorted_table/merge_adjacent_sorted_reset.test new file mode 100644 index 00000000000..9f9eaa09a8d --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_reset.test @@ -0,0 +1,99 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_reset.test +# description: test ducklake merge adjacent files in original order with RESET SORTED BY +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +# SET SORTED BY then RESET SORTED BY to compact in insertion order +statement ok +CREATE TABLE reset_sorted_test (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO reset_sorted_test (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + +statement ok +INSERT INTO reset_sorted_test (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM reset_sorted_test +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'reset_sorted_test' +---- +0 1 +1 1 + +statement ok +ALTER TABLE ducklake.reset_sorted_test SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +statement ok +ALTER TABLE ducklake.reset_sorted_test RESET SORTED BY; + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'reset_sorted_test'); + +# We should receive the insertion order post compaction due to the RESET SORTED BY +query III +FROM reset_sorted_test +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +# We write a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'reset_sorted_test' +---- +2 1 diff --git a/test/sql/sorted_table/merge_adjacent_sorted_transaction_alter_table_unrelated.test b/test/sql/sorted_table/merge_adjacent_sorted_transaction_alter_table_unrelated.test new file mode 100644 index 00000000000..84f011491f5 --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_transaction_alter_table_unrelated.test @@ -0,0 +1,120 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_transaction_alter_table_unrelated.test +# description: test ducklake merge adjacent files with SET SORTED BY metadata within transactions +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +# Create one table with two files +statement ok +CREATE TABLE sort_on_compaction (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO sort_on_compaction (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + 
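+# A second insert creates a second data file, so ducklake_merge_adjacent_files has two files to merge.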
+statement ok +INSERT INTO sort_on_compaction (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM sort_on_compaction +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +0 1 +1 1 + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.sort_on_compaction SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Do some other kind of alter table on that table to make sure that it does not interfere +statement ok +ALTER TABLE sort_on_compaction ADD COLUMN new_column INTEGER; + +statement ok +COMMIT + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'sort_on_compaction'); + + +# Now we have the new sort order and the new column +query IIII +FROM sort_on_compaction +---- +0 0 woot0 NULL +2 0 woot2 NULL +4 0 woot4 NULL +6 0 woot6 NULL +1 1 woot1 NULL +3 1 woot3 NULL +5 1 woot5 NULL +7 1 woot7 NULL + + +# Ensure that the snapshot changes match expectations +query III +SELECT snapshot_id, schema_version, changes FROM ducklake_snapshots('ducklake') +---- +0 0 {schemas_created=[main]} +1 1 {tables_created=[main.sort_on_compaction]} +2 1 {tables_inserted_into=[1]} +3 1 {tables_inserted_into=[1]} +4 2 {tables_altered=[1]} +5 2 {} + + +# We wrote a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +2 1 diff --git a/test/sql/sorted_table/merge_adjacent_sorted_transaction_renamed.test b/test/sql/sorted_table/merge_adjacent_sorted_transaction_renamed.test new file mode 100644 index 00000000000..4816e8284c1 --- /dev/null +++ b/test/sql/sorted_table/merge_adjacent_sorted_transaction_renamed.test @@ -0,0 +1,139 @@ +# name: test/sql/sorted_table/merge_adjacent_sorted_transaction_renamed.test +# description: test ducklake merge adjacent files with SET SORTED BY metadata within transactions where columns are renamed +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + + + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/merge_adjacent_options_sorted/') + +statement ok +USE ducklake; + +# Create one Table with two files +statement ok +CREATE TABLE sort_on_compaction (unique_id BIGINT, sort_key_1 BIGINT, sort_key_2 VARCHAR); + +statement ok +INSERT INTO sort_on_compaction (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i AS unique_id, + i % 2 AS sort_key_1, + 'woot' || i AS sort_key_2 +ORDER BY + i DESC +; + +statement ok +INSERT INTO sort_on_compaction (unique_id, sort_key_1, sort_key_2) +FROM range(4) t(i) +SELECT + i + 4 AS unique_id, + (i + 4) % 2 AS sort_key_1, + 'woot' || (i + 4) AS sort_key_2 +ORDER BY + i DESC +; + +query III +FROM sort_on_compaction +---- +3 1 woot3 +2 0 woot2 +1 1 woot1 +0 0 woot0 +7 1 woot7 +6 0 woot6 +5 1 woot5 +4 0 woot4 + +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id 
+WHERE +t.table_name = 'sort_on_compaction' +---- +0 1 +1 1 + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.sort_on_compaction SET SORTED BY (sort_key_1 ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +# Alter the column names to make the sorted by constraint stale +statement ok +ALTER TABLE sort_on_compaction RENAME COLUMN sort_key_1 TO sort_key_1_renamed; + +statement ok +COMMIT + +# The compaction should fail due to the stale sort order +statement error +CALL ducklake_merge_adjacent_files('ducklake', 'sort_on_compaction'); +---- +Binder Error: Columns in the SET SORTED BY statement were not found in the DuckLake table. Unmatched columns were: sort_key_1 + +statement ok +BEGIN + +# Rename the sort column again, then point the sort key at the new name +statement ok +ALTER TABLE sort_on_compaction RENAME COLUMN sort_key_1_renamed TO sort_key_1_renamed_again; + +statement ok +ALTER TABLE ducklake.sort_on_compaction SET SORTED BY (sort_key_1_renamed_again ASC NULLS LAST, sort_key_2 ASC NULLS LAST); + +statement ok +COMMIT + +statement ok +CALL ducklake_merge_adjacent_files('ducklake', 'sort_on_compaction'); + +# Now we have the new sort order +query III +FROM sort_on_compaction +---- +0 0 woot0 +2 0 woot2 +4 0 woot4 +6 0 woot6 +1 1 woot1 +3 1 woot3 +5 1 woot5 +7 1 woot7 + + +# Ensure that the snapshot changes match expectations +query III +SELECT snapshot_id, schema_version, changes FROM ducklake_snapshots('ducklake') +---- +0 0 {schemas_created=[main]} +1 1 {tables_created=[main.sort_on_compaction]} +2 1 {tables_inserted_into=[1]} +3 1 {tables_inserted_into=[1]} +4 2 {tables_altered=[1]} +5 3 {tables_altered=[1]} +6 3 {} + + +# We wrote a new file for the table +query II +SELECT df.data_file_id, df.table_id +FROM __ducklake_metadata_ducklake.ducklake_data_file df +JOIN __ducklake_metadata_ducklake.ducklake_table t +ON df.table_id = t.table_id +WHERE +t.table_name = 'sort_on_compaction' +---- +2 1 diff --git a/test/sql/sorted_table/reset_sorted_by_schema_version.test b/test/sql/sorted_table/reset_sorted_by_schema_version.test new file mode 100644 index 00000000000..9bbe6700ee3 --- /dev/null +++ b/test/sql/sorted_table/reset_sorted_by_schema_version.test @@ -0,0 +1,44 @@ +# name: test/sql/sorted_table/reset_sorted_by_schema_version.test +# description: Verify RESET SORTED BY does not increment schema_version +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/reset_schema_test/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +statement ok +ALTER TABLE ducklake.test RESET SORTED BY; + +# RESET SORTED BY should also NOT increment schema_version +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# Verify sort data was cleared (end_snapshot set) +query I +SELECT COUNT(*) FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0
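+ +# (RESET SORTED BY retires the open ducklake_sort_info row by setting its end_snapshot rather than +# deleting it, so the sort history stays queryable until the snapshots are expired) diff --git a/test/sql/sorted_table/schema_version_mixed_operations.test b/test/sql/sorted_table/schema_version_mixed_operations.test new file mode 100644 index 00000000000..d433146deff --- /dev/null +++ 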
b/test/sql/sorted_table/schema_version_mixed_operations.test @@ -0,0 +1,74 @@ +# name: test/sql/sorted_table/schema_version_mixed_operations.test +# description: Verify schema_version increments correctly with mixed operations +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/mixed_ops_test/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# SET SORTED BY should NOT increment +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# ADD COLUMN SHOULD increment schema_version +statement ok +ALTER TABLE ducklake.test ADD COLUMN c BIGINT; + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# SET SORTED BY after ADD COLUMN should NOT increment +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a DESC, c ASC); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# RENAME COLUMN SHOULD increment schema_version +statement ok +ALTER TABLE ducklake.test RENAME COLUMN b TO b_renamed; + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +3 + +# SET SORTED BY with new column name should NOT increment +statement ok +ALTER TABLE ducklake.test SET SORTED BY (b_renamed ASC); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +3 + +# DROP COLUMN SHOULD increment schema_version +statement ok +ALTER TABLE ducklake.test DROP COLUMN c; + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +4 diff --git a/test/sql/sorted_table/schema_version_same_transaction.test b/test/sql/sorted_table/schema_version_same_transaction.test new file mode 100644 index 00000000000..3a0e64f42ad --- /dev/null +++ b/test/sql/sorted_table/schema_version_same_transaction.test @@ -0,0 +1,73 @@ +# name: test/sql/sorted_table/schema_version_same_transaction.test +# description: Verify schema_version with multiple operations in same transaction +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/same_txn_test/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# Multiple SET SORTED BY in same transaction - should NOT increment +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC); + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (b DESC); + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a DESC, b ASC); + +statement ok +COMMIT + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# SET SORTED BY + ADD COLUMN in same transaction - SHOULD increment (once) +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC); + +statement ok +ALTER TABLE ducklake.test ADD COLUMN c BIGINT; + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC, c DESC); + +statement ok +COMMIT + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2 + +# Verify sort data was committed correctly +query III +SELECT expression, sort_direction, null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_expression se +JOIN 
__ducklake_metadata_ducklake.ducklake_sort_info si USING (sort_id) +WHERE si.table_id = 1 AND si.end_snapshot IS NULL +ORDER BY se.sort_key_index +---- +a ASC NULLS_LAST +c DESC NULLS_LAST diff --git a/test/sql/sorted_table/set_sorted_by_rollback_basic.test b/test/sql/sorted_table/set_sorted_by_rollback_basic.test new file mode 100644 index 00000000000..1ae403a7ce5 --- /dev/null +++ b/test/sql/sorted_table/set_sorted_by_rollback_basic.test @@ -0,0 +1,104 @@ +# name: test/sql/sorted_table/set_sorted_by_rollback_basic.test +# description: Verify SET SORTED BY is rolled back correctly when transaction is rolled back +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/rollback_basic/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +# Record initial state +query I +SELECT max(snapshot_id) FROM ducklake.snapshots() +---- +1 + +# Verify no sort info exists initially +query I +SELECT COUNT(*) +FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0 + +# BEGIN transaction and SET SORTED BY +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC NULLS LAST); + +# ROLLBACK the transaction +statement ok +ROLLBACK + +# Verify snapshot_id has NOT changed (no new snapshot created) +query I +SELECT max(snapshot_id) FROM ducklake.snapshots() +---- +1 + +# Verify sort info does NOT exist after rollback +query I +SELECT COUNT(*) +FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0 + +# ============================================ +# Test multiple SET SORTED BY then rollback +# ============================================ + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC); + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (b DESC); + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a DESC, b ASC); + +statement ok +ROLLBACK + +# Verify no changes persisted +query I +SELECT max(snapshot_id) FROM ducklake.snapshots() +---- +1 + +query I +SELECT COUNT(*) +FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0 + +# ============================================ +# Verify SET SORTED BY still works after rollback +# ============================================ + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC NULLS LAST, b DESC NULLS LAST); + +# This should now have sort info committed +query III +SELECT expression, sort_direction, null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_expression se +JOIN __ducklake_metadata_ducklake.ducklake_sort_info si USING (sort_id) +WHERE si.table_id = 1 AND si.end_snapshot IS NULL +ORDER BY se.sort_key_index +---- +a ASC NULLS_LAST +b DESC NULLS_LAST diff --git a/test/sql/sorted_table/set_sorted_by_rollback_mixed.test b/test/sql/sorted_table/set_sorted_by_rollback_mixed.test new file mode 100644 index 00000000000..fea6b01fb0e --- /dev/null +++ b/test/sql/sorted_table/set_sorted_by_rollback_mixed.test @@ -0,0 +1,209 @@ +# name: test/sql/sorted_table/set_sorted_by_rollback_mixed.test +# description: Verify SET SORTED BY with other ALTER TABLE operations is rolled back correctly +# group: [sorted_table] + +require ducklake + +require parquet + +test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db + +statement ok +ATTACH 
'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '__TEST_DIR__/rollback_mixed/') + +statement ok +CREATE TABLE ducklake.test(a INTEGER, b VARCHAR); + +# Record initial state +query I +SELECT max(snapshot_id) FROM ducklake.snapshots() +---- +1 + +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# Verify initial column structure +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b + +# ============================================ +# SET SORTED BY + ADD COLUMN then ROLLBACK +# ============================================ + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC NULLS LAST); + +statement ok +ALTER TABLE ducklake.test ADD COLUMN c BIGINT; + +# Verify changes visible within transaction +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b +c + +statement ok +ROLLBACK + +# Verify column was rolled back +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b + +# Verify sort info was rolled back +query I +SELECT COUNT(*) +FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0 + +# Verify snapshot_id unchanged +query I +SELECT max(snapshot_id) FROM ducklake.snapshots() +---- +1 + +# Verify schema_version unchanged +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +1 + +# ============================================ +# ADD COLUMN + SET SORTED BY (reverse order) then ROLLBACK +# ============================================ + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test ADD COLUMN c BIGINT; + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC, c DESC); + +# Verify changes visible within transaction +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b +c + +statement ok +ROLLBACK + +# Verify both changes were rolled back +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b + +query I +SELECT COUNT(*) +FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0 + +# ============================================ +# Multiple ALTER TABLE operations then ROLLBACK +# ============================================ + +statement ok +BEGIN + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC); + +statement ok +ALTER TABLE ducklake.test ADD COLUMN c BIGINT; + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC, c DESC); + +statement ok +ALTER TABLE ducklake.test ADD COLUMN d VARCHAR; + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC, c DESC, d ASC); + +# Verify all changes visible within transaction +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b +c +d + +statement ok +ROLLBACK + +# Verify all changes were rolled back +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b + +query I +SELECT COUNT(*) +FROM __ducklake_metadata_ducklake.ducklake_sort_info +WHERE table_id = 1 AND end_snapshot IS NULL +---- +0 + +query I +SELECT max(snapshot_id) FROM ducklake.snapshots() +---- +1 + +# ============================================ +# Verify operations still work after rollback +# ============================================ + +statement ok +ALTER TABLE ducklake.test 
ADD COLUMN c BIGINT; + +statement ok +ALTER TABLE ducklake.test SET SORTED BY (a ASC NULLS LAST, c DESC NULLS LAST); + +# Verify changes were committed +query I +SELECT column_name FROM duckdb_columns() WHERE table_name = 'test' ORDER BY column_index +---- +a +b +c + +query III +SELECT expression, sort_direction, null_order +FROM __ducklake_metadata_ducklake.ducklake_sort_expression se +JOIN __ducklake_metadata_ducklake.ducklake_sort_info si USING (sort_id) +WHERE si.table_id = 1 AND si.end_snapshot IS NULL +ORDER BY se.sort_key_index +---- +a ASC NULLS_LAST +c DESC NULLS_LAST + +# schema_version should have incremented once for ADD COLUMN +query I +SELECT max(schema_version) FROM ducklake.snapshots() +---- +2
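+ +# (the trailing SET SORTED BY left schema_version at 2; only the ADD COLUMN bumped it, matching the earlier tests)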