From 152fc42e76d1dfb9597069e5c91818a8edeb1852 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 26 Feb 2025 11:42:14 -0300 Subject: [PATCH 01/42] Add suport for s3 hive partition style writes --- .../StorageObjectStorageSink.cpp | 23 +++++ .../ObjectStorage/StorageObjectStorageSink.h | 1 + src/Storages/PartitionedSink.cpp | 98 +++++++++++++++++-- src/Storages/PartitionedSink.h | 7 +- 4 files changed, 119 insertions(+), 10 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index bd7cee407e62..08a012aef68a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -143,6 +143,29 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String ); } +SinkPtr PartitionedStorageObjectStorageSink::createSinkForHivePartition(const std::string & partition_id) +{ + validateNamespace(configuration->getNamespace()); + + auto partition_key = replaceWildcards(configuration->getPath(), partition_id); + validateKey(partition_key); + + if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( + *object_storage, *configuration, query_settings, partition_key, /* sequence_number */1)) + { + partition_key = *new_key; + } + + return std::make_shared( + object_storage, + configuration, + format_settings, + sample_block, + context, + partition_key); +} + + void PartitionedStorageObjectStorageSink::validateKey(const String & str) { /// See: diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 97fd3d9b4179..e64f05d65be7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -50,6 +50,7 @@ class PartitionedStorageObjectStorageSink : public PartitionedSink const ASTPtr & partition_by); SinkPtr createSinkForPartition(const String & partition_id) override; + SinkPtr createSinkForHivePartition(const std::string & partition_id) override; private: void validateKey(const String & str); diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index 0f93d1a5b75c..731577b3a91c 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -3,6 +3,9 @@ #include "PartitionedSink.h" #include +#include +#include +#include #include #include @@ -22,6 +25,87 @@ namespace ErrorCodes extern const int CANNOT_PARSE_TEXT; } +namespace Setting +{ +extern const SettingsBool use_hive_partitioning; +} + +namespace +{ + +Names extractPartitionRequiredColumns(const ASTPtr & partition_by, const Block & sample_block, ContextPtr context) +{ + auto pby_clone = partition_by->clone(); + auto syntax_result = TreeRewriter(context).analyze(pby_clone, sample_block.getNamesAndTypesList()); + auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); + return exp_analyzer->getRequiredColumns(); +} + +/* + * Constructs expression that generates a hive style partition. + * partition by (year, country, city) on hive partition style looks like the following: 'year=2022/country=brazil/state=sc' + * It does that using concat function on partition column names and values. 
+ * `concat(partition_column_names[0], '=', toString(partition_value[0]), '/', ...)` + * */ +void buildHivePartitionExpressionAnalyzer(const ASTFunction * tuple_function, + const ASTPtr partition_by, + ExpressionActionsPtr & partition_by_expr, + String & partition_by_column_name, + const Block & sample_block, + ContextPtr context) +{ + const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); + ASTs concat_args; + + chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); + + for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) + { + const auto & child = tuple_function->arguments->children[i]; + + concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); + + concat_args.push_back(makeASTFunction("toString", child)); + + if (i != tuple_function->arguments->children.size() - 1) + { + concat_args.push_back(std::make_shared("/")); + } + } + + ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); + auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); + partition_by_expr = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); + partition_by_column_name = hive_expr->getColumnName(); +} + +void buildPartitionExpressionAnalyzer( + const ASTPtr partition_by, + ExpressionActionsPtr & partition_by_expr, + String & partition_by_column_name, + const Block & sample_block, + ContextPtr context) +{ + if (context->getSettingsRef()[Setting::use_hive_partitioning]) + { + // in case the partition by expression results in a tuple, the vanilla `toString` won't do the trick for hive style + if (const auto * tuple_func = partition_by->as(); + tuple_func && tuple_func->name == "tuple") + { + buildHivePartitionExpressionAnalyzer(tuple_func, partition_by, partition_by_expr, partition_by_column_name, sample_block, context); + return; + } + } + + ASTs arguments(1, partition_by); + ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); + auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); + partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); + partition_by_column_name = partition_by_string->getColumnName(); +} + +} + PartitionedSink::PartitionedSink( const ASTPtr & partition_by, ContextPtr context_, @@ -30,21 +114,17 @@ PartitionedSink::PartitionedSink( , context(context_) , sample_block(sample_block_) { - ASTs arguments(1, partition_by); - ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); - - auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); - partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); - partition_by_column_name = partition_by_string->getColumnName(); + buildPartitionExpressionAnalyzer(partition_by, partition_by_expr, partition_by_column_name, sample_block, context); } -SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) +SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key, bool hive) { auto it = partition_id_to_sink.find(partition_key); if (it == partition_id_to_sink.end()) { - auto sink = createSinkForPartition(partition_key.toString()); + auto partition_key_str = partition_key.toString(); + auto sink = hive ? 
createSinkForHivePartition(partition_key_str) : createSinkForPartition(partition_key_str); std::tie(it, std::ignore) = partition_id_to_sink.emplace(partition_key, sink); } @@ -103,7 +183,7 @@ void PartitionedSink::consume(Chunk & chunk) for (const auto & [partition_key, partition_index] : partition_id_to_chunk_index) { - auto sink = getSinkForPartitionKey(partition_key); + auto sink = getSinkForPartitionKey(partition_key, context->getSettingsRef()[Setting::use_hive_partitioning]); sink->consume(partition_index_to_chunk[partition_index]); } } diff --git a/src/Storages/PartitionedSink.h b/src/Storages/PartitionedSink.h index 6487eaecfd12..14772afa17de 100644 --- a/src/Storages/PartitionedSink.h +++ b/src/Storages/PartitionedSink.h @@ -30,6 +30,11 @@ class PartitionedSink : public SinkToStorage virtual SinkPtr createSinkForPartition(const String & partition_id) = 0; + virtual SinkPtr createSinkForHivePartition(const String &) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Hive partition style not implemented for storage {}", getName()); + } + static void validatePartitionKey(const String & str, bool allow_slash); static String replaceWildcards(const String & haystack, const String & partition_id); @@ -46,7 +51,7 @@ class PartitionedSink : public SinkToStorage IColumn::Selector chunk_row_index_to_partition_index; Arena partition_keys_arena; - SinkPtr getSinkForPartitionKey(StringRef partition_key); + SinkPtr getSinkForPartitionKey(StringRef partition_key, bool hive); }; } From 9af41a72e57202627e564d19f0e19555627904ce Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 26 Feb 2025 11:50:21 -0300 Subject: [PATCH 02/42] storageurl and storagefile --- src/Storages/StorageFile.cpp | 5 +++++ src/Storages/StorageURL.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index ba29d02e5b35..0c4501a78c65 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1943,6 +1943,11 @@ class PartitionedStorageFileSink : public PartitionedSink flags); } + SinkPtr createSinkForHivePartition(const String & partition_id) override + { + return createSinkForPartition(partition_id); + } + private: const String path; StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 382e2cfc658e..83a8ecfdc3e1 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -724,6 +724,11 @@ class PartitionedStorageURLSink : public PartitionedSink partition_path, format, format_settings, sample_block, context, timeouts, compression_method, headers, http_method); } + SinkPtr createSinkForHivePartition(const String & partition_id) override + { + return createSinkForPartition(partition_id); + } + private: const String uri; const String format; From 1aa7db47f6b7e035066721dbd5151be629ba6bd2 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 26 Feb 2025 12:11:36 -0300 Subject: [PATCH 03/42] simplify code --- .../StorageObjectStorageSink.cpp | 23 ------------------- .../ObjectStorage/StorageObjectStorageSink.h | 1 - src/Storages/PartitionedSink.cpp | 6 ++--- src/Storages/PartitionedSink.h | 7 +----- src/Storages/StorageFile.cpp | 5 ---- src/Storages/StorageURL.cpp | 5 ---- 6 files changed, 4 insertions(+), 43 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 08a012aef68a..bd7cee407e62 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ 
b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -143,29 +143,6 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String ); } -SinkPtr PartitionedStorageObjectStorageSink::createSinkForHivePartition(const std::string & partition_id) -{ - validateNamespace(configuration->getNamespace()); - - auto partition_key = replaceWildcards(configuration->getPath(), partition_id); - validateKey(partition_key); - - if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( - *object_storage, *configuration, query_settings, partition_key, /* sequence_number */1)) - { - partition_key = *new_key; - } - - return std::make_shared( - object_storage, - configuration, - format_settings, - sample_block, - context, - partition_key); -} - - void PartitionedStorageObjectStorageSink::validateKey(const String & str) { /// See: diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index e64f05d65be7..97fd3d9b4179 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -50,7 +50,6 @@ class PartitionedStorageObjectStorageSink : public PartitionedSink const ASTPtr & partition_by); SinkPtr createSinkForPartition(const String & partition_id) override; - SinkPtr createSinkForHivePartition(const std::string & partition_id) override; private: void validateKey(const String & str); diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index 731577b3a91c..1d6ac70867bb 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -118,13 +118,13 @@ PartitionedSink::PartitionedSink( } -SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key, bool hive) +SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) { auto it = partition_id_to_sink.find(partition_key); if (it == partition_id_to_sink.end()) { auto partition_key_str = partition_key.toString(); - auto sink = hive ? 
createSinkForHivePartition(partition_key_str) : createSinkForPartition(partition_key_str); + auto sink = createSinkForPartition(partition_key_str); std::tie(it, std::ignore) = partition_id_to_sink.emplace(partition_key, sink); } @@ -183,7 +183,7 @@ void PartitionedSink::consume(Chunk & chunk) for (const auto & [partition_key, partition_index] : partition_id_to_chunk_index) { - auto sink = getSinkForPartitionKey(partition_key, context->getSettingsRef()[Setting::use_hive_partitioning]); + auto sink = getSinkForPartitionKey(partition_key); sink->consume(partition_index_to_chunk[partition_index]); } } diff --git a/src/Storages/PartitionedSink.h b/src/Storages/PartitionedSink.h index 14772afa17de..6487eaecfd12 100644 --- a/src/Storages/PartitionedSink.h +++ b/src/Storages/PartitionedSink.h @@ -30,11 +30,6 @@ class PartitionedSink : public SinkToStorage virtual SinkPtr createSinkForPartition(const String & partition_id) = 0; - virtual SinkPtr createSinkForHivePartition(const String &) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Hive partition style not implemented for storage {}", getName()); - } - static void validatePartitionKey(const String & str, bool allow_slash); static String replaceWildcards(const String & haystack, const String & partition_id); @@ -51,7 +46,7 @@ class PartitionedSink : public SinkToStorage IColumn::Selector chunk_row_index_to_partition_index; Arena partition_keys_arena; - SinkPtr getSinkForPartitionKey(StringRef partition_key, bool hive); + SinkPtr getSinkForPartitionKey(StringRef partition_key); }; } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 0c4501a78c65..ba29d02e5b35 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1943,11 +1943,6 @@ class PartitionedStorageFileSink : public PartitionedSink flags); } - SinkPtr createSinkForHivePartition(const String & partition_id) override - { - return createSinkForPartition(partition_id); - } - private: const String path; StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 83a8ecfdc3e1..382e2cfc658e 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -724,11 +724,6 @@ class PartitionedStorageURLSink : public PartitionedSink partition_path, format, format_settings, sample_block, context, timeouts, compression_method, headers, http_method); } - SinkPtr createSinkForHivePartition(const String & partition_id) override - { - return createSinkForPartition(partition_id); - } - private: const String uri; const String format; From 4f2ef27d0a0f0587168608eb83ac41bb0ad1a484 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 26 Feb 2025 12:28:21 -0300 Subject: [PATCH 04/42] reduce changes --- src/Storages/PartitionedSink.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index 1d6ac70867bb..ba79213440f8 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -123,8 +123,7 @@ SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) auto it = partition_id_to_sink.find(partition_key); if (it == partition_id_to_sink.end()) { - auto partition_key_str = partition_key.toString(); - auto sink = createSinkForPartition(partition_key_str); + auto sink = createSinkForPartition(partition_key.toString()); std::tie(it, std::ignore) = partition_id_to_sink.emplace(partition_key, sink); } From 301e61e0d3aa802dbd4e6b3ccbd54672ff7a2533 Mon Sep 17 00:00:00 2001 
From: Arthur Passos Date: Mon, 3 Mar 2025 15:38:55 -0300 Subject: [PATCH 05/42] add tests for s3, enforce some rules --- .../ObjectStorage/StorageObjectStorage.cpp | 2 +- .../StorageObjectStorageSink.cpp | 83 +++++++++++-------- .../registerStorageObjectStorage.cpp | 14 +++- src/Storages/PartitionedSink.cpp | 32 +++++-- src/Storages/PartitionedSink.h | 6 +- src/Storages/StorageFile.cpp | 21 +++-- src/Storages/StorageURL.cpp | 14 +++- ...03363_hive_style_partition_write.reference | 11 +++ .../03363_hive_style_partition_write.sql | 28 +++++++ 9 files changed, 158 insertions(+), 53 deletions(-) create mode 100644 tests/queries/0_stateless/03363_hive_style_partition_write.reference create mode 100644 tests/queries/0_stateless/03363_hive_style_partition_write.sql diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 1ca4383a82fc..21fbc7e0a682 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -380,7 +380,7 @@ SinkToStoragePtr StorageObjectStorage::write( configuration->getPath()); } - if (configuration->withPartitionWildcard()) + if (partition_by || configuration->withPartitionWildcard()) { ASTPtr partition_by_ast = nullptr; if (auto insert_query = std::dynamic_pointer_cast(query)) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index bd7cee407e62..c0ac6ad81094 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -20,6 +20,50 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +namespace +{ + void validateKey(const String & str) + { + /// See: + /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html + /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject + + if (str.empty() || str.size() > 1024) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); + + if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); + + PartitionedSink::validatePartitionKey(str, true); + } + + void validateNamespace(const String & str, PartitionedStorageObjectStorageSink::ConfigurationPtr configuration) + { + configuration->validateNamespace(str); + + if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); + + PartitionedSink::validatePartitionKey(str, false); + } + + std::string getFilePath(const String & partition_id, PartitionedStorageObjectStorageSink::ConfigurationPtr configuration) + { + if (!configuration->withPartitionWildcard()) + { + return configuration->getPath() + "/" + partition_id; + } + + auto partition_bucket = PartitionedSink::replaceWildcards(configuration->getNamespace(), partition_id); + validateNamespace(partition_bucket, configuration); + + auto partition_key = PartitionedSink::replaceWildcards(configuration->getPath(), partition_id); + validateKey(partition_key); + + return partition_key; + } +} + StorageObjectStorageSink::StorageObjectStorageSink( ObjectStoragePtr object_storage, ConfigurationPtr configuration, @@ -103,7 +147,7 @@ PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( const Block & sample_block_, ContextPtr context_, const ASTPtr & partition_by) - : 
PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_, configuration_->format) , object_storage(object_storage_) , configuration(configuration_) , query_settings(configuration_->getQuerySettings(context_)) @@ -121,16 +165,12 @@ StorageObjectStorageSink::~StorageObjectStorageSink() SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id) { - auto partition_bucket = replaceWildcards(configuration->getNamespace(), partition_id); - validateNamespace(partition_bucket); - - auto partition_key = replaceWildcards(configuration->getPath(), partition_id); - validateKey(partition_key); + auto file_path = getFilePath(partition_id, configuration); if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( - *object_storage, *configuration, query_settings, partition_key, /* sequence_number */1)) + *object_storage, *configuration, query_settings, file_path, /* sequence_number */1)) { - partition_key = *new_key; + file_path = *new_key; } return std::make_shared( @@ -139,33 +179,8 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String format_settings, sample_block, context, - partition_key + file_path ); } -void PartitionedStorageObjectStorageSink::validateKey(const String & str) -{ - /// See: - /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html - /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject - - if (str.empty() || str.size() > 1024) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); - - if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); - - validatePartitionKey(str, true); -} - -void PartitionedStorageObjectStorageSink::validateNamespace(const String & str) -{ - configuration->validateNamespace(str); - - if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); - - validatePartitionKey(str, false); -} - } diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 4c7c8abebe11..6768d91092cf 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -15,6 +15,12 @@ namespace DB { +namespace Setting +{ +extern const SettingsBool use_hive_partitioning; +} + + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -69,8 +75,12 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); - return std::make_shared( - cluster_name, + if (context->getSettingsRef()[Setting::use_hive_partitioning] && configuration->withPartitionWildcard()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The _partition_id macro can't be used with hive partitioning"); + } + + return std::make_shared( configuration, configuration->createObjectStorage(context, /* is_readonly */ false), args.getContext(), diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index ba79213440f8..5321d00dc610 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -41,6 +41,21 @@ Names extractPartitionRequiredColumns(const ASTPtr & partition_by, const Block & return 
exp_analyzer->getRequiredColumns(); } +static std::string formatToFileExtension(const std::string & format) +{ + if (format == "Parquet") + { + return "parquet"; + } + + if (format == "CSV") + { + return "csv"; + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for format {}", format); +} + /* * Constructs expression that generates a hive style partition. * partition by (year, country, city) on hive partition style looks like the following: 'year=2022/country=brazil/state=sc' @@ -48,6 +63,7 @@ Names extractPartitionRequiredColumns(const ASTPtr & partition_by, const Block & * `concat(partition_column_names[0], '=', toString(partition_value[0]), '/', ...)` * */ void buildHivePartitionExpressionAnalyzer(const ASTFunction * tuple_function, + const std::string & format, const ASTPtr partition_by, ExpressionActionsPtr & partition_by_expr, String & partition_by_column_name, @@ -67,12 +83,12 @@ void buildHivePartitionExpressionAnalyzer(const ASTFunction * tuple_function, concat_args.push_back(makeASTFunction("toString", child)); - if (i != tuple_function->arguments->children.size() - 1) - { - concat_args.push_back(std::make_shared("/")); - } + concat_args.push_back(std::make_shared("/")); } + concat_args.push_back(makeASTFunction("generateSnowflakeID")); + concat_args.push_back(std::make_shared("." + formatToFileExtension(format))); + ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); partition_by_expr = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); @@ -81,6 +97,7 @@ void buildHivePartitionExpressionAnalyzer(const ASTFunction * tuple_function, void buildPartitionExpressionAnalyzer( const ASTPtr partition_by, + const std::string & format, ExpressionActionsPtr & partition_by_expr, String & partition_by_column_name, const Block & sample_block, @@ -92,7 +109,7 @@ void buildPartitionExpressionAnalyzer( if (const auto * tuple_func = partition_by->as(); tuple_func && tuple_func->name == "tuple") { - buildHivePartitionExpressionAnalyzer(tuple_func, partition_by, partition_by_expr, partition_by_column_name, sample_block, context); + buildHivePartitionExpressionAnalyzer(tuple_func, format, partition_by, partition_by_expr, partition_by_column_name, sample_block, context); return; } } @@ -109,12 +126,13 @@ void buildPartitionExpressionAnalyzer( PartitionedSink::PartitionedSink( const ASTPtr & partition_by, ContextPtr context_, - const Block & sample_block_) + const Block & sample_block_, + const std::string & format) : SinkToStorage(sample_block_) , context(context_) , sample_block(sample_block_) { - buildPartitionExpressionAnalyzer(partition_by, partition_by_expr, partition_by_column_name, sample_block, context); + buildPartitionExpressionAnalyzer(partition_by, format, partition_by_expr, partition_by_column_name, sample_block, context); } diff --git a/src/Storages/PartitionedSink.h b/src/Storages/PartitionedSink.h index 6487eaecfd12..9a56eb639fc7 100644 --- a/src/Storages/PartitionedSink.h +++ b/src/Storages/PartitionedSink.h @@ -16,7 +16,11 @@ class PartitionedSink : public SinkToStorage public: static constexpr auto PARTITION_ID_WILDCARD = "{_partition_id}"; - PartitionedSink(const ASTPtr & partition_by, ContextPtr context_, const Block & sample_block_); + PartitionedSink( + const ASTPtr & partition_by, + ContextPtr context_, + const Block & sample_block_, + const std::string & format); ~PartitionedSink() override; diff --git 
a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index ba29d02e5b35..ba0a345a0a8c 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1907,7 +1907,7 @@ class PartitionedStorageFileSink : public PartitionedSink const String format_name_, ContextPtr context_, int flags_) - : PartitionedSink(partition_by, context_, metadata_snapshot_->getSampleBlock()) + : PartitionedSink(partition_by, context_, metadata_snapshot_->getSampleBlock(), format_name_) , path(path_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) @@ -1923,19 +1923,28 @@ class PartitionedStorageFileSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - auto partition_path = PartitionedSink::replaceWildcards(path, partition_id); + std::string filepath; - fs::create_directories(fs::path(partition_path).parent_path()); + if (path.contains("{_partition_id}")) + { + filepath = PartitionedSink::replaceWildcards(path, partition_id); + } + else + { + filepath = path + "/" + partition_id; + } + + fs::create_directories(fs::path(filepath).parent_path()); - PartitionedSink::validatePartitionKey(partition_path, true); - checkCreationIsAllowed(context, context->getUserFilesPath(), partition_path, /*can_be_directory=*/ true); + PartitionedSink::validatePartitionKey(filepath, true); + checkCreationIsAllowed(context, context->getUserFilesPath(), filepath, /*can_be_directory=*/ true); return std::make_shared( metadata_snapshot, table_name_for_log, -1, /* use_table_fd */false, base_path, - partition_path, + filepath, compression_method, format_settings, format_name, diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 382e2cfc658e..782bde47a78f 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -703,7 +703,7 @@ class PartitionedStorageURLSink : public PartitionedSink const CompressionMethod compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_, format_) , uri(uri_) , format(format_) , format_settings(format_settings_) @@ -718,7 +718,17 @@ class PartitionedStorageURLSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - auto partition_path = PartitionedSink::replaceWildcards(uri, partition_id); + std::string partition_path; + + if (uri.contains("{_partition_id}")) + { + partition_path = PartitionedSink::replaceWildcards(uri, partition_id); + } + else + { + partition_path = uri + "/" + partition_id; + } + context->getRemoteHostFilter().checkURL(Poco::URI(partition_path)); return std::make_shared( partition_path, format, format_settings, sample_block, context, timeouts, compression_method, headers, http_method); diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.reference b/tests/queries/0_stateless/03363_hive_style_partition_write.reference new file mode 100644 index 000000000000..872a65d43225 --- /dev/null +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.reference @@ -0,0 +1,11 @@ +test/t_03363_s3_sink_root/year=2022/country=USA/.parquet 1 +test/t_03363_s3_sink_root/year=2022/country=Canada/.parquet 2 +test/t_03363_s3_sink_root/year=2023/country=USA/.parquet 3 +test/t_03363_s3_sink_root/year=2023/country=Mexico/.parquet 4 +test/t_03363_s3_sink_root/year=2024/country=France/.parquet 5 +test/t_03363_s3_sink_root/year=2024/country=Germany/.parquet 6 
+test/t_03363_s3_sink_root/year=2024/country=Germany/.parquet 7 +test/t_03363_s3_sink_root/year=1999/country=Brazil/.parquet 8 +test/t_03363_s3_sink_root/year=2100/country=Japan/.parquet 9 +test/t_03363_s3_sink_root/year=2024/country=/.parquet 10 +test/t_03363_s3_sink_root/year=2024/country=CN/.parquet 11 diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql new file mode 100644 index 000000000000..06d44d205557 --- /dev/null +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -0,0 +1,28 @@ +-- Tags: no-parallel, no-fasttest, no-random-settings + +DROP TABLE IF EXISTS t_03363_s3_sink, t_03363_s3_sink_read; + +CREATE TABLE t_03363_s3_sink (year UInt16, country String, random UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root', format = Parquet) +PARTITION BY (year, country); + +INSERT INTO t_03363_s3_sink SETTINGS use_hive_partitioning=1, s3_truncate_on_insert=1 VALUES + (2022, 'USA', 1), + (2022, 'Canada', 2), + (2023, 'USA', 3), + (2023, 'Mexico', 4), + (2024, 'France', 5), + (2024, 'Germany', 6), + (2024, 'Germany', 7), + (1999, 'Brazil', 8), + (2100, 'Japan', 9), + (2024, '', 10), + (2024, 'CN', 11); + +-- while reading from object storage partitioned table is not supported, an auxiliary table must be created +CREATE TABLE t_03363_s3_sink_read (year UInt16, country String, random UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root/**.parquet', format = Parquet); + +select replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, * from t_03363_s3_sink_read order by random SETTINGS use_hive_partitioning=1; + +DROP TABLE t_03363_s3_sink, t_03363_s3_sink_read; From 7d25cce5a88d829d17e67eb0dd568f2c7f418dd0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Mar 2025 09:56:05 -0300 Subject: [PATCH 06/42] some refactoring --- src/Functions/generateSnowflakeID.cpp | 8 + src/Functions/generateSnowflakeID.h | 10 ++ .../ObjectStorage/StorageObjectStorage.cpp | 3 +- .../StorageObjectStorageSink.cpp | 27 +-- .../ObjectStorage/StorageObjectStorageSink.h | 7 +- src/Storages/PartitionStrategy.cpp | 157 ++++++++++++++++++ src/Storages/PartitionStrategy.h | 54 ++++++ src/Storages/PartitionedSink.cpp | 117 ++----------- src/Storages/PartitionedSink.h | 9 +- src/Storages/StorageFile.cpp | 22 +-- src/Storages/StorageURL.cpp | 19 +-- 11 files changed, 271 insertions(+), 162 deletions(-) create mode 100644 src/Functions/generateSnowflakeID.h create mode 100644 src/Storages/PartitionStrategy.cpp create mode 100644 src/Storages/PartitionStrategy.h diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index c95e3edf4ca6..76f7e252ae16 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -147,6 +148,13 @@ struct Data } +uint64_t generateSnowflakeID() +{ + Data data; + SnowflakeId snowflake_id = data.reserveRange(getMachineId(), 1); + return fromSnowflakeId(snowflake_id); +} + class FunctionGenerateSnowflakeID : public IFunction { public: diff --git a/src/Functions/generateSnowflakeID.h b/src/Functions/generateSnowflakeID.h new file mode 100644 index 000000000000..954a919c2dee --- /dev/null +++ b/src/Functions/generateSnowflakeID.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +uint64_t generateSnowflakeID(); + +} diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp 
b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 21fbc7e0a682..71d3df136c1c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -393,8 +393,9 @@ SinkToStoragePtr StorageObjectStorage::write( if (partition_by_ast) { + auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format); return std::make_shared( - object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); + partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index c0ac6ad81094..459a498aa2f4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -46,22 +46,6 @@ namespace PartitionedSink::validatePartitionKey(str, false); } - - std::string getFilePath(const String & partition_id, PartitionedStorageObjectStorageSink::ConfigurationPtr configuration) - { - if (!configuration->withPartitionWildcard()) - { - return configuration->getPath() + "/" + partition_id; - } - - auto partition_bucket = PartitionedSink::replaceWildcards(configuration->getNamespace(), partition_id); - validateNamespace(partition_bucket, configuration); - - auto partition_key = PartitionedSink::replaceWildcards(configuration->getPath(), partition_id); - validateKey(partition_key); - - return partition_key; - } } StorageObjectStorageSink::StorageObjectStorageSink( @@ -141,13 +125,13 @@ void StorageObjectStorageSink::cancelBuffers() } PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( + std::shared_ptr partition_strategy_, ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, std::optional format_settings_, const Block & sample_block_, - ContextPtr context_, - const ASTPtr & partition_by) - : PartitionedSink(partition_by, context_, sample_block_, configuration_->format) + ContextPtr context_) + : PartitionedSink(partition_strategy_, context_, sample_block_) , object_storage(object_storage_) , configuration(configuration_) , query_settings(configuration_->getQuerySettings(context_)) @@ -165,7 +149,10 @@ StorageObjectStorageSink::~StorageObjectStorageSink() SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id) { - auto file_path = getFilePath(partition_id, configuration); + auto file_path = getPartitionStrategy()->getPath(configuration->getPath(), partition_id); + + validateNamespace(configuration->getNamespace(), configuration); + validateKey(file_path); if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( *object_storage, *configuration, query_settings, file_path, /* sequence_number */1)) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 97fd3d9b4179..bda52af12b21 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -42,19 +42,16 @@ class PartitionedStorageObjectStorageSink : public PartitionedSink using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; PartitionedStorageObjectStorageSink( + std::shared_ptr partition_strategy_, ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, std::optional format_settings_, const Block & sample_block_, - ContextPtr context_, - const ASTPtr & 
partition_by); + ContextPtr context_); SinkPtr createSinkForPartition(const String & partition_id) override; private: - void validateKey(const String & str); - void validateNamespace(const String & str); - ObjectStoragePtr object_storage; ConfigurationPtr configuration; diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp new file mode 100644 index 000000000000..a7c8f05a29c9 --- /dev/null +++ b/src/Storages/PartitionStrategy.cpp @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace Setting +{ +extern const SettingsBool use_hive_partitioning; +} + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + + +namespace +{ + Names extractPartitionRequiredColumns(ASTPtr & partition_by, const Block & sample_block, ContextPtr context) + { + auto pby_clone = partition_by->clone(); + auto syntax_result = TreeRewriter(context).analyze(pby_clone, sample_block.getNamesAndTypesList()); + auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); + return exp_analyzer->getRequiredColumns(); + } + + static std::string formatToFileExtension(const std::string & format) + { + if (format == "Parquet") + { + return "parquet"; + } + + if (format == "CSV") + { + return "csv"; + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for format {}", format); + } +} + +PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) +: partition_by(partition_by_), sample_block(sample_block_), context(context_) +{ + +} + +std::shared_ptr PartitionStrategyProvider::get(ASTPtr partition_by, const Block & sample_block, ContextPtr context, const std::string & file_format) +{ + if (context->getSettingsRef()[Setting::use_hive_partitioning]) + { + if (file_format.empty()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning"); + } + return std::make_shared(partition_by, sample_block, context, file_format); + } + + return std::make_shared(partition_by, sample_block, context); +} + +StringfiedPartitionStrategy::StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) + : PartitionStrategy(partition_by_, sample_block_, context_) +{ +} + +StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName StringfiedPartitionStrategy::getExpression() +{ + StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; + + ASTs arguments(1, partition_by); + ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); + auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); + actions_with_column_name.actions = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); + actions_with_column_name.column_name = partition_by_string->getColumnName(); + + return actions_with_column_name; +} + +std::string StringfiedPartitionStrategy::getPath( + const std::string & prefix, + const std::string & partition_key) +{ + return PartitionedSink::replaceWildcards(prefix, partition_key); +} + +HiveStylePartitionStrategy::HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_) +: PartitionStrategy(partition_by_, sample_block_, context_), file_format(file_format_) +{ + +} + +HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName 
HiveStylePartitionStrategy::getExpression() +{ + StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; + + const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); + ASTs concat_args; + + if (const auto * tuple_function = partition_by->as(); + tuple_function && tuple_function->name == "tuple") + { + chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); + + for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) + { + const auto & child = tuple_function->arguments->children[i]; + + concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); + + concat_args.push_back(makeASTFunction("toString", child)); + + concat_args.push_back(std::make_shared("/")); + } + } + else + { + chassert(partition_expression_required_columns.size() == 1); + + ASTs to_string_args = {1, partition_by}; + concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); + concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); + concat_args.push_back(std::make_shared("/")); + } + + ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); + auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); + actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); + actions_with_column_name.column_name = hive_expr->getColumnName(); + + return actions_with_column_name; +} + +std::string HiveStylePartitionStrategy::getPath( + const std::string & prefix, + const std::string & partition_key) +{ + std::string path; + + if (!prefix.empty()) + { + path += prefix + "/"; + } + + return path + partition_key + "/" + std::to_string(generateSnowflakeID()) + "." 
+ formatToFileExtension(file_format); +} + +} diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h new file mode 100644 index 000000000000..e44df91bfab3 --- /dev/null +++ b/src/Storages/PartitionStrategy.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + +namespace DB +{ + +struct PartitionStrategy +{ + struct PartitionExpressionActionsAndColumnName + { + ExpressionActionsPtr actions; + std::string column_name; + }; + + PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); + + virtual ~PartitionStrategy() = default; + + virtual PartitionExpressionActionsAndColumnName getExpression() = 0; + virtual std::string getPath(const std::string & prefix, const std::string & partition_key) = 0; + +protected: + ASTPtr partition_by; + const Block & sample_block; + ContextPtr context; +}; + +struct PartitionStrategyProvider +{ + static std::shared_ptr get(ASTPtr partition_by, const Block & sample_block, ContextPtr context, const std::string & file_format); +}; + +struct StringfiedPartitionStrategy : public PartitionStrategy +{ + StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); + + PartitionExpressionActionsAndColumnName getExpression() override; + std::string getPath(const std::string & prefix, const std::string & partition_key) override; +}; + +struct HiveStylePartitionStrategy : public PartitionStrategy +{ + HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_); + + PartitionExpressionActionsAndColumnName getExpression() override; + std::string getPath(const std::string & prefix, const std::string & partition_key) override; + +private: + std::string file_format; +}; + +} diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index 5321d00dc610..a3f3945a45d1 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -4,12 +4,10 @@ #include #include -#include #include #include -#include -#include +#include #include @@ -25,114 +23,18 @@ namespace ErrorCodes extern const int CANNOT_PARSE_TEXT; } -namespace Setting -{ -extern const SettingsBool use_hive_partitioning; -} - -namespace -{ - -Names extractPartitionRequiredColumns(const ASTPtr & partition_by, const Block & sample_block, ContextPtr context) -{ - auto pby_clone = partition_by->clone(); - auto syntax_result = TreeRewriter(context).analyze(pby_clone, sample_block.getNamesAndTypesList()); - auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); - return exp_analyzer->getRequiredColumns(); -} - -static std::string formatToFileExtension(const std::string & format) -{ - if (format == "Parquet") - { - return "parquet"; - } - - if (format == "CSV") - { - return "csv"; - } - - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for format {}", format); -} - -/* - * Constructs expression that generates a hive style partition. - * partition by (year, country, city) on hive partition style looks like the following: 'year=2022/country=brazil/state=sc' - * It does that using concat function on partition column names and values. 
- * `concat(partition_column_names[0], '=', toString(partition_value[0]), '/', ...)` - * */ -void buildHivePartitionExpressionAnalyzer(const ASTFunction * tuple_function, - const std::string & format, - const ASTPtr partition_by, - ExpressionActionsPtr & partition_by_expr, - String & partition_by_column_name, - const Block & sample_block, - ContextPtr context) -{ - const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); - ASTs concat_args; - - chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); - - for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) - { - const auto & child = tuple_function->arguments->children[i]; - - concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); - - concat_args.push_back(makeASTFunction("toString", child)); - - concat_args.push_back(std::make_shared("/")); - } - - concat_args.push_back(makeASTFunction("generateSnowflakeID")); - concat_args.push_back(std::make_shared("." + formatToFileExtension(format))); - - ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); - auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); - partition_by_expr = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); - partition_by_column_name = hive_expr->getColumnName(); -} - -void buildPartitionExpressionAnalyzer( - const ASTPtr partition_by, - const std::string & format, - ExpressionActionsPtr & partition_by_expr, - String & partition_by_column_name, - const Block & sample_block, - ContextPtr context) -{ - if (context->getSettingsRef()[Setting::use_hive_partitioning]) - { - // in case the partition by expression results in a tuple, the vanilla `toString` won't do the trick for hive style - if (const auto * tuple_func = partition_by->as(); - tuple_func && tuple_func->name == "tuple") - { - buildHivePartitionExpressionAnalyzer(tuple_func, format, partition_by, partition_by_expr, partition_by_column_name, sample_block, context); - return; - } - } - - ASTs arguments(1, partition_by); - ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); - auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); - partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); - partition_by_column_name = partition_by_string->getColumnName(); -} - -} - PartitionedSink::PartitionedSink( - const ASTPtr & partition_by, + std::shared_ptr partition_strategy_, ContextPtr context_, - const Block & sample_block_, - const std::string & format) + const Block & sample_block_) : SinkToStorage(sample_block_) + , partition_strategy(partition_strategy_) , context(context_) , sample_block(sample_block_) { - buildPartitionExpressionAnalyzer(partition_by, format, partition_by_expr, partition_by_column_name, sample_block, context); + auto actions_with_column_name = partition_strategy->getExpression(); + partition_by_expr = actions_with_column_name.actions; + partition_by_column_name = actions_with_column_name.column_name; } @@ -205,6 +107,11 @@ void PartitionedSink::consume(Chunk & chunk) } } +std::shared_ptr PartitionedSink::getPartitionStrategy() +{ + return partition_strategy; +} + void PartitionedSink::onException(std::exception_ptr exception) { for (auto & [_, sink] : partition_id_to_sink) diff --git a/src/Storages/PartitionedSink.h 
b/src/Storages/PartitionedSink.h index 9a56eb639fc7..487a08d93342 100644 --- a/src/Storages/PartitionedSink.h +++ b/src/Storages/PartitionedSink.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -17,10 +18,9 @@ class PartitionedSink : public SinkToStorage static constexpr auto PARTITION_ID_WILDCARD = "{_partition_id}"; PartitionedSink( - const ASTPtr & partition_by, + std::shared_ptr partition_strategy_, ContextPtr context_, - const Block & sample_block_, - const std::string & format); + const Block & sample_block_); ~PartitionedSink() override; @@ -38,7 +38,10 @@ class PartitionedSink : public SinkToStorage static String replaceWildcards(const String & haystack, const String & partition_id); + std::shared_ptr getPartitionStrategy(); + private: + std::shared_ptr partition_strategy; ContextPtr context; Block sample_block; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index ba0a345a0a8c..84311fa9e27e 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1896,7 +1896,7 @@ class PartitionedStorageFileSink : public PartitionedSink { public: PartitionedStorageFileSink( - const ASTPtr & partition_by, + std::shared_ptr partition_strategy_, const StorageMetadataPtr & metadata_snapshot_, const String & table_name_for_log_, std::unique_lock && lock_, @@ -1907,7 +1907,7 @@ class PartitionedStorageFileSink : public PartitionedSink const String format_name_, ContextPtr context_, int flags_) - : PartitionedSink(partition_by, context_, metadata_snapshot_->getSampleBlock(), format_name_) + : PartitionedSink(partition_strategy_, context_, metadata_snapshot_->getSampleBlock()) , path(path_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) @@ -1923,16 +1923,7 @@ class PartitionedStorageFileSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - std::string filepath; - - if (path.contains("{_partition_id}")) - { - filepath = PartitionedSink::replaceWildcards(path, partition_id); - } - else - { - filepath = path + "/" + partition_id; - } + std::string filepath = getPartitionStrategy()->getPath(path, partition_id); fs::create_directories(fs::path(filepath).parent_path()); @@ -1985,17 +1976,18 @@ SinkToStoragePtr StorageFile::write( if (context->getSettingsRef()[Setting::engine_file_truncate_on_insert]) flags |= O_TRUNC; - bool has_wildcards = path_for_partitioned_write.contains(PartitionedSink::PARTITION_ID_WILDCARD); const auto * insert_query = dynamic_cast(query.get()); - bool is_partitioned_implementation = insert_query && insert_query->partition_by && has_wildcards; + bool is_partitioned_implementation = insert_query && insert_query->partition_by; if (is_partitioned_implementation) { if (path_for_partitioned_write.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty path for partitioned write"); + auto partition_strategy = PartitionStrategyProvider::get(insert_query->partition_by, metadata_snapshot->getSampleBlock(), context, format_name); + return std::make_shared( - insert_query->partition_by, + partition_strategy, metadata_snapshot, getStorageID().getNameForLogs(), std::unique_lock{rwlock, getLockTimeout(context)}, diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 782bde47a78f..968d521cf376 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -693,7 +693,7 @@ class PartitionedStorageURLSink : public PartitionedSink { public: PartitionedStorageURLSink( - const ASTPtr & partition_by, + 
std::shared_ptr partition_strategy_, const String & uri_, const String & format_, const std::optional & format_settings_, @@ -703,7 +703,7 @@ class PartitionedStorageURLSink : public PartitionedSink const CompressionMethod compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_) - : PartitionedSink(partition_by, context_, sample_block_, format_) + : PartitionedSink(partition_strategy_, context_, sample_block_) , uri(uri_) , format(format_) , format_settings(format_settings_) @@ -718,16 +718,7 @@ class PartitionedStorageURLSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - std::string partition_path; - - if (uri.contains("{_partition_id}")) - { - partition_path = PartitionedSink::replaceWildcards(uri, partition_id); - } - else - { - partition_path = uri + "/" + partition_id; - } + std::string partition_path = getPartitionStrategy()->getPath(uri, partition_id); context->getRemoteHostFilter().checkURL(Poco::URI(partition_path)); return std::make_shared( @@ -1357,8 +1348,10 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad if (is_partitioned_implementation) { + auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, metadata_snapshot->getSampleBlock(), context, format_name); + return std::make_shared( - partition_by_ast, + partition_strategy, uri, format_name, format_settings, From f3f7e9aff1a12fd90119f95e7afcf56b39ec2ce5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Mar 2025 10:11:35 -0300 Subject: [PATCH 07/42] extern not_implemented --- src/Storages/PartitionStrategy.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index a7c8f05a29c9..ea8bf864e229 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -19,6 +19,7 @@ extern const SettingsBool use_hive_partitioning; namespace ErrorCodes { extern const int LOGICAL_ERROR; +extern const int NOT_IMPLEMENTED; } From 6a75897df9e106e5d49296e017feef54c9c3f0c1 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Mar 2025 14:32:06 -0300 Subject: [PATCH 08/42] copy sample bock --- src/Storages/PartitionStrategy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index e44df91bfab3..84972273213e 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -23,7 +23,7 @@ struct PartitionStrategy protected: ASTPtr partition_by; - const Block & sample_block; + Block sample_block; ContextPtr context; }; From 839af353f0508d128876c88a2f276284e90e16da Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 6 Mar 2025 11:24:51 -0300 Subject: [PATCH 09/42] focus on engine s3 only, new argument to control partition style --- src/Storages/ObjectStorage/S3/Configuration.cpp | 5 ++++- .../ObjectStorage/StorageObjectStorage.cpp | 12 ++++++++++-- src/Storages/ObjectStorage/StorageObjectStorage.h | 1 + .../ObjectStorage/registerStorageObjectStorage.cpp | 8 ++++---- src/Storages/PartitionStrategy.cpp | 14 ++++++-------- src/Storages/PartitionStrategy.h | 7 ++++++- .../03363_hive_style_partition_write.sql | 4 ++-- 7 files changed, 33 insertions(+), 18 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index d29e4bc130ac..667c745acd5f 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ 
b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -74,7 +74,8 @@ static const std::unordered_set optional_configuration_keys = "max_single_part_upload_size", "max_connections", "expiration_window_seconds", - "no_sign_request" + "no_sign_request", + "partitioning_style" }; String StorageS3Configuration::getDataSourceDescription() const @@ -172,6 +173,8 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect else url = S3::URI(collection.get("url"), settings[Setting::allow_archive_path_syntax]); + partitioning_style = collection.getOrDefault("partitioning_style", "auto"); + auth_settings[S3AuthSetting::access_key_id] = collection.getOrDefault("access_key_id", ""); auth_settings[S3AuthSetting::secret_access_key] = collection.getOrDefault("secret_access_key", ""); auth_settings[S3AuthSetting::use_environment_credentials] = collection.getOrDefault("use_environment_credentials", 1); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 71d3df136c1c..1108ee25659d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -101,7 +101,15 @@ StorageObjectStorage::StorageObjectStorage( , distributed_processing(distributed_processing_) , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { - try + if (configuration->partitioning_style != "auto" && configuration->withPartitionWildcard()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} macro can't be used with {} partitioning style", + PartitionedSink::PARTITION_ID_WILDCARD, configuration->partitioning_style); + } + + bool do_lazy_init = lazy_init && !columns_.empty() && !configuration->format.empty(); + bool failed_init = false; + auto do_init = [&]() { if (!lazy_init) { @@ -393,7 +401,7 @@ SinkToStoragePtr StorageObjectStorage::write( if (partition_by_ast) { - auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format); + auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format, configuration->partitioning_style); return std::make_shared( partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 6b5d6e1d423c..532273871bce 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -244,6 +244,7 @@ class StorageObjectStorage::Configuration String format = "auto"; String compression_method = "auto"; String structure = "auto"; + std::string partitioning_style = "auto"; virtual void update(ObjectStoragePtr object_storage, ContextPtr local_context); void updateIfRequired(ObjectStoragePtr object_storage, ContextPtr local_context); diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 6768d91092cf..0be5176ccc7c 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -75,10 +75,10 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); - if 
(context->getSettingsRef()[Setting::use_hive_partitioning] && configuration->withPartitionWildcard()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The _partition_id macro can't be used with hive partitioning"); - } +// if (context->getSettingsRef()[Setting::use_hive_partitioning] && configuration->withPartitionWildcard()) +// { +// throw Exception(ErrorCodes::BAD_ARGUMENTS, "The _partition_id macro can't be used with hive partitioning"); +// } return std::make_shared( configuration, diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index ea8bf864e229..f94ada4a33e6 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -6,16 +6,10 @@ #include #include #include -#include namespace DB { -namespace Setting -{ -extern const SettingsBool use_hive_partitioning; -} - namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -55,9 +49,13 @@ PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_ } -std::shared_ptr PartitionStrategyProvider::get(ASTPtr partition_by, const Block & sample_block, ContextPtr context, const std::string & file_format) +std::shared_ptr PartitionStrategyProvider::get(ASTPtr partition_by, + const Block & sample_block, + ContextPtr context, + const std::string & file_format, + const std::string & partitioning_style) { - if (context->getSettingsRef()[Setting::use_hive_partitioning]) + if (partitioning_style == "hive") { if (file_format.empty()) { diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 84972273213e..eecca53a36fe 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -29,7 +29,12 @@ struct PartitionStrategy struct PartitionStrategyProvider { - static std::shared_ptr get(ASTPtr partition_by, const Block & sample_block, ContextPtr context, const std::string & file_format); + static std::shared_ptr get( + ASTPtr partition_by, + const Block & sample_block, + ContextPtr context, + const std::string & file_format, + const std::string & partitioning_style = ""); }; struct StringfiedPartitionStrategy : public PartitionStrategy diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index 06d44d205557..4ca46d9e9144 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -3,10 +3,10 @@ DROP TABLE IF EXISTS t_03363_s3_sink, t_03363_s3_sink_read; CREATE TABLE t_03363_s3_sink (year UInt16, country String, random UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root', format = Parquet) +ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root', format = Parquet, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_s3_sink SETTINGS use_hive_partitioning=1, s3_truncate_on_insert=1 VALUES +INSERT INTO t_03363_s3_sink SETTINGS s3_truncate_on_insert=1 VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), From c8805b5815822b6edfb5d40280d52349b5e17d4c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 6 Mar 2025 11:38:20 -0300 Subject: [PATCH 10/42] small adjustments --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 1 + src/Storages/ObjectStorage/registerStorageObjectStorage.cpp | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 1108ee25659d..0719b05fa1f7 100644 
--- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -42,6 +42,7 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } namespace StorageObjectStorageSetting diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 0be5176ccc7c..b1e2748a1c8d 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -75,11 +75,6 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); -// if (context->getSettingsRef()[Setting::use_hive_partitioning] && configuration->withPartitionWildcard()) -// { -// throw Exception(ErrorCodes::BAD_ARGUMENTS, "The _partition_id macro can't be used with hive partitioning"); -// } - return std::make_shared( configuration, configuration->createObjectStorage(context, /* is_readonly */ false), From 1926495f1beab542f3fb4736e828b33968af8955 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 6 Mar 2025 13:56:14 -0300 Subject: [PATCH 11/42] tests --- .../0_stateless/03363_hive_style_partition_write.sql | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index 4ca46d9e9144..04556789a5c9 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -23,6 +23,10 @@ INSERT INTO t_03363_s3_sink SETTINGS s3_truncate_on_insert=1 VALUES CREATE TABLE t_03363_s3_sink_read (year UInt16, country String, random UInt8) ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root/**.parquet', format = Parquet); -select replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, * from t_03363_s3_sink_read order by random SETTINGS use_hive_partitioning=1; +select replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, * from t_03363_s3_sink_read order by random SETTINGS use_hive_partitioning=1, input_format_parquet_use_native_reader=0; DROP TABLE t_03363_s3_sink, t_03363_s3_sink_read; + +CREATE TABLE t_03363_s3_sink (year UInt16, country String, random UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root/{_partition_id}', format = Parquet, partitioning_style='hive') +PARTITION BY (year, country) -- {serverError BAD_ARGUMENTS}; From 33b61d4413c3c005b669475e6c9dd7ab34ebe889 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Sun, 9 Mar 2025 09:30:34 -0300 Subject: [PATCH 12/42] comment out test until we figure out what to do with that syntax --- tests/queries/0_stateless/01944_insert_partition_by.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01944_insert_partition_by.sql b/tests/queries/0_stateless/01944_insert_partition_by.sql index ac38fcee4905..4df2887df68b 100644 --- a/tests/queries/0_stateless/01944_insert_partition_by.sql +++ b/tests/queries/0_stateless/01944_insert_partition_by.sql @@ -7,5 +7,5 @@ INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.cs INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val 
VALUES (1, 'abc}{abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc*abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } -INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } -INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT } +-- INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } +-- INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT } From 35d875ac4fa6172d863ba8d565f0de7ed45e5224 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Sun, 9 Mar 2025 16:44:08 -0300 Subject: [PATCH 13/42] fix tests --- .../0_stateless/03363_hive_style_partition_write.sql | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index 04556789a5c9..6b36c048f9af 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -2,7 +2,7 @@ DROP TABLE IF EXISTS t_03363_s3_sink, t_03363_s3_sink_read; -CREATE TABLE t_03363_s3_sink (year UInt16, country String, random UInt8) +CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root', format = Parquet, partitioning_style='hive') PARTITION BY (year, country); @@ -20,13 +20,14 @@ INSERT INTO t_03363_s3_sink SETTINGS s3_truncate_on_insert=1 VALUES (2024, 'CN', 11); -- while reading from object storage partitioned table is not supported, an auxiliary table must be created -CREATE TABLE t_03363_s3_sink_read (year UInt16, country String, random UInt8) +CREATE TABLE t_03363_s3_sink_read (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root/**.parquet', format = Parquet); -select replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, * from t_03363_s3_sink_read order by random SETTINGS use_hive_partitioning=1, input_format_parquet_use_native_reader=0; +-- distinct because minio isn't cleaned up +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from t_03363_s3_sink_read order by counter SETTINGS use_hive_partitioning=1, input_format_parquet_use_native_reader=0; DROP TABLE t_03363_s3_sink, t_03363_s3_sink_read; -CREATE TABLE t_03363_s3_sink (year UInt16, country String, random UInt8) +CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root/{_partition_id}', format = Parquet, partitioning_style='hive') PARTITION BY (year, country) -- {serverError BAD_ARGUMENTS}; From 
fc546c9fdf8d4402617a867bbc179f574e6c4255 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 20 Mar 2025 14:03:21 -0300 Subject: [PATCH 14/42] fix --- .../ObjectStorage/StorageObjectStorage.cpp | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 0719b05fa1f7..3528aa0a2037 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -112,26 +112,31 @@ StorageObjectStorage::StorageObjectStorage( bool failed_init = false; auto do_init = [&]() { - if (!lazy_init) + try { if (configuration->hasExternalDynamicMetadata()) configuration->updateAndGetCurrentSchema(object_storage, context); else configuration->update(object_storage, context); } - } - catch (...) - { - // If we don't have format or schema yet, we can't ignore failed configuration update, because relevant configuration is crucial for format and schema inference - if (mode <= LoadingStrictnessLevel::CREATE || columns_.empty() || (configuration->format == "auto")) - { - throw; - } - else + catch (...) { - tryLogCurrentException(log); + // If we don't have format or schema yet, we can't ignore failed configuration update, + // because relevant configuration is crucial for format and schema inference + if (mode <= LoadingStrictnessLevel::CREATE || columns_.empty() || (configuration->format == "auto")) + { + throw; + } + else + { + tryLogCurrentException(log); + failed_init = true; + } } - } + }; + + if (!do_lazy_init) + do_init(); std::string sample_path; ColumnsDescription columns{columns_}; From f9777a505cd890368a206c81489e027568b0dae9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 20 Mar 2025 14:39:57 -0300 Subject: [PATCH 15/42] conflicts --- src/Storages/ObjectStorage/registerStorageObjectStorage.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index b1e2748a1c8d..ad3e21c0beda 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -75,7 +75,8 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject if (args.storage_def->partition_by) partition_by = args.storage_def->partition_by->clone(); - return std::make_shared( + return std::make_shared( + cluster_name, configuration, configuration->createObjectStorage(context, /* is_readonly */ false), args.getContext(), From de868f3f40dacf101b8758992d6dc7b6532aa56f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 21 Mar 2025 15:23:15 -0300 Subject: [PATCH 16/42] do not allow partition_id to be specified in the bucket portion --- .../ObjectStorage/StorageObjectStorageSource.cpp | 8 -------- .../ObjectStorage/registerStorageObjectStorage.cpp | 10 ++++------ 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 49686ab706e7..568de3c95803 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -126,10 +126,6 @@ std::shared_ptr StorageObjectStorageSourc if (distributed_processing) return std::make_shared(local_context->getReadTaskCallback(), local_context->getSettingsRef()[Setting::max_threads]); - if 
(configuration->isNamespaceWithGlobs()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); - const bool is_archive = configuration->isArchive(); configuration->updateIfRequired(object_storage, local_context); @@ -620,10 +616,6 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , local_context(context_) , file_progress_callback(file_progress_callback_) { - if (configuration->isNamespaceWithGlobs()) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); - } if (configuration->isPathWithGlobs()) { const auto key_with_globs = configuration_->getPath(); diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index ad3e21c0beda..422586449073 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -15,12 +15,6 @@ namespace DB { -namespace Setting -{ -extern const SettingsBool use_hive_partitioning; -} - - namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -53,6 +47,10 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, context, false, std::move(queue_settings)); + if (configuration->isNamespaceWithGlobs()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); + // Use format settings from global server context + settings from // the SETTINGS clause of the create query. Settings from current // session and user are ignored. From b4bb1a2f24ae11fb3e2eba5395cd80856a2127c4 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 24 Mar 2025 16:13:38 -0300 Subject: [PATCH 17/42] add tests to validate partition id macro isn't used where it is not supposed to be --- ...3_globbed_path_in_bucket_portion.reference | 0 ...3364_s3_globbed_path_in_bucket_portion.sql | 29 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.reference create mode 100644 tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql diff --git a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.reference b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql new file mode 100644 index 000000000000..15610dfd7660 --- /dev/null +++ b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql @@ -0,0 +1,29 @@ +-- virtual hosted style +create table s3_03364 (id UInt32) engine=S3('http://{_partition_id}.s3.region.amazonaws.com/key'); -- {serverError BAD_ARGUMENTS} +create table s3_03364 (id UInt32) engine=S3('http://{_partition_id}something.s3.region.amazonaws.com/key'); -- {serverError BAD_ARGUMENTS} + +select * from s3('http://{_partition_id}.s3.region.amazonaws.com/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} +select * from s3('http://{_partition_id}something.s3.region.amazonaws.com/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} + +insert into table function s3('http://{_partition_id}.s3.region.amazonaws.com/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError 
BAD_ARGUMENTS} +insert into table function s3('http://{_partition_id}something.s3.region.amazonaws.com/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} + +-- path style +create table s3_03364 (id UInt32) engine=S3('http://s3.region.amazonaws.com/{_partition_id}'); -- {serverError BAD_ARGUMENTS} +create table s3_03364 (id UInt32) engine=S3('http://s3.region.amazonaws.com/{_partition_id}/key'); -- {serverError BAD_ARGUMENTS} + +select * from s3('http://s3.region.amazonaws.com/{_partition_id}', 'Parquet'); -- {serverError BAD_ARGUMENTS} +select * from s3('http://s3.region.amazonaws.com/{_partition_id}/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} + +insert into table function s3('http://s3.region.amazonaws.com/{_partition_id}', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} +insert into table function s3('http://s3.region.amazonaws.com/{_partition_id}/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} + +-- aws private link style +create table s3_03364 (id UInt32) engine=S3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}'); -- {serverError BAD_ARGUMENTS} +create table s3_03364 (id UInt32) engine=S3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}/key'); -- {serverError BAD_ARGUMENTS} + +select * from s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}', 'Parquet'); -- {serverError BAD_ARGUMENTS} +select * from s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} + +insert into table function s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} +insert into table function s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} From 4622f9e6f27ef372540ed3b82b979554e0a13474 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 24 Mar 2025 16:17:11 -0300 Subject: [PATCH 18/42] remove no longer needed test --- tests/queries/0_stateless/01944_insert_partition_by.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/01944_insert_partition_by.sql b/tests/queries/0_stateless/01944_insert_partition_by.sql index 4df2887df68b..03bbd17b8ce7 100644 --- a/tests/queries/0_stateless/01944_insert_partition_by.sql +++ b/tests/queries/0_stateless/01944_insert_partition_by.sql @@ -7,5 +7,3 @@ INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.cs INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc}{abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc*abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } --- INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION 
BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } --- INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT } From 3ca53e1f08f232b5da5c0b40c6a9642de09dce8a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 24 Mar 2025 17:33:04 -0300 Subject: [PATCH 19/42] some silly implementation for formatToFileExtension --- src/Storages/PartitionStrategy.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index f94ada4a33e6..13843af8a1c5 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -27,19 +28,23 @@ namespace return exp_analyzer->getRequiredColumns(); } - static std::string formatToFileExtension(const std::string & format) + /* + * This isn't ideal, but I guess multiple formats can be specified and introduced. + * So I think it is simpler to keep it this way. + * + * Or perhaps implement something like `IInputFormat::getFileExtension()` + */ + std::string formatToFileExtension(const std::string & format) { - if (format == "Parquet") - { - return "parquet"; - } + std::string lower_case_format; + lower_case_format.resize(format.size()); - if (format == "CSV") + for (std::size_t i = 0; i < format.size(); i++) { - return "csv"; + lower_case_format[i] = tolower(format[i]); } - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for format {}", format); + return lower_case_format; } } From 82ee8218cea0b9dcfbff426f4ebb6686fa377d03 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 24 Mar 2025 17:35:59 -0300 Subject: [PATCH 20/42] move check for wildcard inside initialize method --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 4 ++++ src/Storages/ObjectStorage/registerStorageObjectStorage.cpp | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 3528aa0a2037..50d674cd53bf 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -582,6 +582,10 @@ void StorageObjectStorage::Configuration::initialize( else configuration.fromAST(engine_args, local_context, with_table_structure); + if (configuration.isNamespaceWithGlobs()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expression can not have wildcards inside {} name", configuration.getNamespaceType()); + if (configuration.format == "auto") { if (configuration.isDataLakeConfiguration()) diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 422586449073..4c7c8abebe11 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -47,10 +47,6 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, context, false, std::move(queue_settings)); - if (configuration->isNamespaceWithGlobs()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); - // Use format settings from global server context + settings from // the SETTINGS clause of the 
create query. Settings from current // session and user are ignored. From d9409cc1291e22b73f0a377d9a8ea50d336563be Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 24 Mar 2025 17:44:36 -0300 Subject: [PATCH 21/42] remove some unrelated changes --- .../ObjectStorage/StorageObjectStorage.cpp | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 50d674cd53bf..c57366f3c59f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -108,35 +108,28 @@ StorageObjectStorage::StorageObjectStorage( PartitionedSink::PARTITION_ID_WILDCARD, configuration->partitioning_style); } - bool do_lazy_init = lazy_init && !columns_.empty() && !configuration->format.empty(); - bool failed_init = false; - auto do_init = [&]() + try { - try + if (!lazy_init) { if (configuration->hasExternalDynamicMetadata()) configuration->updateAndGetCurrentSchema(object_storage, context); else configuration->update(object_storage, context); } - catch (...) + } + catch (...) + { + // If we don't have format or schema yet, we can't ignore failed configuration update, because relevant configuration is crucial for format and schema inference + if (mode <= LoadingStrictnessLevel::CREATE || columns_.empty() || (configuration->format == "auto")) { - // If we don't have format or schema yet, we can't ignore failed configuration update, - // because relevant configuration is crucial for format and schema inference - if (mode <= LoadingStrictnessLevel::CREATE || columns_.empty() || (configuration->format == "auto")) - { - throw; - } - else - { - tryLogCurrentException(log); - failed_init = true; - } + throw; } - }; - - if (!do_lazy_init) - do_init(); + else + { + tryLogCurrentException(log); + } + } std::string sample_path; ColumnsDescription columns{columns_}; From b1bca06fdd8249558c646503ec8d2070f2c56669 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 27 Mar 2025 16:35:19 -0300 Subject: [PATCH 22/42] add some sanity checks for partitioning configuration, tests for insert table function --- .../ObjectStorage/S3/Configuration.cpp | 2 + .../ObjectStorage/StorageObjectStorage.cpp | 96 +++++++++++++++---- src/Storages/PartitionStrategy.cpp | 2 +- tests/config/config.d/named_collection.xml | 7 ++ ...03363_hive_style_partition_write.reference | 44 ++++++--- .../03363_hive_style_partition_write.sql | 44 +++++++-- 6 files changed, 153 insertions(+), 42 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 667c745acd5f..0039b4c65328 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -391,6 +391,8 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ if (no_sign_request) auth_settings[S3AuthSetting::no_sign_request] = no_sign_request; + partitioning_style = "hive"; + static_configuration = !auth_settings[S3AuthSetting::access_key_id].value.empty() || auth_settings[S3AuthSetting::no_sign_request].changed; auth_settings[S3AuthSetting::no_sign_request] = no_sign_request; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index c57366f3c59f..68c831429e86 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -50,6 +50,76 
@@ namespace StorageObjectStorageSetting
 extern const StorageObjectStorageSettingsBool allow_dynamic_metadata_for_data_lakes;
 }

+namespace
+{
+    void sanityCheckPartitioningConfiguration(
+        const ASTPtr & table_level_partition_by,
+        const ASTPtr & query_partition_by,
+        const StorageObjectStorage::ConfigurationPtr & configuration)
+    {
+        if (!table_level_partition_by && !query_partition_by)
+        {
+            // do we want to assert that `partitioning_style` is not set to a different style AND
+            // that the wildcard is not set either?
+            return;
+        }
+
+        if (table_level_partition_by && query_partition_by)
+        {
+            // should never happen because the parser should not allow that
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Table level partition expression and query level partition expression can't be specified together, this is a bug");
+        }
+
+        static std::unordered_map<std::string, bool> partitioning_style_to_wildcard_acceptance =
+        {
+            {"auto", true},
+            {"hive", false}
+        };
+
+        if (!partitioning_style_to_wildcard_acceptance.contains(configuration->partitioning_style))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS,
+                "Unknown partitioning style '{}'",
+                configuration->partitioning_style);
+        }
+
+        if (configuration->withPartitionWildcard() && !partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} wildcard can't be used with {} partitioning style",
+                PartitionedSink::PARTITION_ID_WILDCARD, configuration->partitioning_style);
+        }
+
+        if (!configuration->withPartitionWildcard() && partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS,
+                "Partitioning style '{}' requires {} wildcard",
+                configuration->partitioning_style,
+                PartitionedSink::PARTITION_ID_WILDCARD);
+        }
+    }
+
+    ASTPtr getPartitionByAst(const ASTPtr & table_level_partition_by, const ASTPtr & query, const StorageObjectStorage::ConfigurationPtr & configuration)
+    {
+        ASTPtr query_partition_by = nullptr;
+        if (const auto insert_query = std::dynamic_pointer_cast<ASTInsertQuery>(query))
+        {
+            if (insert_query->partition_by)
+            {
+                query_partition_by = insert_query->partition_by;
+            }
+        }
+
+        sanityCheckPartitioningConfiguration(table_level_partition_by, query_partition_by, configuration);
+
+        if (table_level_partition_by)
+        {
+            return table_level_partition_by;
+        }
+
+        return query_partition_by;
+    }
+}
+
 String StorageObjectStorage::getPathSample(ContextPtr context)
 {
    auto query_settings = configuration->getQuerySettings(context);
@@ -102,11 +172,7 @@ StorageObjectStorage::StorageObjectStorage(
    , distributed_processing(distributed_processing_)
    , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName())))
 {
-    if (configuration->partitioning_style != "auto" && configuration->withPartitionWildcard())
-    {
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} macro can't be used with {} partitioning style",
-            PartitionedSink::PARTITION_ID_WILDCARD, configuration->partitioning_style);
-    }
+    sanityCheckPartitioningConfiguration(partition_by, nullptr, configuration);

    try
    {
@@ -387,23 +453,11 @@ SinkToStoragePtr StorageObjectStorage::write(
            configuration->getPath());
    }

-    if (partition_by || configuration->withPartitionWildcard())
+    if (ASTPtr partition_by_ast = getPartitionByAst(partition_by, query, configuration))
    {
-        ASTPtr partition_by_ast = nullptr;
-        if (auto insert_query = std::dynamic_pointer_cast<ASTInsertQuery>(query))
-        {
-            if (insert_query->partition_by)
-                partition_by_ast = insert_query->partition_by;
-            else 
- partition_by_ast = partition_by; - } - - if (partition_by_ast) - { - auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format, configuration->partitioning_style); - return std::make_shared( - partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); - } + auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format, configuration->partitioning_style); + return std::make_shared( + partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } auto paths = configuration->getPaths(); diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 13843af8a1c5..dcde0290ddea 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include diff --git a/tests/config/config.d/named_collection.xml b/tests/config/config.d/named_collection.xml index 555c3b7f65ba..0d65fa4f1bb2 100644 --- a/tests/config/config.d/named_collection.xml +++ b/tests/config/config.d/named_collection.xml @@ -32,6 +32,13 @@ testtest auto + + http://localhost:11111/test/ + test + testtest + auto + hive + http://localhost:11111/test/ test diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.reference b/tests/queries/0_stateless/03363_hive_style_partition_write.reference index 872a65d43225..f85610ca7313 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.reference +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.reference @@ -1,11 +1,33 @@ -test/t_03363_s3_sink_root/year=2022/country=USA/.parquet 1 -test/t_03363_s3_sink_root/year=2022/country=Canada/.parquet 2 -test/t_03363_s3_sink_root/year=2023/country=USA/.parquet 3 -test/t_03363_s3_sink_root/year=2023/country=Mexico/.parquet 4 -test/t_03363_s3_sink_root/year=2024/country=France/.parquet 5 -test/t_03363_s3_sink_root/year=2024/country=Germany/.parquet 6 -test/t_03363_s3_sink_root/year=2024/country=Germany/.parquet 7 -test/t_03363_s3_sink_root/year=1999/country=Brazil/.parquet 8 -test/t_03363_s3_sink_root/year=2100/country=Japan/.parquet 9 -test/t_03363_s3_sink_root/year=2024/country=/.parquet 10 -test/t_03363_s3_sink_root/year=2024/country=CN/.parquet 11 +test/t_03363_parquet/year=2022/country=USA/.parquet 1 +test/t_03363_parquet/year=2022/country=Canada/.parquet 2 +test/t_03363_parquet/year=2023/country=USA/.parquet 3 +test/t_03363_parquet/year=2023/country=Mexico/.parquet 4 +test/t_03363_parquet/year=2024/country=France/.parquet 5 +test/t_03363_parquet/year=2024/country=Germany/.parquet 6 +test/t_03363_parquet/year=2024/country=Germany/.parquet 7 +test/t_03363_parquet/year=1999/country=Brazil/.parquet 8 +test/t_03363_parquet/year=2100/country=Japan/.parquet 9 +test/t_03363_parquet/year=2024/country=/.parquet 10 +test/t_03363_parquet/year=2024/country=CN/.parquet 11 +test/t_03363_csv/year=2022/country=USA/.csv 1 +test/t_03363_csv/year=2022/country=Canada/.csv 2 +test/t_03363_csv/year=2023/country=USA/.csv 3 +test/t_03363_csv/year=2023/country=Mexico/.csv 4 +test/t_03363_csv/year=2024/country=France/.csv 5 +test/t_03363_csv/year=2024/country=Germany/.csv 6 +test/t_03363_csv/year=2024/country=Germany/.csv 7 +test/t_03363_csv/year=1999/country=Brazil/.csv 8 +test/t_03363_csv/year=2100/country=Japan/.csv 9 +test/t_03363_csv/year=2024/country=/.csv 10 +test/t_03363_csv/year=2024/country=CN/.csv 11 
+test/t_03363_function/year=2022/country=USA/.parquet 1 +test/t_03363_function/year=2022/country=Canada/.parquet 2 +test/t_03363_function/year=2023/country=USA/.parquet 3 +test/t_03363_function/year=2023/country=Mexico/.parquet 4 +test/t_03363_function/year=2024/country=France/.parquet 5 +test/t_03363_function/year=2024/country=Germany/.parquet 6 +test/t_03363_function/year=2024/country=Germany/.parquet 7 +test/t_03363_function/year=1999/country=Brazil/.parquet 8 +test/t_03363_function/year=2100/country=Japan/.parquet 9 +test/t_03363_function/year=2024/country=/.parquet 10 +test/t_03363_function/year=2024/country=CN/.parquet 11 diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index 6b36c048f9af..f578e88c1b44 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -1,12 +1,12 @@ -- Tags: no-parallel, no-fasttest, no-random-settings -DROP TABLE IF EXISTS t_03363_s3_sink, t_03363_s3_sink_read; +DROP TABLE IF EXISTS t_03363_parquet, t_03363_parquet_read; -CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root', format = Parquet, partitioning_style='hive') +CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_s3_sink SETTINGS s3_truncate_on_insert=1 VALUES +INSERT INTO t_03363_parquet SETTINGS s3_truncate_on_insert=1 VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -20,14 +20,40 @@ INSERT INTO t_03363_s3_sink SETTINGS s3_truncate_on_insert=1 VALUES (2024, 'CN', 11); -- while reading from object storage partitioned table is not supported, an auxiliary table must be created -CREATE TABLE t_03363_s3_sink_read (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_s3_sink_root/**.parquet', format = Parquet); +CREATE TABLE t_03363_parquet_read (year UInt16, country String, counter UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_parquet/**.parquet', format = Parquet); -- distinct because minio isn't cleaned up -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from t_03363_s3_sink_read order by counter SETTINGS use_hive_partitioning=1, input_format_parquet_use_native_reader=0; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from t_03363_parquet_read order by counter SETTINGS use_hive_partitioning=1, input_format_parquet_use_native_reader=0; + +-- CSV test +CREATE TABLE t_03363_csv (year UInt16, country String, counter UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_csv', format = CSV, partitioning_style='hive') +PARTITION BY (year, country); + +INSERT INTO t_03363_csv SETTINGS s3_truncate_on_insert=1 VALUES + (2022, 'USA', 1), + (2022, 'Canada', 2), + (2023, 'USA', 3), + (2023, 'Mexico', 4), + (2024, 'France', 5), + (2024, 'Germany', 6), + (2024, 'Germany', 7), + (1999, 'Brazil', 8), + (2100, 'Japan', 9), + (2024, '', 10), + (2024, 'CN', 11); + +CREATE TABLE t_03363_csv_read (year UInt16, country String, counter UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_csv/**.csv', format = CSV); + +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by 
counter SETTINGS use_hive_partitioning=1;

-- s3 table function
INSERT INTO FUNCTION s3(s3_conn_hive_partitioning_style, filename='t_03363_function', format=Parquet) PARTITION BY (year, country) SELECT * FROM t_03363_parquet_read;
select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn_hive_partitioning_style, filename='t_03363_function/**.parquet') order by counter SETTINGS use_hive_partitioning=1;

-- hive with partition id placeholder
CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8)
ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partitioning_style='hive')
PARTITION BY (year, country) -- {serverError BAD_ARGUMENTS};

From c8805b5815822b6edfb5d40280d52349b5e17d4c Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 28 Mar 2025 12:04:12 -0300
Subject: [PATCH 23/42] extract named argument partitioning_style

---
 .../ObjectStorage/S3/Configuration.cpp        | 63 ++++++++++++++++++-
 tests/config/config.d/named_collection.xml    |  7 ---
 .../03363_hive_style_partition_write.sql      | 20 ++++--
 3 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp
index 0039b4c65328..40c5ad44ffcc 100644
--- a/src/Storages/ObjectStorage/S3/Configuration.cpp
+++ b/src/Storages/ObjectStorage/S3/Configuration.cpp
@@ -53,6 +53,65 @@ namespace ErrorCodes
 extern const int LOGICAL_ERROR;
 }

+namespace
+{
+    std::optional<std::string> extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position)
+    {
+        for (auto arg_it = arguments.begin(); arg_it != arguments.end(); ++arg_it)
+        {
+            auto argument = *arg_it;
+            const auto * type_ast_function = argument->as<ASTFunction>();
+
+            if (!type_ast_function || type_ast_function->name != "equals" || !type_ast_function->arguments || type_ast_function->arguments->children.size() != 2)
+            {
+                continue;
+            }
+
+            auto name = type_ast_function->arguments->children[0]->as<ASTIdentifier>();
+
+            if (name && name->name() == argument_name)
+            {
+                auto value = type_ast_function->arguments->children[1]->as<ASTLiteral>();
+
+                if (!value)
+                {
+                    throw Exception(
+                        ErrorCodes::BAD_ARGUMENTS,
+                        "Wrong parameter type for '{}'",
+                        name->name());
+                }
+
+                if (value->value.getType() != Field::Types::String)
+                {
+                    throw Exception(
+                        ErrorCodes::BAD_ARGUMENTS,
+                        "Wrong parameter type for '{}'",
+                        name->name());
+                }
+
+                argument_position = arg_it;
+
+                return value->value.safeGet<std::string>();
+            }
+        }
+
+        return std::nullopt;
+    }
+
+    std::optional<std::string> extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name)
+    {
+        ASTs::iterator iterator;
+        auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator);
+
+        if (named_arg_opt)
+        {
+            arguments.erase(iterator);
+        }
+
+        return named_arg_opt;
+    }
+}
+
 static const std::unordered_set required_configuration_keys = {
    "url",
 };
@@ -196,6 +255,8 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_
 {
    size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context);

+    partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto");
+
    if (count == 0 || count > getMaxNumberOfArguments(with_structure))
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                        "Storage S3 
requires 1 to {} arguments. All supported signatures:\n{}", @@ -391,8 +452,6 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ if (no_sign_request) auth_settings[S3AuthSetting::no_sign_request] = no_sign_request; - partitioning_style = "hive"; - static_configuration = !auth_settings[S3AuthSetting::access_key_id].value.empty() || auth_settings[S3AuthSetting::no_sign_request].changed; auth_settings[S3AuthSetting::no_sign_request] = no_sign_request; diff --git a/tests/config/config.d/named_collection.xml b/tests/config/config.d/named_collection.xml index 0d65fa4f1bb2..555c3b7f65ba 100644 --- a/tests/config/config.d/named_collection.xml +++ b/tests/config/config.d/named_collection.xml @@ -32,13 +32,6 @@ testtest auto - - http://localhost:11111/test/ - test - testtest - auto - hive - http://localhost:11111/test/ test diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index f578e88c1b44..f838d90b6c07 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -1,6 +1,6 @@ -- Tags: no-parallel, no-fasttest, no-random-settings -DROP TABLE IF EXISTS t_03363_parquet, t_03363_parquet_read; +DROP TABLE IF EXISTS t_03363_parquet, t_03363_parquet_read, t_03363_csv, t_03363_csv_read; CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='hive') @@ -50,10 +50,22 @@ ENGINE = S3(s3_conn, filename = 't_03363_csv/**.csv', format = CSV); select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter SETTINGS use_hive_partitioning=1; -- s3 table function -INSERT INTO FUNCTION s3(s3_conn_hive_partitioning_style, filename='t_03363_function', format=Parquet) PARTITION BY (year, country) SELECT * FROM t_03363_parquet_read; -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn_hive_partitioning_style, filename='t_03363_function/**.parquet') order by counter SETTINGS use_hive_partitioning=1; +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT * FROM t_03363_parquet_read; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter SETTINGS use_hive_partitioning=1; -- hive with partition id placeholder CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partitioning_style='hive') -PARTITION BY (year, country) -- {serverError BAD_ARGUMENTS}; +PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; + +-- unknown partitioning style +CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='abc') +PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; + +-- auto partitioning style without partition_id wildcard +CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) +ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet) +PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; + +DROP TABLE IF EXISTS 
t_03363_parquet, t_03363_parquet_read, t_03363_csv, t_03363_csv_read; From 7ae3743af8beea1fb3ade425176bdc3f833b220f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 28 Mar 2025 13:37:34 -0300 Subject: [PATCH 24/42] address pr comments --- .../ObjectStorage/StorageObjectStorage.cpp | 6 ++-- src/Storages/PartitionStrategy.cpp | 30 +++++++------------ 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 68c831429e86..21df6c8f9d9b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -83,13 +83,15 @@ namespace configuration->partitioning_style); } - if (configuration->withPartitionWildcard() && !partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style)) + bool has_partition_wildcard = configuration->withPartitionWildcard(); + + if (has_partition_wildcard && !partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} wildcard can't be used with {} partitioning style", PartitionedSink::PARTITION_ID_WILDCARD, configuration->partitioning_style); } - if (!configuration->withPartitionWildcard() && partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style)) + if (!has_partition_wildcard && partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partitioning style '{}' requires {} wildcard", diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index dcde0290ddea..d2e99acdb8bd 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -8,6 +8,8 @@ #include #include +#include "Interpreters/ExternalLoaderDictionaryStorageConfigRepository.h" + namespace DB { @@ -27,25 +29,6 @@ namespace auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); return exp_analyzer->getRequiredColumns(); } - - /* - * This isn't ideal, but I guess multiple formats can be specified and introduced. - * So I think it is simpler to keep it this way. - * - * Or perhaps implement something like `IInputFormat::getFileExtension()` - */ - std::string formatToFileExtension(const std::string & format) - { - std::string lower_case_format; - lower_case_format.resize(format.size()); - - for (std::size_t i = 0; i < format.size(); i++) - { - lower_case_format[i] = tolower(format[i]); - } - - return lower_case_format; - } } PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) @@ -155,7 +138,14 @@ std::string HiveStylePartitionStrategy::getPath( path += prefix + "/"; } - return path + partition_key + "/" + std::to_string(generateSnowflakeID()) + "." + formatToFileExtension(file_format); + /* + * File extension is toLower(format) + * This isn't ideal, but I guess multiple formats can be specified and introduced. + * So I think it is simpler to keep it this way. + * + * Or perhaps implement something like `IInputFormat::getFileExtension()` + */ + return path + partition_key + "/" + std::to_string(generateSnowflakeID()) + "." 
+ Poco::toLower(file_format); } } From 35e9c1840e0d2f5cb6e8e4650da4fa9bae40d89c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 28 Mar 2025 13:38:18 -0300 Subject: [PATCH 25/42] remove unnecessary namespace qualifiers --- src/Storages/PartitionStrategy.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index d2e99acdb8bd..1949479abda7 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include "Interpreters/ExternalLoaderDictionaryStorageConfigRepository.h" @@ -62,7 +61,7 @@ StringfiedPartitionStrategy::StringfiedPartitionStrategy(ASTPtr partition_by_, c StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName StringfiedPartitionStrategy::getExpression() { - StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; + PartitionExpressionActionsAndColumnName actions_with_column_name; ASTs arguments(1, partition_by); ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); @@ -88,7 +87,7 @@ HiveStylePartitionStrategy::HiveStylePartitionStrategy(ASTPtr partition_by_, con HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName HiveStylePartitionStrategy::getExpression() { - StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; + PartitionExpressionActionsAndColumnName actions_with_column_name; const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); ASTs concat_args; From 1ae8b93df7ca5727a028ca31b0394b9df30f2817 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Sat, 29 Mar 2025 11:57:09 -0300 Subject: [PATCH 26/42] add no fast test to test --- .../0_stateless/03364_s3_globbed_path_in_bucket_portion.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql index 15610dfd7660..4b2ee57c7630 100644 --- a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql +++ b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql @@ -1,3 +1,4 @@ +-- Tags: no-fasttest -- virtual hosted style create table s3_03364 (id UInt32) engine=S3('http://{_partition_id}.s3.region.amazonaws.com/key'); -- {serverError BAD_ARGUMENTS} create table s3_03364 (id UInt32) engine=S3('http://{_partition_id}something.s3.region.amazonaws.com/key'); -- {serverError BAD_ARGUMENTS} From abdd84aa063c8b2035592258c91ae296a05290b5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 2 Apr 2025 07:45:36 -0300 Subject: [PATCH 27/42] write_partition_columns_into_files argument --- .../ObjectStorage/S3/Configuration.cpp | 25 ++- .../ObjectStorage/StorageObjectStorage.cpp | 8 +- .../ObjectStorage/StorageObjectStorage.h | 4 + .../StorageObjectStorageSink.cpp | 2 +- src/Storages/PartitionStrategy.cpp | 171 +++++++++++++----- src/Storages/PartitionStrategy.h | 40 +++- src/Storages/PartitionedSink.cpp | 16 +- ...03363_hive_style_partition_write.reference | 9 +- .../03363_hive_style_partition_write.sql | 16 +- 9 files changed, 205 insertions(+), 86 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 40c5ad44ffcc..03ea400933f3 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ 
b/src/Storages/ObjectStorage/S3/Configuration.cpp
@@ -55,7 +55,8 @@ namespace ErrorCodes

 namespace
 {
-    std::optional<std::string> extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position)
+    template <typename T>
+    std::optional<T> extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position)
    {
        for (auto arg_it = arguments.begin(); arg_it != arguments.end(); ++arg_it)
        {
@@ -81,27 +82,20 @@ namespace
                        name->name());
                }

-                if (value->value.getType() != Field::Types::String)
-                {
-                    throw Exception(
-                        ErrorCodes::BAD_ARGUMENTS,
-                        "Wrong parameter type for '{}'",
-                        name->name());
-                }
-
                argument_position = arg_it;

-                return value->value.safeGet<std::string>();
+                return value->value.safeGet<T>();
            }
        }

        return std::nullopt;
    }

-    std::optional<std::string> extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name)
+    template <typename T>
+    std::optional<T> extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name)
    {
        ASTs::iterator iterator;
-        auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator);
+        auto named_arg_opt = extractNamedArgument<T>(arguments, argument_name, iterator);

        if (named_arg_opt)
        {
@@ -134,7 +128,8 @@ static const std::unordered_set optional_configuration_keys =
    "max_connections",
    "expiration_window_seconds",
    "no_sign_request",
-    "partitioning_style"
+    "partitioning_style",
+    "write_partition_columns_into_files"
 };

 String StorageS3Configuration::getDataSourceDescription() const
@@ -233,6 +228,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect
        url = S3::URI(collection.get("url"), settings[Setting::allow_archive_path_syntax]);

    partitioning_style = collection.getOrDefault("partitioning_style", "auto");
+    write_partition_columns_into_files = collection.getOrDefault("write_partition_columns_into_files", false);

    auth_settings[S3AuthSetting::access_key_id] = collection.getOrDefault("access_key_id", "");
    auth_settings[S3AuthSetting::secret_access_key] = collection.getOrDefault("secret_access_key", "");
@@ -255,7 +251,8 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_
 {
    size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context);

-    partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto");
+    partitioning_style = extractNamedArgumentAndRemoveFromList<std::string>(args, "partitioning_style").value_or("auto");
+    write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList<bool>(args, "write_partition_columns_into_files").value_or(false);

    if (count == 0 || count > getMaxNumberOfArguments(with_structure))
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
index 21df6c8f9d9b..f43a6b1b301c 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
@@ -457,7 +457,13 @@ SinkToStoragePtr StorageObjectStorage::write(

    if (ASTPtr partition_by_ast = getPartitionByAst(partition_by, query, configuration))
    {
-        auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format, configuration->partitioning_style);
+        auto partition_strategy = PartitionStrategyProvider::get(
+            partition_by_ast,
+            sample_block,
+            local_context,
+            configuration->format,
+            configuration->partitioning_style,
+ 
configuration->write_partition_columns_into_files); return std::make_shared( partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 532273871bce..cfb71b112c91 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -245,6 +245,10 @@ class StorageObjectStorage::Configuration String compression_method = "auto"; String structure = "auto"; std::string partitioning_style = "auto"; + /* + * Only supported by hive partitioning style for now + */ + bool write_partition_columns_into_files = false; virtual void update(ObjectStoragePtr object_storage, ContextPtr local_context); void updateIfRequired(ObjectStoragePtr object_storage, ContextPtr local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 459a498aa2f4..ab66606b3020 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -164,7 +164,7 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String object_storage, configuration, format_settings, - sample_block, + getPartitionStrategy()->getBlockWithoutPartitionColumnsIfNeeded(), context, file_path ); diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 1949479abda7..9736728142e2 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -6,8 +6,11 @@ #include #include #include - -#include "Interpreters/ExternalLoaderDictionaryStorageConfigRepository.h" +#include +#include +#include +#include +#include namespace DB { @@ -18,16 +21,76 @@ extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } - namespace { Names extractPartitionRequiredColumns(ASTPtr & partition_by, const Block & sample_block, ContextPtr context) { auto pby_clone = partition_by->clone(); - auto syntax_result = TreeRewriter(context).analyze(pby_clone, sample_block.getNamesAndTypesList()); + // sample_block columns might be out of order, and we need to respect the partition by expression order + auto key_description = KeyDescription::getKeyFromAST(pby_clone, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context); + auto syntax_result = TreeRewriter(context).analyze(pby_clone, key_description.sample_block.getNamesAndTypesList()); auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); return exp_analyzer->getRequiredColumns(); } + + HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName buildExpressionHive( + ASTPtr partition_by, + const Names & partition_expression_required_columns, + const Block & sample_block, + ContextPtr context) + { + HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; + ASTs concat_args; + + if (const auto * tuple_function = partition_by->as(); + tuple_function && tuple_function->name == "tuple") + { + chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); + + for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) + { + const auto & child = tuple_function->arguments->children[i]; + + concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); + + concat_args.push_back(makeASTFunction("toString", child)); 
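// Illustrative aside, not part of the original hunk: for PARTITION BY (year, country),
// the arguments accumulated by this loop are equivalent to
//     concat('year=', toString(year), '/', 'country=', toString(country), '/')
// so the evaluated partition key column holds values such as 'year=2022/country=USA/'.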
+ + concat_args.push_back(std::make_shared("/")); + } + } + else + { + chassert(partition_expression_required_columns.size() == 1); + + ASTs to_string_args = {1, partition_by}; + concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); + concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); + concat_args.push_back(std::make_shared("/")); + } + + ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); + auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); + actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); + actions_with_column_name.column_name = hive_expr->getColumnName(); + + return actions_with_column_name; + } + + Block buildBlockWithoutPartitionColumns( + const Block & sample_block, + const std::unordered_set & partition_expression_required_columns_set) + { + Block result; + for (size_t i = 0; i < sample_block.columns(); i++) + { + if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) + { + result.insert(sample_block.getByPosition(i)); + } + } + + return result; + } } PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) @@ -40,7 +103,8 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style) + const std::string & partitioning_style, + bool write_partition_columns_into_files) { if (partitioning_style == "hive") { @@ -48,7 +112,13 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti { throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning"); } - return std::make_shared(partition_by, sample_block, context, file_format); + + return std::make_shared( + partition_by, + sample_block, + context, + file_format, + write_partition_columns_into_files); } return std::make_shared(partition_by, sample_block, context); @@ -79,50 +149,24 @@ std::string StringfiedPartitionStrategy::getPath( return PartitionedSink::replaceWildcards(prefix, partition_key); } -HiveStylePartitionStrategy::HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_) -: PartitionStrategy(partition_by_, sample_block_, context_), file_format(file_format_) +HiveStylePartitionStrategy::HiveStylePartitionStrategy( + ASTPtr partition_by_, + const Block & sample_block_, + ContextPtr context_, + const std::string & file_format_, + bool write_partition_columns_into_files_) + : PartitionStrategy(partition_by_, sample_block_, context_), + file_format(file_format_), + write_partition_columns_into_files(write_partition_columns_into_files_), + partition_expression_required_columns(extractPartitionRequiredColumns(partition_by, sample_block, context)), + partition_expression_required_columns_set(partition_expression_required_columns.begin(), partition_expression_required_columns.end()) { - + actions_with_column_name = buildExpressionHive(partition_by, partition_expression_required_columns, sample_block, context); + block_without_partition_columns = buildBlockWithoutPartitionColumns(sample_block, partition_expression_required_columns_set); } HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName HiveStylePartitionStrategy::getExpression() { - PartitionExpressionActionsAndColumnName 
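buildBlockWithoutPartitionColumns keeps only the columns the partition expression does not consume, which is what the written files contain by default under hive-style writes. The filtering step, reduced to plain strings standing in for Block columns (assumed names; unordered_set::contains needs C++20):

    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    int main()
    {
        const std::vector<std::string> sample_block = {"year", "country", "counter"};
        const std::unordered_set<std::string> partition_columns = {"year", "country"};

        std::vector<std::string> file_columns;
        for (const auto & name : sample_block)
            if (!partition_columns.contains(name))
                file_columns.push_back(name);

        for (const auto & name : file_columns)
            std::cout << name << '\n';  // only "counter" remains
    }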
actions_with_column_name; - - const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); - ASTs concat_args; - - if (const auto * tuple_function = partition_by->as(); - tuple_function && tuple_function->name == "tuple") - { - chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); - - for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) - { - const auto & child = tuple_function->arguments->children[i]; - - concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); - - concat_args.push_back(makeASTFunction("toString", child)); - - concat_args.push_back(std::make_shared("/")); - } - } - else - { - chassert(partition_expression_required_columns.size() == 1); - - ASTs to_string_args = {1, partition_by}; - concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); - concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); - concat_args.push_back(std::make_shared("/")); - } - - ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); - auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); - actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); - actions_with_column_name.column_name = hive_expr->getColumnName(); - return actions_with_column_name; } @@ -147,4 +191,41 @@ std::string HiveStylePartitionStrategy::getPath( return path + partition_key + "/" + std::to_string(generateSnowflakeID()) + "." + Poco::toLower(file_format); } +Chunk HiveStylePartitionStrategy::getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) +{ + Chunk result; + + if (write_partition_columns_into_files) + { + for (const auto & column : chunk.getColumns()) + { + result.addColumn(column); + } + + return result; + } + + chassert(chunk.getColumns().size() == sample_block.columns()); + + for (size_t i = 0; i < sample_block.columns(); i++) + { + if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) + { + result.addColumn(chunk.getColumns()[i]); + } + } + + return result; +} + +Block HiveStylePartitionStrategy::getBlockWithoutPartitionColumnsIfNeeded() +{ + if (write_partition_columns_into_files) + { + return sample_block; + } + + return block_without_partition_columns; +} + } diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index eecca53a36fe..6fc9006cf515 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -3,6 +3,8 @@ #include #include +#include + namespace DB { @@ -21,6 +23,23 @@ struct PartitionStrategy virtual PartitionExpressionActionsAndColumnName getExpression() = 0; virtual std::string getPath(const std::string & prefix, const std::string & partition_key) = 0; + virtual Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) + { + Chunk result; + + for (const auto & column : chunk.getColumns()) + { + result.addColumn(column); + } + + return result; + } + + virtual Block getBlockWithoutPartitionColumnsIfNeeded() + { + return sample_block; + } + protected: ASTPtr partition_by; Block sample_block; @@ -34,10 +53,11 @@ struct PartitionStrategyProvider const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style = ""); + const std::string & partitioning_style = "", + bool omit_partition_columns_from_file = true); }; 
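Note the declaration drift in this revision: the header defaults a parameter named omit_partition_columns_from_file to true, while the definition and call sites use write_partition_columns_into_files defaulting to false; [PATCH 30/42] below renames the header parameter to match. The provider itself is a small dispatch on partitioning_style; a simplified stand-in with the types reduced to empty structs:

    #include <memory>
    #include <stdexcept>
    #include <string>

    struct Strategy { virtual ~Strategy() = default; };
    struct StringifiedStrategy : Strategy {};
    struct HiveStyleStrategy : Strategy
    {
        explicit HiveStyleStrategy(const std::string & file_format)
        {
            // Mirrors the guard in PartitionStrategyProvider::get.
            if (file_format.empty())
                throw std::logic_error("File format can't be empty for hive style partitioning");
        }
    };

    std::shared_ptr<Strategy> getStrategy(const std::string & partitioning_style, const std::string & file_format)
    {
        if (partitioning_style == "hive")
            return std::make_shared<HiveStyleStrategy>(file_format);
        return std::make_shared<StringifiedStrategy>();
    }

    int main()
    {
        auto strategy = getStrategy("hive", "Parquet");
        (void)strategy;
    }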
-struct StringfiedPartitionStrategy : public PartitionStrategy +struct StringfiedPartitionStrategy : PartitionStrategy { StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); @@ -45,15 +65,27 @@ struct StringfiedPartitionStrategy : public PartitionStrategy std::string getPath(const std::string & prefix, const std::string & partition_key) override; }; -struct HiveStylePartitionStrategy : public PartitionStrategy +struct HiveStylePartitionStrategy : PartitionStrategy { - HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_); + HiveStylePartitionStrategy( + ASTPtr partition_by_, + const Block & sample_block_, + ContextPtr context_, + const std::string & file_format_, + bool write_partition_columns_into_files_); PartitionExpressionActionsAndColumnName getExpression() override; std::string getPath(const std::string & prefix, const std::string & partition_key) override; + Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) override; + Block getBlockWithoutPartitionColumnsIfNeeded() override; private: std::string file_format; + bool write_partition_columns_into_files; + Names partition_expression_required_columns; + std::unordered_set partition_expression_required_columns_set; + PartitionExpressionActionsAndColumnName actions_with_column_name; + Block block_without_partition_columns; }; } diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index a3f3945a45d1..ec99748f4475 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -50,16 +50,20 @@ SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) return it->second; } -void PartitionedSink::consume(Chunk & chunk) +void PartitionedSink::consume(Chunk & input_chunk) { - const auto & columns = chunk.getColumns(); - Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); - block_with_partition_by_expr.setColumns(columns); + block_with_partition_by_expr.setColumns(input_chunk.getColumns()); partition_by_expr->execute(block_with_partition_by_expr); const auto * partition_by_result_column = block_with_partition_by_expr.getByName(partition_by_column_name).column.get(); + /* + * `write_partition_columns_into_files` + */ + const auto chunk = getPartitionStrategy()->getChunkWithoutPartitionColumnsIfNeeded(input_chunk); + const auto & columns_to_consume = chunk.getColumns(); + size_t chunk_rows = chunk.getNumRows(); chunk_row_index_to_partition_index.resize(chunk_rows); @@ -75,7 +79,7 @@ void PartitionedSink::consume(Chunk & chunk) chunk_row_index_to_partition_index[row] = it->getMapped(); } - size_t columns_size = columns.size(); + size_t columns_size = columns_to_consume.size(); size_t partitions_size = partition_id_to_chunk_index.size(); Chunks partition_index_to_chunk; @@ -83,7 +87,7 @@ void PartitionedSink::consume(Chunk & chunk) for (size_t column_index = 0; column_index < columns_size; ++column_index) { - MutableColumns partition_index_to_column_split = columns[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); + MutableColumns partition_index_to_column_split = columns_to_consume[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); /// Add chunks into partition_index_to_chunk with sizes of result columns if (column_index == 0) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.reference 
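PartitionedSink::consume first maps every row to a partition index via the evaluated partition expression, then splits each column once with IColumn::scatter. The selector-based split, reduced to plain vectors (illustrative data):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<int> column           = {1, 2, 3, 4};
        std::vector<std::size_t> selector = {0, 1, 0, 1};  // row -> partition index
        const std::size_t partitions      = 2;

        std::vector<std::vector<int>> split(partitions);
        for (std::size_t row = 0; row < column.size(); ++row)
            split[selector[row]].push_back(column[row]);

        for (std::size_t p = 0; p < partitions; ++p)
        {
            std::cout << "partition " << p << ':';
            for (int v : split[p])
                std::cout << ' ' << v;
            std::cout << '\n';
        }
    }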
b/tests/queries/0_stateless/03363_hive_style_partition_write.reference index f85610ca7313..1daa2f76501b 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.reference +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.reference @@ -7,8 +7,7 @@ test/t_03363_parquet/year=2024/country=Germany/.parquet 6 test/t_03363_parquet/year=2024/country=Germany/.parquet 7 test/t_03363_parquet/year=1999/country=Brazil/.parquet 8 test/t_03363_parquet/year=2100/country=Japan/.parquet 9 -test/t_03363_parquet/year=2024/country=/.parquet 10 -test/t_03363_parquet/year=2024/country=CN/.parquet 11 +test/t_03363_parquet/year=2024/country=CN/.parquet 10 test/t_03363_csv/year=2022/country=USA/.csv 1 test/t_03363_csv/year=2022/country=Canada/.csv 2 test/t_03363_csv/year=2023/country=USA/.csv 3 @@ -18,8 +17,7 @@ test/t_03363_csv/year=2024/country=Germany/.csv 6 test/t_03363_csv/year=2024/country=Germany/.csv 7 test/t_03363_csv/year=1999/country=Brazil/.csv 8 test/t_03363_csv/year=2100/country=Japan/.csv 9 -test/t_03363_csv/year=2024/country=/.csv 10 -test/t_03363_csv/year=2024/country=CN/.csv 11 +test/t_03363_csv/year=2024/country=CN/.csv 10 test/t_03363_function/year=2022/country=USA/.parquet 1 test/t_03363_function/year=2022/country=Canada/.parquet 2 test/t_03363_function/year=2023/country=USA/.parquet 3 @@ -29,5 +27,4 @@ test/t_03363_function/year=2024/country=Germany/.parquet 6 test/t_03363_function/year=2024/country=Germany/.parquet 7 test/t_03363_function/year=1999/country=Brazil/.parquet 8 test/t_03363_function/year=2100/country=Japan/.parquet 9 -test/t_03363_function/year=2024/country=/.parquet 10 -test/t_03363_function/year=2024/country=CN/.parquet 11 +test/t_03363_function/year=2024/country=CN/.parquet 10 diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index f838d90b6c07..b71a8502b396 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -6,7 +6,7 @@ CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_parquet SETTINGS s3_truncate_on_insert=1 VALUES +INSERT INTO t_03363_parquet VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -16,8 +16,7 @@ INSERT INTO t_03363_parquet SETTINGS s3_truncate_on_insert=1 VALUES (2024, 'Germany', 7), (1999, 'Brazil', 8), (2100, 'Japan', 9), - (2024, '', 10), - (2024, 'CN', 11); + (2024, 'CN', 10); -- while reading from object storage partitioned table is not supported, an auxiliary table must be created CREATE TABLE t_03363_parquet_read (year UInt16, country String, counter UInt8) @@ -31,7 +30,7 @@ CREATE TABLE t_03363_csv (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_csv', format = CSV, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_csv SETTINGS s3_truncate_on_insert=1 VALUES +INSERT INTO t_03363_csv VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -41,17 +40,16 @@ INSERT INTO t_03363_csv SETTINGS s3_truncate_on_insert=1 VALUES (2024, 'Germany', 7), (1999, 'Brazil', 8), (2100, 'Japan', 9), - (2024, '', 10), - (2024, 'CN', 11); + (2024, 'CN', 10); CREATE TABLE t_03363_csv_read (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_csv/**.csv', format = CSV); 
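The object keys checked in this test come from the hive strategy's getPath, which joins the prefix, the partition key, a snowflake-ID file name, and the lower-cased format extension. A rough stand-in (generateSnowflakeID replaced by a fixed id; the real code's slash handling may differ):

    #include <cctype>
    #include <cstdint>
    #include <iostream>
    #include <string>

    std::string makePath(const std::string & prefix, const std::string & partition_key,
                         std::uint64_t snowflake_id, std::string file_format)
    {
        for (auto & c : file_format)
            c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        return prefix + "/" + partition_key + "/" + std::to_string(snowflake_id) + "." + file_format;
    }

    int main()
    {
        std::cout << makePath("test/t_03363_csv", "year=2024/country=CN", 42, "CSV") << '\n';
        // test/t_03363_csv/year=2024/country=CN/42.csv
    }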
-select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter SETTINGS use_hive_partitioning=1; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter; -- s3 table function -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT * FROM t_03363_parquet_read; -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter SETTINGS use_hive_partitioning=1; +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter; -- hive with partition id placeholder CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) From 6a6c9a744de2f39e63486c18c6474ac28954bf4a Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 2 Apr 2025 08:05:25 -0300 Subject: [PATCH 28/42] Revert "write_partition_columns_into_files argument" This reverts commit abdd84aa063c8b2035592258c91ae296a05290b5. --- .../ObjectStorage/S3/Configuration.cpp | 25 +-- .../ObjectStorage/StorageObjectStorage.cpp | 8 +- .../ObjectStorage/StorageObjectStorage.h | 4 - .../StorageObjectStorageSink.cpp | 2 +- src/Storages/PartitionStrategy.cpp | 171 +++++------------- src/Storages/PartitionStrategy.h | 40 +--- src/Storages/PartitionedSink.cpp | 16 +- ...03363_hive_style_partition_write.reference | 9 +- .../03363_hive_style_partition_write.sql | 16 +- 9 files changed, 86 insertions(+), 205 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 03ea400933f3..40c5ad44ffcc 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -55,8 +55,7 @@ namespace ErrorCodes namespace { - template - std::optional extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position) + std::optional extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position) { for (auto arg_it = arguments.begin(); arg_it != arguments.end(); ++arg_it) { @@ -82,20 +81,27 @@ namespace name->name()); } + if (value->value.getType() != Field::Types::String) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Wrong parameter type for '{}'", + name->name()); + } + argument_position = arg_it; - return value->value.safeGet(); + return value->value.safeGet(); } } return std::nullopt; } - template - std::optional extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name) + std::optional extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name) { ASTs::iterator iterator; - auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator); + auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator); if (named_arg_opt) { @@ -128,8 +134,7 @@ static const std::unordered_set optional_configuration_keys = "max_connections", "expiration_window_seconds", "no_sign_request", - "partitioning_style", - 
"write_partition_columns_into_files" + "partitioning_style" }; String StorageS3Configuration::getDataSourceDescription() const @@ -228,7 +233,6 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect url = S3::URI(collection.get("url"), settings[Setting::allow_archive_path_syntax]); partitioning_style = collection.getOrDefault("partitioning_style", "auto"); - write_partition_columns_into_files = collection.getOrDefault("write_partition_columns_into_files", false); auth_settings[S3AuthSetting::access_key_id] = collection.getOrDefault("access_key_id", ""); auth_settings[S3AuthSetting::secret_access_key] = collection.getOrDefault("secret_access_key", ""); @@ -251,8 +255,7 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ { size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context); - partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto"); - write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "write_partition_columns_into_files").value_or(false); + partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto"); if (count == 0 || count > getMaxNumberOfArguments(with_structure)) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index f43a6b1b301c..21df6c8f9d9b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -457,13 +457,7 @@ SinkToStoragePtr StorageObjectStorage::write( if (ASTPtr partition_by_ast = getPartitionByAst(partition_by, query, configuration)) { - auto partition_strategy = PartitionStrategyProvider::get( - partition_by_ast, - sample_block, - local_context, - configuration->format, - configuration->partitioning_style, - configuration->write_partition_columns_into_files); + auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format, configuration->partitioning_style); return std::make_shared( partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index cfb71b112c91..532273871bce 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -245,10 +245,6 @@ class StorageObjectStorage::Configuration String compression_method = "auto"; String structure = "auto"; std::string partitioning_style = "auto"; - /* - * Only supported by hive partitioning style for now - */ - bool write_partition_columns_into_files = false; virtual void update(ObjectStoragePtr object_storage, ContextPtr local_context); void updateIfRequired(ObjectStoragePtr object_storage, ContextPtr local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index ab66606b3020..459a498aa2f4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -164,7 +164,7 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String object_storage, configuration, format_settings, - getPartitionStrategy()->getBlockWithoutPartitionColumnsIfNeeded(), + sample_block, context, 
file_path ); diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 9736728142e2..1949479abda7 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -6,11 +6,8 @@ #include #include #include -#include -#include -#include -#include -#include + +#include "Interpreters/ExternalLoaderDictionaryStorageConfigRepository.h" namespace DB { @@ -21,76 +18,16 @@ extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } + namespace { Names extractPartitionRequiredColumns(ASTPtr & partition_by, const Block & sample_block, ContextPtr context) { auto pby_clone = partition_by->clone(); - // sample_block columns might be out of order, and we need to respect the partition by expression order - auto key_description = KeyDescription::getKeyFromAST(pby_clone, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context); - auto syntax_result = TreeRewriter(context).analyze(pby_clone, key_description.sample_block.getNamesAndTypesList()); + auto syntax_result = TreeRewriter(context).analyze(pby_clone, sample_block.getNamesAndTypesList()); auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); return exp_analyzer->getRequiredColumns(); } - - HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName buildExpressionHive( - ASTPtr partition_by, - const Names & partition_expression_required_columns, - const Block & sample_block, - ContextPtr context) - { - HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; - ASTs concat_args; - - if (const auto * tuple_function = partition_by->as(); - tuple_function && tuple_function->name == "tuple") - { - chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); - - for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) - { - const auto & child = tuple_function->arguments->children[i]; - - concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); - - concat_args.push_back(makeASTFunction("toString", child)); - - concat_args.push_back(std::make_shared("/")); - } - } - else - { - chassert(partition_expression_required_columns.size() == 1); - - ASTs to_string_args = {1, partition_by}; - concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); - concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); - concat_args.push_back(std::make_shared("/")); - } - - ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); - auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); - actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); - actions_with_column_name.column_name = hive_expr->getColumnName(); - - return actions_with_column_name; - } - - Block buildBlockWithoutPartitionColumns( - const Block & sample_block, - const std::unordered_set & partition_expression_required_columns_set) - { - Block result; - for (size_t i = 0; i < sample_block.columns(); i++) - { - if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) - { - result.insert(sample_block.getByPosition(i)); - } - } - - return result; - } } PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) @@ -103,8 +40,7 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti const Block & 
sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style, - bool write_partition_columns_into_files) + const std::string & partitioning_style) { if (partitioning_style == "hive") { @@ -112,13 +48,7 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti { throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning"); } - - return std::make_shared( - partition_by, - sample_block, - context, - file_format, - write_partition_columns_into_files); + return std::make_shared(partition_by, sample_block, context, file_format); } return std::make_shared(partition_by, sample_block, context); @@ -149,24 +79,50 @@ std::string StringfiedPartitionStrategy::getPath( return PartitionedSink::replaceWildcards(prefix, partition_key); } -HiveStylePartitionStrategy::HiveStylePartitionStrategy( - ASTPtr partition_by_, - const Block & sample_block_, - ContextPtr context_, - const std::string & file_format_, - bool write_partition_columns_into_files_) - : PartitionStrategy(partition_by_, sample_block_, context_), - file_format(file_format_), - write_partition_columns_into_files(write_partition_columns_into_files_), - partition_expression_required_columns(extractPartitionRequiredColumns(partition_by, sample_block, context)), - partition_expression_required_columns_set(partition_expression_required_columns.begin(), partition_expression_required_columns.end()) +HiveStylePartitionStrategy::HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_) +: PartitionStrategy(partition_by_, sample_block_, context_), file_format(file_format_) { - actions_with_column_name = buildExpressionHive(partition_by, partition_expression_required_columns, sample_block, context); - block_without_partition_columns = buildBlockWithoutPartitionColumns(sample_block, partition_expression_required_columns_set); + } HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName HiveStylePartitionStrategy::getExpression() { + PartitionExpressionActionsAndColumnName actions_with_column_name; + + const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); + ASTs concat_args; + + if (const auto * tuple_function = partition_by->as(); + tuple_function && tuple_function->name == "tuple") + { + chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); + + for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) + { + const auto & child = tuple_function->arguments->children[i]; + + concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); + + concat_args.push_back(makeASTFunction("toString", child)); + + concat_args.push_back(std::make_shared("/")); + } + } + else + { + chassert(partition_expression_required_columns.size() == 1); + + ASTs to_string_args = {1, partition_by}; + concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); + concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); + concat_args.push_back(std::make_shared("/")); + } + + ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); + auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); + actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); + actions_with_column_name.column_name = 
hive_expr->getColumnName(); + return actions_with_column_name; } @@ -191,41 +147,4 @@ std::string HiveStylePartitionStrategy::getPath( return path + partition_key + "/" + std::to_string(generateSnowflakeID()) + "." + Poco::toLower(file_format); } -Chunk HiveStylePartitionStrategy::getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) -{ - Chunk result; - - if (write_partition_columns_into_files) - { - for (const auto & column : chunk.getColumns()) - { - result.addColumn(column); - } - - return result; - } - - chassert(chunk.getColumns().size() == sample_block.columns()); - - for (size_t i = 0; i < sample_block.columns(); i++) - { - if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) - { - result.addColumn(chunk.getColumns()[i]); - } - } - - return result; -} - -Block HiveStylePartitionStrategy::getBlockWithoutPartitionColumnsIfNeeded() -{ - if (write_partition_columns_into_files) - { - return sample_block; - } - - return block_without_partition_columns; -} - } diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 6fc9006cf515..eecca53a36fe 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -3,8 +3,6 @@ #include #include -#include - namespace DB { @@ -23,23 +21,6 @@ struct PartitionStrategy virtual PartitionExpressionActionsAndColumnName getExpression() = 0; virtual std::string getPath(const std::string & prefix, const std::string & partition_key) = 0; - virtual Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) - { - Chunk result; - - for (const auto & column : chunk.getColumns()) - { - result.addColumn(column); - } - - return result; - } - - virtual Block getBlockWithoutPartitionColumnsIfNeeded() - { - return sample_block; - } - protected: ASTPtr partition_by; Block sample_block; @@ -53,11 +34,10 @@ struct PartitionStrategyProvider const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style = "", - bool omit_partition_columns_from_file = true); + const std::string & partitioning_style = ""); }; -struct StringfiedPartitionStrategy : PartitionStrategy +struct StringfiedPartitionStrategy : public PartitionStrategy { StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); @@ -65,27 +45,15 @@ struct StringfiedPartitionStrategy : PartitionStrategy std::string getPath(const std::string & prefix, const std::string & partition_key) override; }; -struct HiveStylePartitionStrategy : PartitionStrategy +struct HiveStylePartitionStrategy : public PartitionStrategy { - HiveStylePartitionStrategy( - ASTPtr partition_by_, - const Block & sample_block_, - ContextPtr context_, - const std::string & file_format_, - bool write_partition_columns_into_files_); + HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_); PartitionExpressionActionsAndColumnName getExpression() override; std::string getPath(const std::string & prefix, const std::string & partition_key) override; - Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) override; - Block getBlockWithoutPartitionColumnsIfNeeded() override; private: std::string file_format; - bool write_partition_columns_into_files; - Names partition_expression_required_columns; - std::unordered_set partition_expression_required_columns_set; - PartitionExpressionActionsAndColumnName actions_with_column_name; - Block 
block_without_partition_columns; }; } diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index ec99748f4475..a3f3945a45d1 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -50,20 +50,16 @@ SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) return it->second; } -void PartitionedSink::consume(Chunk & input_chunk) +void PartitionedSink::consume(Chunk & chunk) { + const auto & columns = chunk.getColumns(); + Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); - block_with_partition_by_expr.setColumns(input_chunk.getColumns()); + block_with_partition_by_expr.setColumns(columns); partition_by_expr->execute(block_with_partition_by_expr); const auto * partition_by_result_column = block_with_partition_by_expr.getByName(partition_by_column_name).column.get(); - /* - * `write_partition_columns_into_files` - */ - const auto chunk = getPartitionStrategy()->getChunkWithoutPartitionColumnsIfNeeded(input_chunk); - const auto & columns_to_consume = chunk.getColumns(); - size_t chunk_rows = chunk.getNumRows(); chunk_row_index_to_partition_index.resize(chunk_rows); @@ -79,7 +75,7 @@ void PartitionedSink::consume(Chunk & input_chunk) chunk_row_index_to_partition_index[row] = it->getMapped(); } - size_t columns_size = columns_to_consume.size(); + size_t columns_size = columns.size(); size_t partitions_size = partition_id_to_chunk_index.size(); Chunks partition_index_to_chunk; @@ -87,7 +83,7 @@ void PartitionedSink::consume(Chunk & input_chunk) for (size_t column_index = 0; column_index < columns_size; ++column_index) { - MutableColumns partition_index_to_column_split = columns_to_consume[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); + MutableColumns partition_index_to_column_split = columns[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); /// Add chunks into partition_index_to_chunk with sizes of result columns if (column_index == 0) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.reference b/tests/queries/0_stateless/03363_hive_style_partition_write.reference index 1daa2f76501b..f85610ca7313 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.reference +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.reference @@ -7,7 +7,8 @@ test/t_03363_parquet/year=2024/country=Germany/.parquet 6 test/t_03363_parquet/year=2024/country=Germany/.parquet 7 test/t_03363_parquet/year=1999/country=Brazil/.parquet 8 test/t_03363_parquet/year=2100/country=Japan/.parquet 9 -test/t_03363_parquet/year=2024/country=CN/.parquet 10 +test/t_03363_parquet/year=2024/country=/.parquet 10 +test/t_03363_parquet/year=2024/country=CN/.parquet 11 test/t_03363_csv/year=2022/country=USA/.csv 1 test/t_03363_csv/year=2022/country=Canada/.csv 2 test/t_03363_csv/year=2023/country=USA/.csv 3 @@ -17,7 +18,8 @@ test/t_03363_csv/year=2024/country=Germany/.csv 6 test/t_03363_csv/year=2024/country=Germany/.csv 7 test/t_03363_csv/year=1999/country=Brazil/.csv 8 test/t_03363_csv/year=2100/country=Japan/.csv 9 -test/t_03363_csv/year=2024/country=CN/.csv 10 +test/t_03363_csv/year=2024/country=/.csv 10 +test/t_03363_csv/year=2024/country=CN/.csv 11 test/t_03363_function/year=2022/country=USA/.parquet 1 test/t_03363_function/year=2022/country=Canada/.parquet 2 test/t_03363_function/year=2023/country=USA/.parquet 3 @@ -27,4 +29,5 @@ test/t_03363_function/year=2024/country=Germany/.parquet 6 
test/t_03363_function/year=2024/country=Germany/.parquet 7 test/t_03363_function/year=1999/country=Brazil/.parquet 8 test/t_03363_function/year=2100/country=Japan/.parquet 9 -test/t_03363_function/year=2024/country=CN/.parquet 10 +test/t_03363_function/year=2024/country=/.parquet 10 +test/t_03363_function/year=2024/country=CN/.parquet 11 diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index b71a8502b396..f838d90b6c07 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -6,7 +6,7 @@ CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_parquet VALUES +INSERT INTO t_03363_parquet SETTINGS s3_truncate_on_insert=1 VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -16,7 +16,8 @@ INSERT INTO t_03363_parquet VALUES (2024, 'Germany', 7), (1999, 'Brazil', 8), (2100, 'Japan', 9), - (2024, 'CN', 10); + (2024, '', 10), + (2024, 'CN', 11); -- while reading from object storage partitioned table is not supported, an auxiliary table must be created CREATE TABLE t_03363_parquet_read (year UInt16, country String, counter UInt8) @@ -30,7 +31,7 @@ CREATE TABLE t_03363_csv (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_csv', format = CSV, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_csv VALUES +INSERT INTO t_03363_csv SETTINGS s3_truncate_on_insert=1 VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -40,16 +41,17 @@ INSERT INTO t_03363_csv VALUES (2024, 'Germany', 7), (1999, 'Brazil', 8), (2100, 'Japan', 9), - (2024, 'CN', 10); + (2024, '', 10), + (2024, 'CN', 11); CREATE TABLE t_03363_csv_read (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_csv/**.csv', format = CSV); -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter SETTINGS use_hive_partitioning=1; -- s3 table function -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter; +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT * FROM t_03363_parquet_read; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter SETTINGS use_hive_partitioning=1; -- hive with partition id placeholder CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) From 8b73bb5312f73703a9db079c3fba4c3dd780cc8e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 2 Apr 2025 08:08:29 -0300 Subject: [PATCH 29/42] Reapply "write_partition_columns_into_files argument" This reverts commit 6a6c9a744de2f39e63486c18c6474ac28954bf4a. 
--- .../ObjectStorage/S3/Configuration.cpp | 25 ++- .../ObjectStorage/StorageObjectStorage.cpp | 8 +- .../ObjectStorage/StorageObjectStorage.h | 4 + .../StorageObjectStorageSink.cpp | 2 +- src/Storages/PartitionStrategy.cpp | 171 +++++++++++++----- src/Storages/PartitionStrategy.h | 40 +++- src/Storages/PartitionedSink.cpp | 16 +- ...03363_hive_style_partition_write.reference | 9 +- .../03363_hive_style_partition_write.sql | 16 +- 9 files changed, 205 insertions(+), 86 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 40c5ad44ffcc..03ea400933f3 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -55,7 +55,8 @@ namespace ErrorCodes namespace { - std::optional extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position) + template + std::optional extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position) { for (auto arg_it = arguments.begin(); arg_it != arguments.end(); ++arg_it) { @@ -81,27 +82,20 @@ namespace name->name()); } - if (value->value.getType() != Field::Types::String) - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Wrong parameter type for '{}'", - name->name()); - } - argument_position = arg_it; - return value->value.safeGet(); + return value->value.safeGet(); } } return std::nullopt; } - std::optional extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name) + template + std::optional extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name) { ASTs::iterator iterator; - auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator); + auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator); if (named_arg_opt) { @@ -134,7 +128,8 @@ static const std::unordered_set optional_configuration_keys = "max_connections", "expiration_window_seconds", "no_sign_request", - "partitioning_style" + "partitioning_style", + "write_partition_columns_into_files" }; String StorageS3Configuration::getDataSourceDescription() const @@ -233,6 +228,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect url = S3::URI(collection.get("url"), settings[Setting::allow_archive_path_syntax]); partitioning_style = collection.getOrDefault("partitioning_style", "auto"); + write_partition_columns_into_files = collection.getOrDefault("write_partition_columns_into_files", false); auth_settings[S3AuthSetting::access_key_id] = collection.getOrDefault("access_key_id", ""); auth_settings[S3AuthSetting::secret_access_key] = collection.getOrDefault("secret_access_key", ""); @@ -255,7 +251,8 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ { size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context); - partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto"); + partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto"); + write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "write_partition_columns_into_files").value_or(false); if (count == 0 || count > getMaxNumberOfArguments(with_structure)) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp 
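Templating extractNamedArgument lets the same helper pull values of different Field types, which is what admits the boolean write_partition_columns_into_files argument alongside the string partitioning_style. The extract-and-remove behaviour, sketched over plain name/value pairs (ASTs and Field omitted; the real helper is generic over the value type):

    #include <iostream>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    using Args = std::vector<std::pair<std::string, std::string>>;

    std::optional<std::string> extractAndRemove(Args & args, const std::string & name)
    {
        for (auto it = args.begin(); it != args.end(); ++it)
        {
            if (it->first == name)
            {
                auto value = it->second;
                args.erase(it);  // named args must not reach positional parsing
                return value;
            }
        }
        return std::nullopt;
    }

    int main()
    {
        Args args = {{"partitioning_style", "hive"}, {"write_partition_columns_into_files", "1"}};
        auto style = extractAndRemove(args, "partitioning_style").value_or("auto");
        std::cout << style << ' ' << args.size() << '\n';  // hive 1
    }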
b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 21df6c8f9d9b..f43a6b1b301c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -457,7 +457,13 @@ SinkToStoragePtr StorageObjectStorage::write( if (ASTPtr partition_by_ast = getPartitionByAst(partition_by, query, configuration)) { - auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, sample_block, local_context, configuration->format, configuration->partitioning_style); + auto partition_strategy = PartitionStrategyProvider::get( + partition_by_ast, + sample_block, + local_context, + configuration->format, + configuration->partitioning_style, + configuration->write_partition_columns_into_files); return std::make_shared( partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 532273871bce..cfb71b112c91 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -245,6 +245,10 @@ class StorageObjectStorage::Configuration String compression_method = "auto"; String structure = "auto"; std::string partitioning_style = "auto"; + /* + * Only supported by hive partitioning style for now + */ + bool write_partition_columns_into_files = false; virtual void update(ObjectStoragePtr object_storage, ContextPtr local_context); void updateIfRequired(ObjectStoragePtr object_storage, ContextPtr local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 459a498aa2f4..ab66606b3020 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -164,7 +164,7 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String object_storage, configuration, format_settings, - sample_block, + getPartitionStrategy()->getBlockWithoutPartitionColumnsIfNeeded(), context, file_path ); diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 1949479abda7..9736728142e2 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -6,8 +6,11 @@ #include #include #include - -#include "Interpreters/ExternalLoaderDictionaryStorageConfigRepository.h" +#include +#include +#include +#include +#include namespace DB { @@ -18,16 +21,76 @@ extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } - namespace { Names extractPartitionRequiredColumns(ASTPtr & partition_by, const Block & sample_block, ContextPtr context) { auto pby_clone = partition_by->clone(); - auto syntax_result = TreeRewriter(context).analyze(pby_clone, sample_block.getNamesAndTypesList()); + // sample_block columns might be out of order, and we need to respect the partition by expression order + auto key_description = KeyDescription::getKeyFromAST(pby_clone, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context); + auto syntax_result = TreeRewriter(context).analyze(pby_clone, key_description.sample_block.getNamesAndTypesList()); auto exp_analyzer = ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); return exp_analyzer->getRequiredColumns(); } + + HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName buildExpressionHive( + ASTPtr partition_by, + const Names & 
partition_expression_required_columns, + const Block & sample_block, + ContextPtr context) + { + HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; + ASTs concat_args; + + if (const auto * tuple_function = partition_by->as(); + tuple_function && tuple_function->name == "tuple") + { + chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); + + for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) + { + const auto & child = tuple_function->arguments->children[i]; + + concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); + + concat_args.push_back(makeASTFunction("toString", child)); + + concat_args.push_back(std::make_shared("/")); + } + } + else + { + chassert(partition_expression_required_columns.size() == 1); + + ASTs to_string_args = {1, partition_by}; + concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); + concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); + concat_args.push_back(std::make_shared("/")); + } + + ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); + auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); + actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); + actions_with_column_name.column_name = hive_expr->getColumnName(); + + return actions_with_column_name; + } + + Block buildBlockWithoutPartitionColumns( + const Block & sample_block, + const std::unordered_set & partition_expression_required_columns_set) + { + Block result; + for (size_t i = 0; i < sample_block.columns(); i++) + { + if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) + { + result.insert(sample_block.getByPosition(i)); + } + } + + return result; + } } PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) @@ -40,7 +103,8 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style) + const std::string & partitioning_style, + bool write_partition_columns_into_files) { if (partitioning_style == "hive") { @@ -48,7 +112,13 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti { throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning"); } - return std::make_shared(partition_by, sample_block, context, file_format); + + return std::make_shared( + partition_by, + sample_block, + context, + file_format, + write_partition_columns_into_files); } return std::make_shared(partition_by, sample_block, context); @@ -79,50 +149,24 @@ std::string StringfiedPartitionStrategy::getPath( return PartitionedSink::replaceWildcards(prefix, partition_key); } -HiveStylePartitionStrategy::HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_) -: PartitionStrategy(partition_by_, sample_block_, context_), file_format(file_format_) +HiveStylePartitionStrategy::HiveStylePartitionStrategy( + ASTPtr partition_by_, + const Block & sample_block_, + ContextPtr context_, + const std::string & file_format_, + bool write_partition_columns_into_files_) + : PartitionStrategy(partition_by_, sample_block_, context_), + file_format(file_format_), + 
write_partition_columns_into_files(write_partition_columns_into_files_), + partition_expression_required_columns(extractPartitionRequiredColumns(partition_by, sample_block, context)), + partition_expression_required_columns_set(partition_expression_required_columns.begin(), partition_expression_required_columns.end()) { - + actions_with_column_name = buildExpressionHive(partition_by, partition_expression_required_columns, sample_block, context); + block_without_partition_columns = buildBlockWithoutPartitionColumns(sample_block, partition_expression_required_columns_set); } HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName HiveStylePartitionStrategy::getExpression() { - PartitionExpressionActionsAndColumnName actions_with_column_name; - - const Names partition_expression_required_columns = extractPartitionRequiredColumns(partition_by, sample_block, context); - ASTs concat_args; - - if (const auto * tuple_function = partition_by->as(); - tuple_function && tuple_function->name == "tuple") - { - chassert(tuple_function->arguments->children.size() == partition_expression_required_columns.size()); - - for (size_t i = 0; i < tuple_function->arguments->children.size(); i++) - { - const auto & child = tuple_function->arguments->children[i]; - - concat_args.push_back(std::make_shared(partition_expression_required_columns[i] + "=")); - - concat_args.push_back(makeASTFunction("toString", child)); - - concat_args.push_back(std::make_shared("/")); - } - } - else - { - chassert(partition_expression_required_columns.size() == 1); - - ASTs to_string_args = {1, partition_by}; - concat_args.push_back(std::make_shared(partition_expression_required_columns[0] + "=")); - concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); - concat_args.push_back(std::make_shared("/")); - } - - ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); - auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); - actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); - actions_with_column_name.column_name = hive_expr->getColumnName(); - return actions_with_column_name; } @@ -147,4 +191,41 @@ std::string HiveStylePartitionStrategy::getPath( return path + partition_key + "/" + std::to_string(generateSnowflakeID()) + "." 
+ Poco::toLower(file_format); } +Chunk HiveStylePartitionStrategy::getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) +{ + Chunk result; + + if (write_partition_columns_into_files) + { + for (const auto & column : chunk.getColumns()) + { + result.addColumn(column); + } + + return result; + } + + chassert(chunk.getColumns().size() == sample_block.columns()); + + for (size_t i = 0; i < sample_block.columns(); i++) + { + if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) + { + result.addColumn(chunk.getColumns()[i]); + } + } + + return result; +} + +Block HiveStylePartitionStrategy::getBlockWithoutPartitionColumnsIfNeeded() +{ + if (write_partition_columns_into_files) + { + return sample_block; + } + + return block_without_partition_columns; +} + } diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index eecca53a36fe..6fc9006cf515 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -3,6 +3,8 @@ #include #include +#include + namespace DB { @@ -21,6 +23,23 @@ struct PartitionStrategy virtual PartitionExpressionActionsAndColumnName getExpression() = 0; virtual std::string getPath(const std::string & prefix, const std::string & partition_key) = 0; + virtual Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) + { + Chunk result; + + for (const auto & column : chunk.getColumns()) + { + result.addColumn(column); + } + + return result; + } + + virtual Block getBlockWithoutPartitionColumnsIfNeeded() + { + return sample_block; + } + protected: ASTPtr partition_by; Block sample_block; @@ -34,10 +53,11 @@ struct PartitionStrategyProvider const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style = ""); + const std::string & partitioning_style = "", + bool omit_partition_columns_from_file = true); }; -struct StringfiedPartitionStrategy : public PartitionStrategy +struct StringfiedPartitionStrategy : PartitionStrategy { StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); @@ -45,15 +65,27 @@ struct StringfiedPartitionStrategy : public PartitionStrategy std::string getPath(const std::string & prefix, const std::string & partition_key) override; }; -struct HiveStylePartitionStrategy : public PartitionStrategy +struct HiveStylePartitionStrategy : PartitionStrategy { - HiveStylePartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_, const std::string & file_format_); + HiveStylePartitionStrategy( + ASTPtr partition_by_, + const Block & sample_block_, + ContextPtr context_, + const std::string & file_format_, + bool write_partition_columns_into_files_); PartitionExpressionActionsAndColumnName getExpression() override; std::string getPath(const std::string & prefix, const std::string & partition_key) override; + Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) override; + Block getBlockWithoutPartitionColumnsIfNeeded() override; private: std::string file_format; + bool write_partition_columns_into_files; + Names partition_expression_required_columns; + std::unordered_set partition_expression_required_columns_set; + PartitionExpressionActionsAndColumnName actions_with_column_name; + Block block_without_partition_columns; }; } diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index a3f3945a45d1..ec99748f4475 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp 
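getChunkWithoutPartitionColumnsIfNeeded is the per-chunk counterpart of the block filtering: with write_partition_columns_into_files set the chunk passes through untouched, otherwise partition columns are dropped before the sink formats the file. The toggle, reduced to named vectors (illustrative types, not ClickHouse's Chunk):

    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    struct NamedColumn
    {
        std::string name;
        std::vector<int> data;
    };

    std::vector<NamedColumn> chunkForFile(const std::vector<NamedColumn> & chunk,
                                          const std::unordered_set<std::string> & partition_columns,
                                          bool write_partition_columns_into_files)
    {
        if (write_partition_columns_into_files)
            return chunk;  // pass-through: files keep every column

        std::vector<NamedColumn> result;
        for (const auto & column : chunk)
            if (!partition_columns.contains(column.name))
                result.push_back(column);
        return result;
    }

    int main()
    {
        std::vector<NamedColumn> chunk = {{"year", {2024}}, {"country", {86}}, {"counter", {10}}};
        auto filtered = chunkForFile(chunk, {"year", "country"}, false);
        std::cout << filtered.size() << '\n';  // 1: only "counter" is written
    }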
@@ -50,16 +50,20 @@ SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) return it->second; } -void PartitionedSink::consume(Chunk & chunk) +void PartitionedSink::consume(Chunk & input_chunk) { - const auto & columns = chunk.getColumns(); - Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); - block_with_partition_by_expr.setColumns(columns); + block_with_partition_by_expr.setColumns(input_chunk.getColumns()); partition_by_expr->execute(block_with_partition_by_expr); const auto * partition_by_result_column = block_with_partition_by_expr.getByName(partition_by_column_name).column.get(); + /* + * `write_partition_columns_into_files` + */ + const auto chunk = getPartitionStrategy()->getChunkWithoutPartitionColumnsIfNeeded(input_chunk); + const auto & columns_to_consume = chunk.getColumns(); + size_t chunk_rows = chunk.getNumRows(); chunk_row_index_to_partition_index.resize(chunk_rows); @@ -75,7 +79,7 @@ void PartitionedSink::consume(Chunk & chunk) chunk_row_index_to_partition_index[row] = it->getMapped(); } - size_t columns_size = columns.size(); + size_t columns_size = columns_to_consume.size(); size_t partitions_size = partition_id_to_chunk_index.size(); Chunks partition_index_to_chunk; @@ -83,7 +87,7 @@ void PartitionedSink::consume(Chunk & chunk) for (size_t column_index = 0; column_index < columns_size; ++column_index) { - MutableColumns partition_index_to_column_split = columns[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); + MutableColumns partition_index_to_column_split = columns_to_consume[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); /// Add chunks into partition_index_to_chunk with sizes of result columns if (column_index == 0) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.reference b/tests/queries/0_stateless/03363_hive_style_partition_write.reference index f85610ca7313..1daa2f76501b 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.reference +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.reference @@ -7,8 +7,7 @@ test/t_03363_parquet/year=2024/country=Germany/.parquet 6 test/t_03363_parquet/year=2024/country=Germany/.parquet 7 test/t_03363_parquet/year=1999/country=Brazil/.parquet 8 test/t_03363_parquet/year=2100/country=Japan/.parquet 9 -test/t_03363_parquet/year=2024/country=/.parquet 10 -test/t_03363_parquet/year=2024/country=CN/.parquet 11 +test/t_03363_parquet/year=2024/country=CN/.parquet 10 test/t_03363_csv/year=2022/country=USA/.csv 1 test/t_03363_csv/year=2022/country=Canada/.csv 2 test/t_03363_csv/year=2023/country=USA/.csv 3 @@ -18,8 +17,7 @@ test/t_03363_csv/year=2024/country=Germany/.csv 6 test/t_03363_csv/year=2024/country=Germany/.csv 7 test/t_03363_csv/year=1999/country=Brazil/.csv 8 test/t_03363_csv/year=2100/country=Japan/.csv 9 -test/t_03363_csv/year=2024/country=/.csv 10 -test/t_03363_csv/year=2024/country=CN/.csv 11 +test/t_03363_csv/year=2024/country=CN/.csv 10 test/t_03363_function/year=2022/country=USA/.parquet 1 test/t_03363_function/year=2022/country=Canada/.parquet 2 test/t_03363_function/year=2023/country=USA/.parquet 3 @@ -29,5 +27,4 @@ test/t_03363_function/year=2024/country=Germany/.parquet 6 test/t_03363_function/year=2024/country=Germany/.parquet 7 test/t_03363_function/year=1999/country=Brazil/.parquet 8 test/t_03363_function/year=2100/country=Japan/.parquet 9 -test/t_03363_function/year=2024/country=/.parquet 10 -test/t_03363_function/year=2024/country=CN/.parquet 11 
+test/t_03363_function/year=2024/country=CN/.parquet 10 diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index f838d90b6c07..b71a8502b396 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -6,7 +6,7 @@ CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_parquet SETTINGS s3_truncate_on_insert=1 VALUES +INSERT INTO t_03363_parquet VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -16,8 +16,7 @@ INSERT INTO t_03363_parquet SETTINGS s3_truncate_on_insert=1 VALUES (2024, 'Germany', 7), (1999, 'Brazil', 8), (2100, 'Japan', 9), - (2024, '', 10), - (2024, 'CN', 11); + (2024, 'CN', 10); -- while reading from object storage partitioned table is not supported, an auxiliary table must be created CREATE TABLE t_03363_parquet_read (year UInt16, country String, counter UInt8) @@ -31,7 +30,7 @@ CREATE TABLE t_03363_csv (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_csv', format = CSV, partitioning_style='hive') PARTITION BY (year, country); -INSERT INTO t_03363_csv SETTINGS s3_truncate_on_insert=1 VALUES +INSERT INTO t_03363_csv VALUES (2022, 'USA', 1), (2022, 'Canada', 2), (2023, 'USA', 3), @@ -41,17 +40,16 @@ INSERT INTO t_03363_csv SETTINGS s3_truncate_on_insert=1 VALUES (2024, 'Germany', 7), (1999, 'Brazil', 8), (2100, 'Japan', 9), - (2024, '', 10), - (2024, 'CN', 11); + (2024, 'CN', 10); CREATE TABLE t_03363_csv_read (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_csv/**.csv', format = CSV); -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter SETTINGS use_hive_partitioning=1; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv_read order by counter; -- s3 table function -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT * FROM t_03363_parquet_read; -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter SETTINGS use_hive_partitioning=1; +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; +select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter; -- hive with partition id placeholder CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) From 12366093c78e5f25b550a1a6995c5d7fd5da13d0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 2 Apr 2025 11:21:31 -0300 Subject: [PATCH 30/42] updt --- src/Storages/PartitionStrategy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 6fc9006cf515..84d32b2b5c17 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -54,7 +54,7 @@ struct PartitionStrategyProvider 
ContextPtr context, const std::string & file_format, const std::string & partitioning_style = "", - bool omit_partition_columns_from_file = true); + bool write_partition_columns_into_files = false); }; struct StringfiedPartitionStrategy : PartitionStrategy From 3e356bcc92be5fd341ec6dccd59e89b809f74492 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 2 Apr 2025 12:43:33 -0300 Subject: [PATCH 31/42] test write_partition_columns_into_files --- .../0_stateless/03363_hive_style_partition_write.reference | 3 +++ .../0_stateless/03363_hive_style_partition_write.sql | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.reference b/tests/queries/0_stateless/03363_hive_style_partition_write.reference index 1daa2f76501b..bad18852bf33 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.reference +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.reference @@ -28,3 +28,6 @@ test/t_03363_function/year=2024/country=Germany/.parquet 7 test/t_03363_function/year=1999/country=Brazil/.parquet 8 test/t_03363_function/year=2100/country=Japan/.parquet 9 test/t_03363_function/year=2024/country=CN/.parquet 10 +1 +3 +USA 2022 1 diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index b71a8502b396..d9ebf0411606 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -51,6 +51,13 @@ select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter; +-- should output 1 because partition columns are not written down to the file by default when hive style is being used +select num_columns from s3(s3_conn, filename='t_03363_function/**.parquet', format=ParquetMetadata) limit 1; + +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function_write_down_partition_columns', format=Parquet, partitioning_style='hive', write_partition_columns_into_files=1) PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; +select num_columns from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=ParquetMetadata) limit 1; +select * from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=Parquet) order by counter limit 1 SETTINGS use_hive_partitioning=0; + -- hive with partition id placeholder CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partitioning_style='hive') From 0a8bacb03f3e53bd3b3c4a8b3c9805d8ce9157cf Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 2 Apr 2025 12:59:06 -0300 Subject: [PATCH 32/42] fix ub --- src/Storages/ObjectStorage/S3/Configuration.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 03ea400933f3..0803aa9e4089 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -249,11 +249,11 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { - size_t count = StorageURL::evalArgsAndCollectHeaders(args, 
headers_from_ast, context); - partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto"); write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "write_partition_columns_into_files").value_or(false); + size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context); + if (count == 0 || count > getMaxNumberOfArguments(with_structure)) throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Storage S3 requires 1 to {} arguments. All supported signatures:\n{}", From 043ce22ddce10b52ca30273ac7bbee395c2e0b1b Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 14:25:48 -0300 Subject: [PATCH 33/42] refactor extractNamedArgumentAndRemoveFromList --- .../ObjectStorage/S3/Configuration.cpp | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 0803aa9e4089..ac6e9c096ffd 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -55,10 +55,10 @@ namespace ErrorCodes namespace { - template - std::optional extractNamedArgument(ASTs & arguments, const std::string & argument_name, ASTs::iterator & argument_position) + + ASTs::const_iterator getValueForNamedArgument(const ASTs & arguments, const std::string & argument_name, Field & value) { - for (auto arg_it = arguments.begin(); arg_it != arguments.end(); ++arg_it) + for (const auto * arg_it = arguments.begin(); arg_it != arguments.end(); ++arg_it) { auto argument = *arg_it; const auto * type_ast_function = argument->as(); @@ -68,13 +68,13 @@ namespace continue; } - auto name = type_ast_function->arguments->children[0]->as(); + const auto * name = type_ast_function->arguments->children[0]->as(); if (name && name->name() == argument_name) { - auto value = type_ast_function->arguments->children[1]->as(); + const auto * ast_literal = type_ast_function->arguments->children[1]->as(); - if (!value) + if (!ast_literal) { throw Exception( ErrorCodes::BAD_ARGUMENTS, @@ -82,27 +82,29 @@ namespace name->name()); } - argument_position = arg_it; + value = ast_literal->value; - return value->value.safeGet(); + return arg_it; } } - return std::nullopt; + return arguments.end(); } template std::optional extractNamedArgumentAndRemoveFromList(ASTs & arguments, const std::string & argument_name) { - ASTs::iterator iterator; - auto named_arg_opt = extractNamedArgument(arguments, argument_name, iterator); + Field value; + const auto * p = getValueForNamedArgument(arguments, argument_name, value); - if (named_arg_opt) + if (p == arguments.end()) { - arguments.erase(iterator); + return std::nullopt; } - return named_arg_opt; + arguments.erase(p); + + return value.safeGet(); } } From 3f93ac09990c587ff5f7b143fe0d7fc52c66e8d6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 14:35:52 -0300 Subject: [PATCH 34/42] address a few pr comments --- .../ObjectStorage/S3/Configuration.cpp | 6 ++--- .../ObjectStorage/StorageObjectStorage.cpp | 27 +++++++++---------- .../ObjectStorage/StorageObjectStorage.h | 2 +- .../StorageObjectStorageCluster.cpp | 2 +- src/Storages/PartitionStrategy.cpp | 4 +-- src/Storages/PartitionStrategy.h | 23 +++++++++------- .../03363_hive_style_partition_write.sql | 12 ++++----- 7 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp 
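The two patches above ("fix ub" and the refactor) turn on the same rule: extractNamedArgumentAndRemoveFromList mutates the argument list, so it has to run before StorageURL::evalArgsAndCollectHeaders determines the positional-argument count, and the erase must go through a still-valid iterator. A standalone sketch of the extract-and-erase pattern, with "name=value" strings standing in for the AST function nodes:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

std::optional<std::string> extractNamedArgumentAndRemoveFromList(
    std::vector<std::string> & arguments, const std::string & argument_name)
{
    for (auto it = arguments.begin(); it != arguments.end(); ++it)
    {
        const auto eq = it->find('=');
        if (eq != std::string::npos && it->substr(0, eq) == argument_name)
        {
            std::string value = it->substr(eq + 1);
            arguments.erase(it);   /// erase through a valid iterator...
            return value;          /// ...and return the extracted value
        }
    }
    return std::nullopt;
}

int main()
{
    std::vector<std::string> args = {"s3_conn", "filename=t_03363_parquet", "partition_strategy=hive"};
    const auto strategy = extractNamedArgumentAndRemoveFromList(args, "partition_strategy").value_or("auto");
    /// Only now is args.size() a reliable positional-argument count.
    std::cout << strategy << ' ' << args.size() << '\n';   /// prints "hive 2"
}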
index ac6e9c096ffd..8d26ea0c5c1f 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -130,7 +130,7 @@ static const std::unordered_set optional_configuration_keys = "max_connections", "expiration_window_seconds", "no_sign_request", - "partitioning_style", + "partition_strategy", "write_partition_columns_into_files" }; @@ -229,7 +229,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect else url = S3::URI(collection.get("url"), settings[Setting::allow_archive_path_syntax]); - partitioning_style = collection.getOrDefault("partitioning_style", "auto"); + partition_strategy = collection.getOrDefault("partition_strategy", "auto"); write_partition_columns_into_files = collection.getOrDefault("write_partition_columns_into_files", false); auth_settings[S3AuthSetting::access_key_id] = collection.getOrDefault("access_key_id", ""); @@ -251,7 +251,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { - partitioning_style = extractNamedArgumentAndRemoveFromList(args, "partitioning_style").value_or("auto"); + partition_strategy = extractNamedArgumentAndRemoveFromList(args, "partition_strategy").value_or("auto"); write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "write_partition_columns_into_files").value_or(false); size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index f43a6b1b301c..5882beeb6915 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -55,11 +55,12 @@ namespace void sanityCheckPartitioningConfiguration( const ASTPtr & table_level_partition_by, const ASTPtr & query_partition_by, - const StorageObjectStorage::ConfigurationPtr & configuration) + const std::string & partition_strategy, + bool has_partition_wildcard) { if (!table_level_partition_by && !query_partition_by) { - // do we want to assert that `partitioning_style` is not set to something different style AND + // do we want to assert that `partition_strategy` is not set to something different style AND // wildcard is not set either? 
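Condensed, the rule the sanity check above enforces is that each partition strategy either requires or forbids the {_partition_id} wildcard in the path; a standalone sketch of the same table-driven check (std::invalid_argument stands in for the BAD_ARGUMENTS exception):

#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

void sanityCheckPartitioningConfiguration(const std::string & partition_strategy, bool has_partition_wildcard)
{
    static const std::unordered_map<std::string, bool> strategy_accepts_wildcard =
    {
        {"auto", true},    /// path must contain {_partition_id}
        {"hive", false},   /// path is derived from partition keys and values instead
    };

    const auto it = strategy_accepts_wildcard.find(partition_strategy);
    if (it == strategy_accepts_wildcard.end())
        throw std::invalid_argument("Unknown partition strategy '" + partition_strategy + "'");

    if (has_partition_wildcard && !it->second)
        throw std::invalid_argument("The {_partition_id} wildcard can't be used with strategy '" + partition_strategy + "'");

    if (!has_partition_wildcard && it->second)
        throw std::invalid_argument("Strategy '" + partition_strategy + "' requires the {_partition_id} wildcard");
}

int main()
{
    try
    {
        sanityCheckPartitioningConfiguration("hive", /*has_partition_wildcard=*/true);
    }
    catch (const std::invalid_argument & e)
    {
        std::cout << e.what() << '\n';   /// wildcard + hive is rejected, as in the test expecting BAD_ARGUMENTS
    }
}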
return; } @@ -70,32 +71,30 @@ namespace throw Exception(ErrorCodes::LOGICAL_ERROR, "Table level partition expression and query level partition expression can't be specified together, this is a bug"); } - static std::unordered_map partitioning_style_to_wildcard_acceptance = + static std::unordered_map partition_strategy_to_wildcard_acceptance = { {"auto", true}, {"hive", false} }; - if (!partitioning_style_to_wildcard_acceptance.contains(configuration->partitioning_style)) + if (!partition_strategy_to_wildcard_acceptance.contains(partition_strategy)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partitioning style '{}'", - configuration->partitioning_style); + partition_strategy); } - bool has_partition_wildcard = configuration->withPartitionWildcard(); - - if (has_partition_wildcard && !partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style)) + if (has_partition_wildcard && !partition_strategy_to_wildcard_acceptance.at(partition_strategy)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} wildcard can't be used with {} partitioning style", - PartitionedSink::PARTITION_ID_WILDCARD, configuration->partitioning_style); + PartitionedSink::PARTITION_ID_WILDCARD, partition_strategy); } - if (!has_partition_wildcard && partitioning_style_to_wildcard_acceptance.at(configuration->partitioning_style)) + if (!has_partition_wildcard && partition_strategy_to_wildcard_acceptance.at(partition_strategy)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partitioning style '{}' requires {} wildcard", - configuration->partitioning_style, + partition_strategy, PartitionedSink::PARTITION_ID_WILDCARD); } } @@ -111,7 +110,7 @@ namespace } } - sanityCheckPartitioningConfiguration(table_level_partition_by, query_partition_by, configuration); + sanityCheckPartitioningConfiguration(table_level_partition_by, query_partition_by, configuration->partition_strategy, configuration->withPartitionWildcard()); if (table_level_partition_by) { @@ -174,7 +173,7 @@ StorageObjectStorage::StorageObjectStorage( , distributed_processing(distributed_processing_) , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { - sanityCheckPartitioningConfiguration(partition_by, nullptr, configuration); + sanityCheckPartitioningConfiguration(partition_by, nullptr, configuration->partition_strategy, configuration->withPartitionWildcard()); try { @@ -462,7 +461,7 @@ SinkToStoragePtr StorageObjectStorage::write( sample_block, local_context, configuration->format, - configuration->partitioning_style, + configuration->partition_strategy, configuration->write_partition_columns_into_files); return std::make_shared( partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index cfb71b112c91..48851917b380 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -244,7 +244,7 @@ class StorageObjectStorage::Configuration String format = "auto"; String compression_method = "auto"; String structure = "auto"; - std::string partitioning_style = "auto"; + std::string partition_strategy = "auto"; /* * Only supported by hive partitioning style for now */ diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 3a2bfe8a990b..2d281f8be60f 100644 --- 
a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -92,8 +92,8 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning]) sample_path = getPathSample(metadata, context_); - setInMemoryMetadata(metadata); setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context_, sample_path)); + setInMemoryMetadata(metadata); pure_storage = std::make_shared( configuration, diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 9736728142e2..fe42c058125a 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -103,10 +103,10 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style, + const std::string & partition_strategy, bool write_partition_columns_into_files) { - if (partitioning_style == "hive") + if (partition_strategy == "hive") { if (file_format.empty()) { diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 84d32b2b5c17..2dc8985496a6 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -23,18 +23,23 @@ struct PartitionStrategy virtual PartitionExpressionActionsAndColumnName getExpression() = 0; virtual std::string getPath(const std::string & prefix, const std::string & partition_key) = 0; + /* + * Hive style partition strategy will put partition column keys and values in the filepath itself + * So we need to remove those columns from the chunk. + * + * Default behavior is not to remove, therefore the base class simply returns the same chunk + * */ virtual Chunk getChunkWithoutPartitionColumnsIfNeeded(const Chunk & chunk) { - Chunk result; - - for (const auto & column : chunk.getColumns()) - { - result.addColumn(column); - } - - return result; + return chunk.clone(); } + /* + * Hive style partition strategy will put partition column keys and values in the filepath itself + * So we need to remove those columns from the block. 
+ * + * Default behavior is not to remove, therefore the base class simply returns the same block + * */ virtual Block getBlockWithoutPartitionColumnsIfNeeded() { return sample_block; @@ -53,7 +58,7 @@ struct PartitionStrategyProvider const Block & sample_block, ContextPtr context, const std::string & file_format, - const std::string & partitioning_style = "", + const std::string & partition_strategy = "", bool write_partition_columns_into_files = false); }; diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index d9ebf0411606..387af53819fd 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -3,7 +3,7 @@ DROP TABLE IF EXISTS t_03363_parquet, t_03363_parquet_read, t_03363_csv, t_03363_csv_read; CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partitioning_style='hive') +ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partition_strategy='hive') PARTITION BY (year, country); INSERT INTO t_03363_parquet VALUES @@ -27,7 +27,7 @@ select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.csv') AS _path, counter from t_03363_csv_read order by counter; -- s3 table function -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partitioning_style='hive') PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partition_strategy='hive') PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter; -- should output 1 because partition columns are not written down to the file by default when hive style is being used select num_columns from s3(s3_conn, filename='t_03363_function/**.parquet', format=ParquetMetadata) limit 1; -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function_write_down_partition_columns', format=Parquet, partitioning_style='hive', write_partition_columns_into_files=1) PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; +INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function_write_down_partition_columns', format=Parquet, partition_strategy='hive', write_partition_columns_into_files=1) PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet_read; select num_columns from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=ParquetMetadata) limit 1; select * from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=Parquet) order by counter limit 1 SETTINGS use_hive_partitioning=0; -- hive with partition id placeholder CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partitioning_style='hive') +ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partition_strategy='hive') PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; -- unknown partitioning style CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename 
= 't_03363_parquet', format = Parquet, partitioning_style='abc') +ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partition_strategy='abc') PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; -- auto partitioning style without partition_id wildcard From db9bfd0ec804157b9c5e8303bb72d6bb40958207 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 14:38:52 -0300 Subject: [PATCH 35/42] address some more comments --- .../ObjectStorage/StorageObjectStorage.cpp | 4 ++-- src/Storages/PartitionStrategy.cpp | 20 +++++++++---------- src/Storages/PartitionStrategy.h | 6 +++--- src/Storages/StorageFile.cpp | 2 +- src/Storages/StorageURL.cpp | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 5882beeb6915..4d2c6e401d90 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -102,7 +102,7 @@ namespace ASTPtr getPartitionByAst(const ASTPtr & table_level_partition_by, const ASTPtr & query, const StorageObjectStorage::ConfigurationPtr & configuration) { ASTPtr query_partition_by = nullptr; - if (const auto insert_query = std::dynamic_pointer_cast(query)) + if (const auto insert_query = query->as()) { if (insert_query->partition_by) { @@ -456,7 +456,7 @@ SinkToStoragePtr StorageObjectStorage::write( if (ASTPtr partition_by_ast = getPartitionByAst(partition_by, query, configuration)) { - auto partition_strategy = PartitionStrategyProvider::get( + auto partition_strategy = PartitionStrategyFactory::get( partition_by_ast, sample_block, local_context, diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index fe42c058125a..81add504fb47 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -99,12 +99,12 @@ PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_ } -std::shared_ptr PartitionStrategyProvider::get(ASTPtr partition_by, - const Block & sample_block, - ContextPtr context, - const std::string & file_format, - const std::string & partition_strategy, - bool write_partition_columns_into_files) +std::shared_ptr PartitionStrategyFactory::get(ASTPtr partition_by, + const Block & sample_block, + ContextPtr context, + const std::string & file_format, + const std::string & partition_strategy, + bool write_partition_columns_into_files) { if (partition_strategy == "hive") { @@ -121,15 +121,15 @@ std::shared_ptr PartitionStrategyProvider::get(ASTPtr partiti write_partition_columns_into_files); } - return std::make_shared(partition_by, sample_block, context); + return std::make_shared(partition_by, sample_block, context); } -StringfiedPartitionStrategy::StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) +StringifiedPartitionStrategy::StringifiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) : PartitionStrategy(partition_by_, sample_block_, context_) { } -StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName StringfiedPartitionStrategy::getExpression() +StringifiedPartitionStrategy::PartitionExpressionActionsAndColumnName StringifiedPartitionStrategy::getExpression() { PartitionExpressionActionsAndColumnName actions_with_column_name; @@ -142,7 +142,7 @@ StringfiedPartitionStrategy::PartitionExpressionActionsAndColumnName StringfiedP return actions_with_column_name; } 
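The two getPath flavors renamed above produce different layouts; a sketch of both shapes, where the generated file base name ("file") and the exact wildcard replacement are simplifications, not the real implementation:

#include <cctype>
#include <iostream>
#include <string>

std::string toLowerCopy(std::string s)
{
    for (auto & c : s)
        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    return s;
}

/// Stringified strategy: the table path carries {_partition_id}, replaced by the key.
std::string stringifiedPath(std::string path_with_wildcard, const std::string & partition_key)
{
    const std::string wildcard = "{_partition_id}";
    if (const auto pos = path_with_wildcard.find(wildcard); pos != std::string::npos)
        path_with_wildcard.replace(pos, wildcard.size(), partition_key);
    return path_with_wildcard;
}

/// Hive strategy: key=value directories are appended under the prefix, then a
/// generated file name with the lowercased format as its extension.
std::string hivePath(const std::string & prefix, const std::string & partition_key, const std::string & file_format)
{
    return prefix + "/" + partition_key + "/file." + toLowerCopy(file_format);
}

int main()
{
    std::cout << stringifiedPath("t_03363_parquet/{_partition_id}", "2022") << '\n';
    std::cout << hivePath("t_03363_parquet", "year=2022/country=USA", "Parquet") << '\n';
    /// t_03363_parquet/2022
    /// t_03363_parquet/year=2022/country=USA/file.parquet
}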
-std::string StringfiedPartitionStrategy::getPath( +std::string StringifiedPartitionStrategy::getPath( const std::string & prefix, const std::string & partition_key) { diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 2dc8985496a6..3cfc10f5b6c9 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -51,7 +51,7 @@ struct PartitionStrategy ContextPtr context; }; -struct PartitionStrategyProvider +struct PartitionStrategyFactory { static std::shared_ptr get( ASTPtr partition_by, @@ -62,9 +62,9 @@ struct PartitionStrategyProvider bool write_partition_columns_into_files = false); }; -struct StringfiedPartitionStrategy : PartitionStrategy +struct StringifiedPartitionStrategy : PartitionStrategy { - StringfiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); + StringifiedPartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_); PartitionExpressionActionsAndColumnName getExpression() override; std::string getPath(const std::string & prefix, const std::string & partition_key) override; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 84311fa9e27e..c1943bbbbc5e 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1984,7 +1984,7 @@ SinkToStoragePtr StorageFile::write( if (path_for_partitioned_write.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty path for partitioned write"); - auto partition_strategy = PartitionStrategyProvider::get(insert_query->partition_by, metadata_snapshot->getSampleBlock(), context, format_name); + auto partition_strategy = PartitionStrategyFactory::get(insert_query->partition_by, metadata_snapshot->getSampleBlock(), context, format_name); return std::make_shared( partition_strategy, diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 968d521cf376..497b2557ca4c 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -1348,7 +1348,7 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad if (is_partitioned_implementation) { - auto partition_strategy = PartitionStrategyProvider::get(partition_by_ast, metadata_snapshot->getSampleBlock(), context, format_name); + auto partition_strategy = PartitionStrategyFactory::get(partition_by_ast, metadata_snapshot->getSampleBlock(), context, format_name); return std::make_shared( partition_strategy, From 0a31b756e151bb547b49ae840d79b929432e1d0f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 14:42:39 -0300 Subject: [PATCH 36/42] simplify extractPartitionRequiredColumns --- src/Storages/PartitionStrategy.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 81add504fb47..fd9870ba27a9 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -25,12 +25,8 @@ namespace { Names extractPartitionRequiredColumns(ASTPtr & partition_by, const Block & sample_block, ContextPtr context) { - auto pby_clone = partition_by->clone(); - // sample_block columns might be out of order, and we need to respect the partition by expression order - auto key_description = KeyDescription::getKeyFromAST(pby_clone, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context); - auto syntax_result = TreeRewriter(context).analyze(pby_clone, key_description.sample_block.getNamesAndTypesList()); - auto exp_analyzer = 
ExpressionAnalyzer(pby_clone, syntax_result, context).getActions(false); - return exp_analyzer->getRequiredColumns(); + auto key_description = KeyDescription::getKeyFromAST(partition_by, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context); + return key_description.sample_block.getNames(); } HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName buildExpressionHive( From 1dc7f9de1d18099371539e48a92916f3dae32261 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 14:48:52 -0300 Subject: [PATCH 37/42] rename setting --- src/Storages/ObjectStorage/S3/Configuration.cpp | 6 +++--- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 2 +- src/Storages/ObjectStorage/StorageObjectStorage.h | 2 +- src/Storages/PartitionStrategy.cpp | 12 ++++++------ src/Storages/PartitionStrategy.h | 6 +++--- src/Storages/PartitionedSink.cpp | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 8d26ea0c5c1f..92d7152263b9 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -131,7 +131,7 @@ static const std::unordered_set optional_configuration_keys = "expiration_window_seconds", "no_sign_request", "partition_strategy", - "write_partition_columns_into_files" + "hive_partition_strategy_write_partition_columns_into_files" }; String StorageS3Configuration::getDataSourceDescription() const @@ -230,7 +230,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect url = S3::URI(collection.get("url"), settings[Setting::allow_archive_path_syntax]); partition_strategy = collection.getOrDefault("partition_strategy", "auto"); - write_partition_columns_into_files = collection.getOrDefault("write_partition_columns_into_files", false); + hive_partition_strategy_write_partition_columns_into_files = collection.getOrDefault("hive_partition_strategy_write_partition_columns_into_files", false); auth_settings[S3AuthSetting::access_key_id] = collection.getOrDefault("access_key_id", ""); auth_settings[S3AuthSetting::secret_access_key] = collection.getOrDefault("secret_access_key", ""); @@ -252,7 +252,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { partition_strategy = extractNamedArgumentAndRemoveFromList(args, "partition_strategy").value_or("auto"); - write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "write_partition_columns_into_files").value_or(false); + hive_partition_strategy_write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "hive_partition_strategy_write_partition_columns_into_files").value_or(false); size_t count = StorageURL::evalArgsAndCollectHeaders(args, headers_from_ast, context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 4d2c6e401d90..39d686f517b9 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -462,7 +462,7 @@ SinkToStoragePtr StorageObjectStorage::write( local_context, configuration->format, configuration->partition_strategy, - configuration->write_partition_columns_into_files); + configuration->hive_partition_strategy_write_partition_columns_into_files); return std::make_shared( partition_strategy, 
object_storage, configuration, format_settings, sample_block, local_context); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 48851917b380..32cf65590817 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -248,7 +248,7 @@ class StorageObjectStorage::Configuration /* * Only supported by hive partitioning style for now */ - bool write_partition_columns_into_files = false; + bool hive_partition_strategy_write_partition_columns_into_files = false; virtual void update(ObjectStoragePtr object_storage, ContextPtr local_context); void updateIfRequired(ObjectStoragePtr object_storage, ContextPtr local_context); diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index fd9870ba27a9..807ad14d1722 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -100,7 +100,7 @@ std::shared_ptr PartitionStrategyFactory::get(ASTPtr partitio ContextPtr context, const std::string & file_format, const std::string & partition_strategy, - bool write_partition_columns_into_files) + bool hive_partition_strategy_write_partition_columns_into_files) { if (partition_strategy == "hive") { @@ -114,7 +114,7 @@ std::shared_ptr PartitionStrategyFactory::get(ASTPtr partitio sample_block, context, file_format, - write_partition_columns_into_files); + hive_partition_strategy_write_partition_columns_into_files); } return std::make_shared(partition_by, sample_block, context); @@ -150,10 +150,10 @@ HiveStylePartitionStrategy::HiveStylePartitionStrategy( const Block & sample_block_, ContextPtr context_, const std::string & file_format_, - bool write_partition_columns_into_files_) + bool hive_partition_strategy_write_partition_columns_into_files_) : PartitionStrategy(partition_by_, sample_block_, context_), file_format(file_format_), - write_partition_columns_into_files(write_partition_columns_into_files_), + hive_partition_strategy_write_partition_columns_into_files(hive_partition_strategy_write_partition_columns_into_files_), partition_expression_required_columns(extractPartitionRequiredColumns(partition_by, sample_block, context)), partition_expression_required_columns_set(partition_expression_required_columns.begin(), partition_expression_required_columns.end()) { @@ -191,7 +191,7 @@ Chunk HiveStylePartitionStrategy::getChunkWithoutPartitionColumnsIfNeeded(const { Chunk result; - if (write_partition_columns_into_files) + if (hive_partition_strategy_write_partition_columns_into_files) { for (const auto & column : chunk.getColumns()) { @@ -216,7 +216,7 @@ Chunk HiveStylePartitionStrategy::getChunkWithoutPartitionColumnsIfNeeded(const Block HiveStylePartitionStrategy::getBlockWithoutPartitionColumnsIfNeeded() { - if (write_partition_columns_into_files) + if (hive_partition_strategy_write_partition_columns_into_files) { return sample_block; } diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 3cfc10f5b6c9..85cdf2017815 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -59,7 +59,7 @@ struct PartitionStrategyFactory ContextPtr context, const std::string & file_format, const std::string & partition_strategy = "", - bool write_partition_columns_into_files = false); + bool hive_partition_strategy_write_partition_columns_into_files = false); }; struct StringifiedPartitionStrategy : PartitionStrategy @@ -77,7 +77,7 @@ struct HiveStylePartitionStrategy : 
PartitionStrategy const Block & sample_block_, ContextPtr context_, const std::string & file_format_, - bool write_partition_columns_into_files_); + bool hive_partition_strategy_write_partition_columns_into_files_); PartitionExpressionActionsAndColumnName getExpression() override; std::string getPath(const std::string & prefix, const std::string & partition_key) override; @@ -86,7 +86,7 @@ struct HiveStylePartitionStrategy : PartitionStrategy private: std::string file_format; - bool write_partition_columns_into_files; + bool hive_partition_strategy_write_partition_columns_into_files; Names partition_expression_required_columns; std::unordered_set partition_expression_required_columns_set; PartitionExpressionActionsAndColumnName actions_with_column_name; diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index ec99748f4475..965d389fafc5 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -59,9 +59,9 @@ void PartitionedSink::consume(Chunk & input_chunk) const auto * partition_by_result_column = block_with_partition_by_expr.getByName(partition_by_column_name).column.get(); /* - * `write_partition_columns_into_files` + * `hive_partition_strategy_write_partition_columns_into_files` */ - const auto chunk = getPartitionStrategy()->getChunkWithoutPartitionColumnsIfNeeded(input_chunk); + const auto chunk = partition_strategy->getChunkWithoutPartitionColumnsIfNeeded(input_chunk); const auto & columns_to_consume = chunk.getColumns(); size_t chunk_rows = chunk.getNumRows(); From baefb50a5798a1b18309cda68345cb6195325e29 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 14:50:53 -0300 Subject: [PATCH 38/42] revert some small changes on storagefile --- src/Storages/StorageFile.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index c1943bbbbc5e..d3d41983f60a 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1976,8 +1976,9 @@ SinkToStoragePtr StorageFile::write( if (context->getSettingsRef()[Setting::engine_file_truncate_on_insert]) flags |= O_TRUNC; + bool has_wildcards = path_for_partitioned_write.contains(PartitionedSink::PARTITION_ID_WILDCARD); const auto * insert_query = dynamic_cast(query.get()); - bool is_partitioned_implementation = insert_query && insert_query->partition_by; + bool is_partitioned_implementation = insert_query && insert_query->partition_by && has_wildcards; if (is_partitioned_implementation) { From 1ac2a9c7c70963510d6771410a78caa8d4679159 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 15:08:06 -0300 Subject: [PATCH 39/42] some intermediate docs --- src/Storages/ObjectStorage/S3/Configuration.cpp | 5 +++++ src/Storages/ObjectStorage/StorageObjectStorage.cpp | 3 +++ 2 files changed, 8 insertions(+) diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 92d7152263b9..930890b8a0c4 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -251,6 +251,11 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_structure) { + /* + * Calls to `extractNamedArgumentAndRemoveFromList` need to happen before count is determined: + * `size_t count = StorageURL::evalArgsAndCollectHeaders` + * This is because extractNamedArgumentAndRemoveFromList alters the 
list of arguments + * */ partition_strategy = extractNamedArgumentAndRemoveFromList(args, "partition_strategy").value_or("auto"); hive_partition_strategy_write_partition_columns_into_files = extractNamedArgumentAndRemoveFromList(args, "hive_partition_strategy_write_partition_columns_into_files").value_or(false); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 39d686f517b9..787b49790033 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -173,6 +173,9 @@ StorageObjectStorage::StorageObjectStorage( , distributed_processing(distributed_processing_) , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { + // this function needs to be called in the constructor so we can validate table definition at creation time + // it is also being called during `write` because this code is also used by table functions and, unfortunately, + // at creation time the partition by is not available (maybe this needs to be fixed) sanityCheckPartitioningConfiguration(partition_by, nullptr, configuration->partition_strategy, configuration->withPartitionWildcard()); try From 7e8b4d792d83064027f5d01122ab92e8090671a8 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 3 Apr 2025 16:18:43 -0300 Subject: [PATCH 40/42] add missing test file updt --- tests/queries/0_stateless/03363_hive_style_partition_write.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03363_hive_style_partition_write.sql b/tests/queries/0_stateless/03363_hive_style_partition_write.sql index 387af53819fd..9211d8b1018c 100644 --- a/tests/queries/0_stateless/03363_hive_style_partition_write.sql +++ b/tests/queries/0_stateless/03363_hive_style_partition_write.sql @@ -54,7 +54,7 @@ select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/ Date: Thu, 3 Apr 2025 16:18:54 -0300 Subject: [PATCH 41/42] simplify sanity check of partition config --- src/Interpreters/InterpreterInsertQuery.cpp | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 102 +++--------------- .../ObjectStorage/StorageObjectStorage.h | 2 + src/Storages/PartitionStrategy.cpp | 35 ++++++ src/Storages/PartitionStrategy.h | 1 + src/Storages/StorageFile.cpp | 2 +- src/Storages/StorageURL.cpp | 2 +- src/TableFunctions/ITableFunction.cpp | 12 +-- src/TableFunctions/ITableFunction.h | 9 +- .../TableFunctionObjectStorage.cpp | 36 +++++++ .../TableFunctionObjectStorage.h | 7 ++ 11 files changed, 115 insertions(+), 95 deletions(-) diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 929f5fd2d6cc..12adea5e83c0 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -136,7 +136,7 @@ StoragePtr InterpreterInsertQuery::getTable(ASTInsertQuery & query) } return table_function_ptr->execute(query.table_function, current_context, table_function_ptr->getName(), - /* cached_columns */ {}, /* use_global_context */ false, /* is_insert_query */true); + /* cached_columns */ {}, /* use_global_context */ false, &query); } if (query.table_id) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 787b49790033..e28831344f27 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -50,77 +50,6 @@ 
namespace StorageObjectStorageSetting extern const StorageObjectStorageSettingsBool allow_dynamic_metadata_for_data_lakes; } -namespace -{ - void sanityCheckPartitioningConfiguration( - const ASTPtr & table_level_partition_by, - const ASTPtr & query_partition_by, - const std::string & partition_strategy, - bool has_partition_wildcard) - { - if (!table_level_partition_by && !query_partition_by) - { - // do we want to assert that `partition_strategy` is not set to something different style AND - // wildcard is not set either? - return; - } - - if (table_level_partition_by && query_partition_by) - { - // should never happen because parser should not allow that - throw Exception(ErrorCodes::LOGICAL_ERROR, "Table level partition expression and query level partition expression can't be specified together, this is a bug"); - } - - static std::unordered_map partition_strategy_to_wildcard_acceptance = - { - {"auto", true}, - {"hive", false} - }; - - if (!partition_strategy_to_wildcard_acceptance.contains(partition_strategy)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Unknown partitioning style '{}'", - partition_strategy); - } - - if (has_partition_wildcard && !partition_strategy_to_wildcard_acceptance.at(partition_strategy)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} wildcard can't be used with {} partitioning style", - PartitionedSink::PARTITION_ID_WILDCARD, partition_strategy); - } - - if (!has_partition_wildcard && partition_strategy_to_wildcard_acceptance.at(partition_strategy)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Partitioning style '{}' requires {} wildcard", - partition_strategy, - PartitionedSink::PARTITION_ID_WILDCARD); - } - } - - ASTPtr getPartitionByAst(const ASTPtr & table_level_partition_by, const ASTPtr & query, const StorageObjectStorage::ConfigurationPtr & configuration) - { - ASTPtr query_partition_by = nullptr; - if (const auto insert_query = query->as()) - { - if (insert_query->partition_by) - { - query_partition_by = insert_query->partition_by; - } - } - - sanityCheckPartitioningConfiguration(table_level_partition_by, query_partition_by, configuration->partition_strategy, configuration->withPartitionWildcard()); - - if (table_level_partition_by) - { - return table_level_partition_by; - } - - return query_partition_by; - } -} - String StorageObjectStorage::getPathSample(ContextPtr context) { auto query_settings = configuration->getQuerySettings(context); @@ -173,11 +102,6 @@ StorageObjectStorage::StorageObjectStorage( , distributed_processing(distributed_processing_) , log(getLogger(fmt::format("Storage{}({})", configuration->getEngineName(), table_id_.getFullTableName()))) { - // this function needs to be called in the constructor so we can validate table definition at creation time - // it is also being called during `write` because this code is also used by table functions and, unfortunately, - // at creation time the partition by is not available (maybe this needs to be fixed) - sanityCheckPartitioningConfiguration(partition_by, nullptr, configuration->partition_strategy, configuration->withPartitionWildcard()); - try { if (!lazy_init) @@ -216,6 +140,21 @@ StorageObjectStorage::StorageObjectStorage( setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context, sample_path, format_settings)); setInMemoryMetadata(metadata); + + // perhaps it is worth adding some extra safeguards for cases like + // create table s3_table engine=s3('{_partition_id}'); -- partition id wildcard set, but no partition expression + // 
create table s3_table engine=s3(partition_strategy='hive'); -- partition strategy set, but no partition expression + if (partition_by) + { + partition_strategy = PartitionStrategyFactory::get( + partition_by, + metadata.getSampleBlock(), + context, + configuration->format, + configuration->withPartitionWildcard(), + configuration->partition_strategy, + configuration->hive_partition_strategy_write_partition_columns_into_files); + } } String StorageObjectStorage::getName() const @@ -434,7 +373,7 @@ void StorageObjectStorage::read( } SinkToStoragePtr StorageObjectStorage::write( - const ASTPtr & query, + const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /* async_insert */) @@ -457,15 +396,8 @@ SinkToStoragePtr StorageObjectStorage::write( configuration->getPath()); } - if (ASTPtr partition_by_ast = getPartitionByAst(partition_by, query, configuration)) + if (partition_strategy) { - auto partition_strategy = PartitionStrategyFactory::get( - partition_by_ast, - sample_block, - local_context, - configuration->format, - configuration->partition_strategy, - configuration->hive_partition_strategy_write_partition_columns_into_files); return std::make_shared( partition_strategy, object_storage, configuration, format_settings, sample_block, local_context); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 32cf65590817..41ecec9b1083 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -18,6 +18,7 @@ namespace DB class ReadBufferIterator; class SchemaCache; class NamedCollection; +struct PartitionStrategy; namespace ErrorCodes { @@ -153,6 +154,7 @@ class StorageObjectStorage : public IStorage const std::optional format_settings; const ASTPtr partition_by; const bool distributed_processing; + std::shared_ptr partition_strategy; LoggerPtr log; }; diff --git a/src/Storages/PartitionStrategy.cpp b/src/Storages/PartitionStrategy.cpp index 807ad14d1722..94f8795f158d 100644 --- a/src/Storages/PartitionStrategy.cpp +++ b/src/Storages/PartitionStrategy.cpp @@ -87,6 +87,38 @@ namespace return result; } + + void sanityCheckPartitioningConfiguration( + const std::string & partition_strategy, + bool has_partition_wildcard) + { + static std::unordered_map partition_strategy_to_wildcard_acceptance = + { + {"auto", true}, + {"hive", false} + }; + + if (!partition_strategy_to_wildcard_acceptance.contains(partition_strategy)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unknown partitioning style '{}'", + partition_strategy); + } + + if (has_partition_wildcard && !partition_strategy_to_wildcard_acceptance.at(partition_strategy)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The {} wildcard can't be used with {} partitioning style", + PartitionedSink::PARTITION_ID_WILDCARD, partition_strategy); + } + + if (!has_partition_wildcard && partition_strategy_to_wildcard_acceptance.at(partition_strategy)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Partitioning style '{}' requires {} wildcard", + partition_strategy, + PartitionedSink::PARTITION_ID_WILDCARD); + } + } } PartitionStrategy::PartitionStrategy(ASTPtr partition_by_, const Block & sample_block_, ContextPtr context_) @@ -99,9 +131,12 @@ std::shared_ptr PartitionStrategyFactory::get(ASTPtr partitio const Block & sample_block, ContextPtr context, const std::string & file_format, + bool has_partition_wildcard, const std::string & partition_strategy, bool 
hive_partition_strategy_write_partition_columns_into_files) { + sanityCheckPartitioningConfiguration(partition_strategy, has_partition_wildcard); + if (partition_strategy == "hive") { if (file_format.empty()) diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index 85cdf2017815..e89127fecde7 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -58,6 +58,7 @@ struct PartitionStrategyFactory const Block & sample_block, ContextPtr context, const std::string & file_format, + bool has_partition_wildcard, const std::string & partition_strategy = "", bool hive_partition_strategy_write_partition_columns_into_files = false); }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index d3d41983f60a..498b44e08a20 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1985,7 +1985,7 @@ SinkToStoragePtr StorageFile::write( if (path_for_partitioned_write.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty path for partitioned write"); - auto partition_strategy = PartitionStrategyFactory::get(insert_query->partition_by, metadata_snapshot->getSampleBlock(), context, format_name); + auto partition_strategy = PartitionStrategyFactory::get(insert_query->partition_by, metadata_snapshot->getSampleBlock(), context, format_name, has_wildcards); return std::make_shared( partition_strategy, diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 497b2557ca4c..86e47d28eecd 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -1348,7 +1348,7 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad if (is_partitioned_implementation) { - auto partition_strategy = PartitionStrategyFactory::get(partition_by_ast, metadata_snapshot->getSampleBlock(), context, format_name); + auto partition_strategy = PartitionStrategyFactory::get(partition_by_ast, metadata_snapshot->getSampleBlock(), context, format_name, has_wildcards); return std::make_shared( partition_strategy, diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index 916ff7ec0222..342cfa8a40e6 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -20,28 +20,28 @@ AccessType ITableFunction::getSourceAccessType() const } StoragePtr ITableFunction::execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, - ColumnsDescription cached_columns, bool use_global_context, bool is_insert_query) const + ColumnsDescription cached_columns, bool use_global_context, ASTInsertQuery * insert_query) const { ProfileEvents::increment(ProfileEvents::TableFunctionExecute); AccessFlags required_access = getSourceAccessType(); auto table_function_properties = TableFunctionFactory::instance().tryGetProperties(getName()); - if (is_insert_query || !(table_function_properties && table_function_properties->allow_readonly)) + if (insert_query || !(table_function_properties && table_function_properties->allow_readonly)) required_access |= AccessType::CREATE_TEMPORARY_TABLE; context->checkAccess(required_access); auto context_to_use = use_global_context ? 
context->getGlobalContext() : context; if (cached_columns.empty()) - return executeImpl(ast_function, context, table_name, std::move(cached_columns), is_insert_query); + return executeImpl(ast_function, context, table_name, std::move(cached_columns), insert_query); - if (hasStaticStructure() && cached_columns == getActualTableStructure(context, is_insert_query)) - return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns), is_insert_query); + if (hasStaticStructure() && cached_columns == getActualTableStructure(context, insert_query)) + return executeImpl(ast_function, context_to_use, table_name, std::move(cached_columns), insert_query); auto this_table_function = shared_from_this(); auto get_storage = [=]() -> StoragePtr { - return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns, is_insert_query); + return this_table_function->executeImpl(ast_function, context_to_use, table_name, cached_columns, insert_query); }; /// It will request actual table structure and create underlying storage lazily diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index ed7f80e5df97..d9ca9f1c9ea3 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -15,6 +15,7 @@ namespace DB { class Context; +class ASTInsertQuery; /** Interface for table functions. * @@ -80,7 +81,7 @@ class ITableFunction : public std::enable_shared_from_this /// Create storage according to the query. StoragePtr - execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false, bool is_insert = false) const; + execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false, ASTInsertQuery * insert_query = nullptr) const; virtual ~ITableFunction() = default; @@ -91,6 +92,12 @@ class ITableFunction : public std::enable_shared_from_this virtual StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const = 0; + virtual StoragePtr executeImpl( + const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, ASTInsertQuery * insert_query = nullptr) const + { + return executeImpl(ast_function, context, table_name, cached_columns, insert_query != nullptr); + } + virtual const char * getStorageTypeName() const = 0; }; diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 7207c500b96a..f22681fbca6d 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -12,6 +12,8 @@ #include +#include + #include #include #include @@ -126,6 +128,40 @@ StoragePtr TableFunctionObjectStorage::executeImpl( return storage; } +template +StoragePtr TableFunctionObjectStorage::executeImpl( + const ASTPtr & /* ast_function */, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + ASTInsertQuery * insert_query) const +{ + ColumnsDescription columns; + chassert(configuration); + if (configuration->structure != "auto") + columns = parseColumnsListFromString(configuration->structure, context); + else if (!structure_hint.empty()) + columns = structure_hint; + else if (!cached_columns.empty()) + columns = 
diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp
index 7207c500b96a..f22681fbca6d 100644
--- a/src/TableFunctions/TableFunctionObjectStorage.cpp
+++ b/src/TableFunctions/TableFunctionObjectStorage.cpp
@@ -12,6 +12,8 @@
 #include
 
+#include <Parsers/ASTInsertQuery.h>
+
 #include
 #include
 #include
@@ -126,6 +128,40 @@ StoragePtr TableFunctionObjectStorage<Definition, Configuration>::executeImpl(
     return storage;
 }
 
+template <typename Definition, typename Configuration>
+StoragePtr TableFunctionObjectStorage<Definition, Configuration>::executeImpl(
+    const ASTPtr & /* ast_function */,
+    ContextPtr context,
+    const std::string & table_name,
+    ColumnsDescription cached_columns,
+    ASTInsertQuery * insert_query) const
+{
+    ColumnsDescription columns;
+    chassert(configuration);
+    if (configuration->structure != "auto")
+        columns = parseColumnsListFromString(configuration->structure, context);
+    else if (!structure_hint.empty())
+        columns = structure_hint;
+    else if (!cached_columns.empty())
+        columns = cached_columns;
+
+    StoragePtr storage = std::make_shared<StorageObjectStorage>(
+        configuration,
+        getObjectStorage(context, !insert_query),
+        context,
+        StorageID(getDatabaseName(), table_name),
+        columns,
+        ConstraintsDescription{},
+        String{},
+        /* format_settings */ std::nullopt,
+        /* mode */ LoadingStrictnessLevel::CREATE,
+        /* distributed_processing */ false,
+        insert_query ? insert_query->partition_by : nullptr);
+
+    storage->startup();
+    return storage;
+}
+
 void registerTableFunctionObjectStorage(TableFunctionFactory & factory)
 {
     UNUSED(factory);
diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h
index f77b95e93c07..c9222ff01d62 100644
--- a/src/TableFunctions/TableFunctionObjectStorage.h
+++ b/src/TableFunctions/TableFunctionObjectStorage.h
@@ -152,6 +152,13 @@ class TableFunctionObjectStorage : public ITableFunction
         ColumnsDescription cached_columns,
         bool is_insert_query) const override;
 
+    StoragePtr executeImpl(
+        const ASTPtr & ast_function,
+        ContextPtr context,
+        const std::string & table_name,
+        ColumnsDescription cached_columns,
+        ASTInsertQuery * insert_query) const override;
+
     const char * getStorageTypeName() const override { return Definition::storage_type_name; }
 
     ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override;

From be49e038b8fbf83bb8a525931ba639d034980d8d Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Fri, 4 Apr 2025 12:45:32 -0300
Subject: [PATCH 42/42] default partition strategy in factory

---
 src/Core/Settings.cpp                             | 2 +-
 src/Storages/ObjectStorage/StorageObjectStorage.h | 3 ---
 src/Storages/PartitionStrategy.h                  | 2 +-
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 0b4bd1af28cf..d04b9bcf2a71 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -5856,7 +5856,7 @@ The maximum number of rows in the right table to determine whether to rerange th
     DECLARE(Bool, allow_experimental_join_right_table_sorting, false, R"(
 If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join.
 )", EXPERIMENTAL) \
-    DECLARE(Bool, use_hive_partitioning, false, R"(
+    DECLARE(Bool, use_hive_partitioning, true, R"(
 When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`.
 )", EXPERIMENTAL)\
     \
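
Note: a toy illustration of the naming rule the use_hive_partitioning description above refers to — this is not ClickHouse's actual extraction code, only the documented convention that every /name=value/ segment of a path is exposed as a virtual column named _name:

#include <iostream>
#include <map>
#include <string>

/// Toy sketch: collect "/name=value/" path segments as "_name" -> "value".
std::map<std::string, std::string> hivePartitionVirtualColumns(const std::string & path)
{
    std::map<std::string, std::string> columns;
    size_t pos = 0;
    while ((pos = path.find('/', pos)) != std::string::npos)
    {
        size_t end = path.find('/', pos + 1);
        if (end == std::string::npos)
            break;
        std::string segment = path.substr(pos + 1, end - pos - 1);
        size_t eq = segment.find('=');
        if (eq != std::string::npos)
            columns["_" + segment.substr(0, eq)] = segment.substr(eq + 1);
        pos = end;
    }
    return columns;
}

int main()
{
    for (const auto & [name, value] : hivePartitionVirtualColumns("/data/year=2022/country=brazil/file.parquet"))
        std::cout << name << " = " << value << '\n';
    /// prints: _country = brazil, then _year = 2022
}
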
)", EXPERIMENTAL)\ \ diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 41ecec9b1083..841ff8f803fb 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -247,9 +247,6 @@ class StorageObjectStorage::Configuration String compression_method = "auto"; String structure = "auto"; std::string partition_strategy = "auto"; - /* - * Only supported by hive partitioning style for now - */ bool hive_partition_strategy_write_partition_columns_into_files = false; virtual void update(ObjectStoragePtr object_storage, ContextPtr local_context); diff --git a/src/Storages/PartitionStrategy.h b/src/Storages/PartitionStrategy.h index e89127fecde7..7ca515ecfc3b 100644 --- a/src/Storages/PartitionStrategy.h +++ b/src/Storages/PartitionStrategy.h @@ -59,7 +59,7 @@ struct PartitionStrategyFactory ContextPtr context, const std::string & file_format, bool has_partition_wildcard, - const std::string & partition_strategy = "", + const std::string & partition_strategy = "auto", bool hive_partition_strategy_write_partition_columns_into_files = false); };