-
Notifications
You must be signed in to change notification settings - Fork 4
S3 hive style writes #697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: antalya
Are you sure you want to change the base?
S3 hive style writes #697
Changes from all commits
152fc42
9af41a7
1aa7db4
4f2ef27
301e61e
7d25cce
f3f7e9a
6a75897
839af35
c8805b5
1926495
33b61d4
35d875a
fc546c9
f9777a5
de868f3
b4bb1a2
4622f9e
3ca53e1
82ee821
d9409cc
b1bca06
3f65f27
7ae3743
35e9c18
1ae8b93
abdd84a
b3b14e6
6a6c9a7
8b73bb5
1236609
3e356bc
0a8bacb
043ce22
3f93ac0
db9bfd0
0a31b75
1dc7f9d
baefb50
1ac2a9c
7e8b4d7
ef99792
be49e03
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#pragma once | ||
|
||
#include <cstdint> | ||
|
||
namespace DB | ||
{ | ||
|
||
uint64_t generateSnowflakeID(); | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -92,8 +92,8 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( | |
if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning]) | ||
sample_path = getPathSample(metadata, context_); | ||
|
||
setInMemoryMetadata(metadata); | ||
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context_, sample_path)); | ||
setInMemoryMetadata(metadata); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
pure_storage = std::make_shared<StorageObjectStorage>( | ||
configuration, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,34 @@ namespace ErrorCodes | |
extern const int BAD_ARGUMENTS; | ||
} | ||
|
||
namespace | ||
{ | ||
void validateKey(const String & str) | ||
{ | ||
/// See: | ||
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html | ||
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject | ||
|
||
if (str.empty() || str.size() > 1024) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just a comment, not for change right now, because this code already exists and was just moved into a namespace, but I don't like it. As I understand it, the key is generated inside ClickHouse code and the customer can't fully control the key length. And when they get this error - what's next? "Ok, the key is too long, how can I fix it?". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not fully generated by ClickHouse. The key here represents the path without the bucket. Part of that can be specified by the user upon table creation. |
||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); | ||
|
||
if (!UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size())) | ||
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); | ||
|
||
PartitionedSink::validatePartitionKey(str, true); | ||
} | ||
|
||
void validateNamespace(const String & str, PartitionedStorageObjectStorageSink::ConfigurationPtr configuration) | ||
{ | ||
configuration->validateNamespace(str); | ||
|
||
if (!UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size())) | ||
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); | ||
|
||
PartitionedSink::validatePartitionKey(str, false); | ||
} | ||
} | ||
|
||
StorageObjectStorageSink::StorageObjectStorageSink( | ||
ObjectStoragePtr object_storage, | ||
ConfigurationPtr configuration, | ||
|
@@ -97,13 +125,13 @@ void StorageObjectStorageSink::cancelBuffers() | |
} | ||
|
||
PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( | ||
std::shared_ptr<PartitionStrategy> partition_strategy_, | ||
ObjectStoragePtr object_storage_, | ||
ConfigurationPtr configuration_, | ||
std::optional<FormatSettings> format_settings_, | ||
const Block & sample_block_, | ||
ContextPtr context_, | ||
const ASTPtr & partition_by) | ||
: PartitionedSink(partition_by, context_, sample_block_) | ||
ContextPtr context_) | ||
: PartitionedSink(partition_strategy_, context_, sample_block_) | ||
, object_storage(object_storage_) | ||
, configuration(configuration_) | ||
, query_settings(configuration_->getQuerySettings(context_)) | ||
|
@@ -121,51 +149,25 @@ StorageObjectStorageSink::~StorageObjectStorageSink() | |
|
||
SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id) | ||
{ | ||
auto partition_bucket = replaceWildcards(configuration->getNamespace(), partition_id); | ||
validateNamespace(partition_bucket); | ||
auto file_path = getPartitionStrategy()->getPath(configuration->getPath(), partition_id); | ||
|
||
auto partition_key = replaceWildcards(configuration->getPath(), partition_id); | ||
validateKey(partition_key); | ||
validateNamespace(configuration->getNamespace(), configuration); | ||
validateKey(file_path); | ||
|
||
if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( | ||
*object_storage, *configuration, query_settings, partition_key, /* sequence_number */1)) | ||
*object_storage, *configuration, query_settings, file_path, /* sequence_number */1)) | ||
{ | ||
partition_key = *new_key; | ||
file_path = *new_key; | ||
} | ||
|
||
return std::make_shared<StorageObjectStorageSink>( | ||
object_storage, | ||
configuration, | ||
format_settings, | ||
sample_block, | ||
getPartitionStrategy()->getBlockWithoutPartitionColumnsIfNeeded(), | ||
context, | ||
partition_key | ||
file_path | ||
); | ||
} | ||
|
||
void PartitionedStorageObjectStorageSink::validateKey(const String & str) | ||
{ | ||
/// See: | ||
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html | ||
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject | ||
|
||
if (str.empty() || str.size() > 1024) | ||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); | ||
|
||
if (!UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size())) | ||
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); | ||
|
||
validatePartitionKey(str, true); | ||
} | ||
|
||
void PartitionedStorageObjectStorageSink::validateNamespace(const String & str) | ||
{ | ||
configuration->validateNamespace(str); | ||
|
||
if (!UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size())) | ||
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); | ||
|
||
validatePartitionKey(str, false); | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is somewhat problematic.
`PartitionStrategyFactory` needs a sample block, so it needs to be after the call to `VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns, context, sample_path, format_settings)`, since it might alter the number of columns. This function needs a `sample_path`, which is resolved by `getPathSample`.
On the other hand, `PartitionStrategyFactory` performs some bucket & key validation which should happen before `getPathSample`