Skip to content

Commit a10cb2b

Browse files
Update vendored DuckDB sources to d573b275ce
1 parent ab0b217 commit a10cb2b

File tree

57 files changed

+1664
-566
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

57 files changed

+1664
-566
lines changed

src/duckdb/extension/parquet/include/parquet_reader.hpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,6 @@ struct ParquetOptions {
105105
explicit ParquetOptions(ClientContext &context);
106106

107107
bool binary_as_string = false;
108-
bool variant_legacy_encoding = false;
109108
bool file_row_number = false;
110109
shared_ptr<ParquetEncryptionConfig> encryption_config;
111110
bool debug_use_openssl = true;

src/duckdb/extension/parquet/include/reader/variant/variant_value.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ struct VariantValue {
4242

4343
public:
4444
yyjson_mut_val *ToJSON(ClientContext &context, yyjson_mut_doc *doc) const;
45+
static void ToVARIANT(vector<VariantValue> &input, Vector &result);
4546

4647
public:
4748
VariantValueType value_type;

src/duckdb/extension/parquet/include/reader/variant_column_reader.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ namespace duckdb {
1515

1616
class VariantColumnReader : public ColumnReader {
1717
public:
18-
static constexpr const PhysicalType TYPE = PhysicalType::VARCHAR;
18+
static constexpr const PhysicalType TYPE = PhysicalType::STRUCT;
1919

2020
public:
2121
VariantColumnReader(ClientContext &context, ParquetReader &reader, const ParquetColumnSchema &schema,

src/duckdb/extension/parquet/parquet_extension.cpp

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -998,9 +998,6 @@ static void LoadInternal(ExtensionLoader &loader) {
998998
"enable_geoparquet_conversion",
999999
"Attempt to decode/encode geometry data in/as GeoParquet files if the spatial extension is present.",
10001000
LogicalType::BOOLEAN, Value::BOOLEAN(true));
1001-
config.AddExtensionOption("variant_legacy_encoding",
1002-
"Enables the Parquet reader to identify a Variant structurally.", LogicalType::BOOLEAN,
1003-
Value::BOOLEAN(false));
10041001
}
10051002

10061003
void ParquetExtension::Load(ExtensionLoader &loader) {

src/duckdb/extension/parquet/parquet_multi_file_info.cpp

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -397,10 +397,6 @@ bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &ori
397397
options.binary_as_string = BooleanValue::Get(val);
398398
return true;
399399
}
400-
if (key == "variant_legacy_encoding") {
401-
options.variant_legacy_encoding = BooleanValue::Get(val);
402-
return true;
403-
}
404400
if (key == "file_row_number") {
405401
options.file_row_number = BooleanValue::Get(val);
406402
return true;

src/duckdb/extension/parquet/parquet_reader.cpp

Lines changed: 1 addition & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -517,52 +517,6 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(const FileMetaData &file_m
517517
return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, parquet_options.can_have_nan);
518518
}
519519

520-
static bool IsVariantType(const SchemaElement &root, const vector<ParquetColumnSchema> &children) {
521-
if (children.size() < 2) {
522-
return false;
523-
}
524-
auto &child0 = children[0];
525-
auto &child1 = children[1];
526-
527-
ParquetColumnSchema const *metadata;
528-
ParquetColumnSchema const *value;
529-
530-
if (child0.name == "metadata" && child1.name == "value") {
531-
metadata = &child0;
532-
value = &child1;
533-
} else if (child1.name == "metadata" && child0.name == "value") {
534-
metadata = &child1;
535-
value = &child0;
536-
} else {
537-
return false;
538-
}
539-
540-
//! Verify names
541-
if (metadata->name != "metadata") {
542-
return false;
543-
}
544-
if (value->name != "value") {
545-
return false;
546-
}
547-
548-
//! Verify types
549-
if (metadata->parquet_type != duckdb_parquet::Type::BYTE_ARRAY) {
550-
return false;
551-
}
552-
if (value->parquet_type != duckdb_parquet::Type::BYTE_ARRAY) {
553-
return false;
554-
}
555-
if (children.size() == 3) {
556-
auto &typed_value = children[2];
557-
if (typed_value.name != "typed_value") {
558-
return false;
559-
}
560-
} else if (children.size() != 2) {
561-
return false;
562-
}
563-
return true;
564-
}
565-
566520
ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat,
567521
idx_t &next_schema_idx, idx_t &next_file_idx,
568522
ClientContext &context) {
@@ -629,9 +583,6 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
629583
const bool is_map = s_ele.__isset.converted_type && s_ele.converted_type == ConvertedType::MAP;
630584
bool is_map_kv = s_ele.__isset.converted_type && s_ele.converted_type == ConvertedType::MAP_KEY_VALUE;
631585
bool is_variant = s_ele.__isset.logicalType && s_ele.logicalType.__isset.VARIANT == true;
632-
if (!is_variant) {
633-
is_variant = parquet_options.variant_legacy_encoding && IsVariantType(s_ele, child_schemas);
634-
}
635586

636587
if (!is_map_kv && this_idx > 0) {
637588
// check if the parent node of this is a map
@@ -667,7 +618,7 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d
667618

668619
LogicalType result_type;
669620
if (is_variant) {
670-
result_type = LogicalType::JSON();
621+
result_type = LogicalType::VARIANT();
671622
} else {
672623
result_type = LogicalType::STRUCT(std::move(struct_types));
673624
}
@@ -808,9 +759,6 @@ ParquetOptions::ParquetOptions(ClientContext &context) {
808759
if (context.TryGetCurrentSetting("binary_as_string", lookup_value)) {
809760
binary_as_string = lookup_value.GetValue<bool>();
810761
}
811-
if (context.TryGetCurrentSetting("variant_legacy_encoding", lookup_value)) {
812-
variant_legacy_encoding = lookup_value.GetValue<bool>();
813-
}
814762
}
815763

816764
ParquetColumnDefinition ParquetColumnDefinition::FromSchemaValue(ClientContext &context, const Value &column_value) {

src/duckdb/extension/parquet/parquet_writer.cpp

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -541,7 +541,7 @@ void ParquetWriter::FlushRowGroup(PreparedRowGroup &prepared) {
541541
row_group.__isset.total_compressed_size = true;
542542

543543
if (encryption_config) {
544-
auto row_group_ordinal = num_row_groups.load();
544+
const auto row_group_ordinal = file_meta_data.row_groups.size();
545545
if (row_group_ordinal > std::numeric_limits<int16_t>::max()) {
546546
throw InvalidInputException("RowGroup ordinal exceeds 32767 when encryption enabled");
547547
}
@@ -562,6 +562,14 @@ void ParquetWriter::Flush(ColumnDataCollection &buffer) {
562562
return;
563563
}
564564

565+
// "total_written" is only used for the FILE_SIZE_BYTES flag, and only when threads are writing in parallel.
566+
// We pre-emptively increase it here to try to reduce overshooting when many threads are writing in parallel.
567+
// However, waiting for the exact value (PrepareRowGroup) takes too long, and would cause overshoots to happen.
568+
// So, we guess the compression ratio. We guess 3x, but this will be off depending on the data.
569+
// "total_written" is restored to the exact number of written bytes at the end of FlushRowGroup.
570+
// PhysicalCopyToFile should be reworked to use prepare/flush batch separately for better accuracy.
571+
total_written += buffer.SizeInBytes() / 3;
572+
565573
PreparedRowGroup prepared_row_group;
566574
PrepareRowGroup(buffer, prepared_row_group);
567575
buffer.Reset();

0 commit comments

Comments (0)