Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/IntegrationTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@ jobs:
run: |
source ./scripts/run_s3_test_server.sh
source ./scripts/set_s3_test_server_variables.sh
make test
./build/release/test/unittest "*" --skip-error-messages "[]"
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 57 files
+1 −1 .github/config/extensions/ducklake.cmake
+1 −1 .github/config/extensions/httpfs.cmake
+1 −1 .github/config/extensions/iceberg.cmake
+3 −3 .github/workflows/Extensions.yml
+1 −2 CMakeLists.txt
+1 −2 extension/icu/icu-strptime.cpp
+14 −23 extension/parquet/column_writer.cpp
+1 −1 extension/parquet/include/column_writer.hpp
+0 −4 extension/parquet/include/writer/array_column_writer.hpp
+28 −22 extension/parquet/writer/array_column_writer.cpp
+2 −3 extension/parquet/writer/struct_column_writer.cpp
+1 −1 scripts/amalgamation.py
+1 −1 scripts/extension-upload-all.sh
+1 −1 scripts/extension-upload-from-nightly.sh
+1 −1 scripts/extension-upload-repository.sh
+2 −2 scripts/extension-upload-wasm.sh
+1 −1 scripts/package_build.py
+6 −12 src/CMakeLists.txt
+0 −20 src/common/enum_util.cpp
+0 −9 src/common/string_util.cpp
+1 −1 src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+2 −2 src/execution/operator/helper/physical_reset.cpp
+1 −1 src/execution/operator/schema/physical_attach.cpp
+2 −2 src/function/scalar/variant/functions.json
+1 −11 src/function/table/system/duckdb_settings.cpp
+0 −8 src/include/duckdb/common/enum_util.hpp
+1 −1 src/include/duckdb/function/scalar/variant_functions.hpp
+6 −6 src/include/duckdb/logging/log_storage.hpp
+3 −26 src/include/duckdb/logging/log_type.hpp
+0 −2 src/include/duckdb/main/client_context.hpp
+0 −3 src/include/duckdb/main/database_manager.hpp
+7 −9 src/include/duckdb/storage/metadata/metadata_manager.hpp
+0 −63 src/logging/log_types.cpp
+19 −27 src/main/client_context.cpp
+3 −18 src/main/database_manager.cpp
+7 −3 src/optimizer/rule/comparison_simplification.cpp
+0 −2 src/parser/transform/statement/transform_create_function.cpp
+25 −67 src/storage/metadata/metadata_manager.cpp
+1 −1 src/storage/table/array_column_data.cpp
+0 −14 src/storage/table/column_data.cpp
+1 −15 src/storage/table/column_data_checkpointer.cpp
+2 −6 src/storage/table/row_group_collection.cpp
+1 −1 test/api/test_reset.cpp
+0 −128 test/common/test_string_util.cpp
+0 −126 test/configs/one_schema_per_test.json
+0 −37 test/optimizer/test_try_cast_decimal.test
+2 −1 test/sql/catalog/function/test_macro_default_arg_with_dependencies.test
+4 −36 test/sql/catalog/function/test_simple_macro.test
+0 −23 test/sql/copy/csv/test_null_padding_quoted_new_line.test_slow
+2 −2 test/sql/copy/s3/metadata_cache.test
+2 −2 test/sql/export/export_compression_level.test
+0 −6 test/sql/function/timestamp/test_icu_strptime.test
+52 −162 test/sql/parallelism/interquery/concurrent_attach_detach.cpp
+0 −26 test/sql/settings/setting_alias.test
+0 −51 test/sql/storage/checkpoint/concurrent_load_delete.test_slow
+6 −6 test/sqlite/sqllogic_command.cpp
+1 −1 test/sqlite/sqllogic_test_runner.cpp
14 changes: 10 additions & 4 deletions src/httpfs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,19 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
string responseEtag = response.GetHeaderValue("ETag");

if (!responseEtag.empty() && responseEtag != hfh.etag) {
if (global_metadata_cache) {
global_metadata_cache->Erase(handle.path);
}
throw HTTPException(
response,
"ETag was initially %s and now it returned %s, this likely means the remote file has "
"changed.\nTry to restart the read or close the file-handle and read the file again (e.g. "
"`DETACH` in the file is a database file).\nYou can disable checking etags via `SET "
"ETag on reading file \"%s\" was initially %s and now it returned %s, this likely means "
"the "
"remote file has "
"changed.\nFor parquet or similar single table sources, consider retrying the query, for "
"persistent FileHandles such as databases consider `DETACH` and re-`ATTACH` "
"\nYou can disable checking etags via `SET "
"unsafe_disable_etag_checks = true;`",
hfh.etag, response.GetHeaderValue("ETag"));
handle.path, hfh.etag, response.GetHeaderValue("ETag"));
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/httpfs_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ static void LoadInternal(ExtensionLoader &loader) {
"http_keep_alive",
"Keep alive connections. Setting this to false can help when running into connection failures",
LogicalType::BOOLEAN, Value(true));
config.AddExtensionOption("allow_asterisks_in_http_paths", "Allow '*' character in URLs users can query",
LogicalType::BOOLEAN, Value(false));
config.AddExtensionOption("enable_curl_server_cert_verification",
"Enable server side certificate verification for CURL backend.", LogicalType::BOOLEAN,
Value(true));
Expand Down
8 changes: 8 additions & 0 deletions src/include/httpfs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,14 @@ class HTTPFileSystem : public FileSystem {
static bool TryParseLastModifiedTime(const string &timestamp, timestamp_t &result);

vector<OpenFileInfo> Glob(const string &path, FileOpener *opener = nullptr) override {
if (path.find('*') != std::string::npos && opener) {
Value setting_val;
if (FileOpener::TryGetCurrentSetting(opener, "allow_asterisks_in_http_paths", setting_val) &&
!setting_val.GetValue<bool>()) {
throw InvalidInputException("Globs (`*`) for generic HTTP file is are not supported.\nConsider `SET "
"allow_asterisks_in_http_paths = true;` to allow this behaviour");
}
}
return {path}; // FIXME
}

Expand Down
3 changes: 3 additions & 0 deletions test/extension/duckdb_extension_settings.test
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
# description: settings for extensions
# group: [extension]

# TODO: move back to duckdb/duckdb
mode skip

require httpfs

statement ok
Expand Down
3 changes: 3 additions & 0 deletions test/sql/copy/csv/parallel/test_parallel_csv.test
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
# description: Test parallel read CSV function on ghub bugs
# group: [parallel]

# TODO: figure out where that bucket went
mode skip

require httpfs

query II
Expand Down
28 changes: 28 additions & 0 deletions test/sql/httpfs/globbing.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# name: test/sql/httpfs/globbing.test
# description: Ensure the HuggingFace filesystem works as expected
# group: [httpfs]

require parquet

require httpfs

statement error
select parse_path(filename), size, part, date from read_parquet('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/hive-partitioning/simple/*/*/test.parquet') order by filename;
----
Invalid Input Error: Globs (`*`) for generic HTTP file is are not supported.

statement ok
SET allow_asterisks_in_http_paths = true;

statement error
select parse_path(filename), size, part, date from read_parquet('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/hive-partitioning/simple/*/*/test.parquet') order by filename;
----
HTTP Error: Unable to connect to URL

statement ok
SET allow_asterisks_in_http_paths = false;

statement error
select parse_path(filename), size, part, date from read_parquet('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/hive-partitioning/simple/*/*/test.parquet') order by filename;
----
Invalid Input Error: Globs (`*`) for generic HTTP file is are not supported.
2 changes: 1 addition & 1 deletion test/sql/secrets/create_secret_r2.test
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ __default_r2 r2 config ['r2://']
statement error
FROM 's3://test-bucket/test.csv'
----
<REGEX>:.*HTTP Error.*HTTP GET error on.*
<REGEX>:.*HTTP Error.*

# Account ID is only for R2, trying to set this for S3 will fail
statement error
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,23 @@ require httpfs

# first query caches the data
statement ok
from 's3://duckdb-blobs/data/shakespeare.parquet';

from 'https://blobs.duckdb.org/data/shakespeare.parquet';

# second query should only have a head request, no gets
query II
explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet';
explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
----
analyzed_plan <REGEX>:.*GET: 0.*

statement ok
SET enable_http_metadata_cache = true;

# first query saves the metadata (and data, but that was already there)
statement ok
from 'https://blobs.duckdb.org/data/shakespeare.parquet';

# second query should do no HEAD and no GET
query II
explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
----
analyzed_plan <REGEX>:.*HEAD: 0.*
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@ require httpfs

# first read_blob should do 1 GET
query II
explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet');
explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet');
----
analyzed_plan <REGEX>:.*GET: 1.*

# second one should do 0
query II
explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet');
explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet');
----
analyzed_plan <REGEX>:.*GET: 0.*

# although the read was cached using read_blob, the parquet reader can read from cache
query II
explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet';
explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
----
analyzed_plan <REGEX>:.*GET: 0.*
Loading