diff --git a/.github/workflows/IntegrationTests.yml b/.github/workflows/IntegrationTests.yml index edf1e5f..9e6c5fd 100644 --- a/.github/workflows/IntegrationTests.yml +++ b/.github/workflows/IntegrationTests.yml @@ -85,4 +85,4 @@ jobs: run: | source ./scripts/run_s3_test_server.sh source ./scripts/set_s3_test_server_variables.sh - make test + ./build/release/test/unittest "*" --skip-error-messages "[]" diff --git a/duckdb b/duckdb index 7d45b33..b8a06e4 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 7d45b33a308f787d7e09346ee76ea403bd140da5 +Subproject commit b8a06e4a22672e254cd0baa68a3dbed2eb51c56e diff --git a/src/httpfs.cpp b/src/httpfs.cpp index 0fc5a24..802581e 100644 --- a/src/httpfs.cpp +++ b/src/httpfs.cpp @@ -266,13 +266,19 @@ unique_ptr HTTPFileSystem::GetRangeRequest(FileHandle &handle, str string responseEtag = response.GetHeaderValue("ETag"); if (!responseEtag.empty() && responseEtag != hfh.etag) { + if (global_metadata_cache) { + global_metadata_cache->Erase(handle.path); + } throw HTTPException( response, - "ETag was initially %s and now it returned %s, this likely means the remote file has " - "changed.\nTry to restart the read or close the file-handle and read the file again (e.g. 
" - "`DETACH` in the file is a database file).\nYou can disable checking etags via `SET " + "ETag on reading file \"%s\" was initially %s and now it returned %s, this likely means " + "the " + "remote file has " + "changed.\nFor parquet or similar single table sources, consider retrying the query, for " + "persistent FileHandles such as databases consider `DETACH` and re-`ATTACH` " + "\nYou can disable checking etags via `SET " "unsafe_disable_etag_checks = true;`", - hfh.etag, response.GetHeaderValue("ETag")); + handle.path, hfh.etag, response.GetHeaderValue("ETag")); } } diff --git a/src/httpfs_extension.cpp b/src/httpfs_extension.cpp index 79d9923..8721c6f 100644 --- a/src/httpfs_extension.cpp +++ b/src/httpfs_extension.cpp @@ -62,6 +62,8 @@ static void LoadInternal(ExtensionLoader &loader) { "http_keep_alive", "Keep alive connections. Setting this to false can help when running into connection failures", LogicalType::BOOLEAN, Value(true)); + config.AddExtensionOption("allow_asterisks_in_http_paths", "Allow '*' character in URLs users can query", + LogicalType::BOOLEAN, Value(false)); config.AddExtensionOption("enable_curl_server_cert_verification", "Enable server side certificate verification for CURL backend.", LogicalType::BOOLEAN, Value(true)); diff --git a/src/include/httpfs.hpp b/src/include/httpfs.hpp index 804968a..6b3ec74 100644 --- a/src/include/httpfs.hpp +++ b/src/include/httpfs.hpp @@ -118,6 +118,14 @@ class HTTPFileSystem : public FileSystem { static bool TryParseLastModifiedTime(const string &timestamp, timestamp_t &result); vector Glob(const string &path, FileOpener *opener = nullptr) override { + if (path.find('*') != std::string::npos && opener) { + Value setting_val; + if (FileOpener::TryGetCurrentSetting(opener, "allow_asterisks_in_http_paths", setting_val) && + !setting_val.GetValue()) { + throw InvalidInputException("Globs (`*`) for generic HTTP files are not supported.\nConsider `SET " "allow_asterisks_in_http_paths = true;` to allow this 
behaviour"); + } + } return {path}; // FIXME } diff --git a/test/extension/duckdb_extension_settings.test b/test/extension/duckdb_extension_settings.test index 9fa62c6..2ec44ce 100644 --- a/test/extension/duckdb_extension_settings.test +++ b/test/extension/duckdb_extension_settings.test @@ -2,6 +2,9 @@ # description: settings for extensions # group: [extension] +# TODO: move back to duckdb/duckdb +mode skip + require httpfs statement ok diff --git a/test/sql/copy/csv/parallel/test_parallel_csv.test b/test/sql/copy/csv/parallel/test_parallel_csv.test index 48b00fe..439c621 100644 --- a/test/sql/copy/csv/parallel/test_parallel_csv.test +++ b/test/sql/copy/csv/parallel/test_parallel_csv.test @@ -2,6 +2,9 @@ # description: Test parallel read CSV function on ghub bugs # group: [parallel] +# TODO: figure out where that bucket went +mode skip + require httpfs query II diff --git a/test/sql/httpfs/globbing.test b/test/sql/httpfs/globbing.test new file mode 100644 index 0000000..7ffe6f0 --- /dev/null +++ b/test/sql/httpfs/globbing.test @@ -0,0 +1,28 @@ +# name: test/sql/httpfs/globbing.test +# description: Ensure globbing in generic HTTP paths is rejected unless allow_asterisks_in_http_paths is set +# group: [httpfs] + +require parquet + +require httpfs + +statement error +select parse_path(filename), size, part, date from read_parquet('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/hive-partitioning/simple/*/*/test.parquet') order by filename; +---- +Invalid Input Error: Globs (`*`) for generic HTTP files are not supported. 
+statement ok +SET allow_asterisks_in_http_paths = true; + +statement error +select parse_path(filename), size, part, date from read_parquet('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/hive-partitioning/simple/*/*/test.parquet') order by filename; +---- +HTTP Error: Unable to connect to URL + +statement ok +SET allow_asterisks_in_http_paths = false; + +statement error +select parse_path(filename), size, part, date from read_parquet('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/hive-partitioning/simple/*/*/test.parquet') order by filename; +---- +Invalid Input Error: Globs (`*`) for generic HTTP files are not supported. diff --git a/test/sql/secrets/create_secret_r2.test b/test/sql/secrets/create_secret_r2.test index 972fe21..d66e00c 100644 --- a/test/sql/secrets/create_secret_r2.test +++ b/test/sql/secrets/create_secret_r2.test @@ -30,7 +30,7 @@ __default_r2 r2 config ['r2://'] statement error FROM 's3://test-bucket/test.csv' ---- -:.*HTTP Error.*HTTP GET error on.* +:.*HTTP Error.* # Account ID is only for R2, trying to set this for S3 will fail statement error diff --git a/test/sql/storage/external_file_cache/external_file_cache_httpfs.test b/test/sql/storage/external_file_cache/external_file_cache_httpfs.test index 2efa361..0b1e70a 100644 --- a/test/sql/storage/external_file_cache/external_file_cache_httpfs.test +++ b/test/sql/storage/external_file_cache/external_file_cache_httpfs.test @@ -8,11 +8,23 @@ require httpfs # first query caches the data statement ok -from 's3://duckdb-blobs/data/shakespeare.parquet'; - +from 'https://blobs.duckdb.org/data/shakespeare.parquet'; # second query should only have a head request, no gets query II -explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet'; +explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet'; ---- analyzed_plan :.*GET: 0.* + +statement ok +SET enable_http_metadata_cache = true; + +# first query saves the metadata 
(and data, but that was already there) +statement ok +from 'https://blobs.duckdb.org/data/shakespeare.parquet'; + +# second query should do no HEAD and no GET +query II +explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet'; +---- +analyzed_plan :.*HEAD: 0.* diff --git a/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow b/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow index 9edf162..f5e02d0 100644 --- a/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow +++ b/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow @@ -8,18 +8,18 @@ require httpfs # first read_blob should do 1 GET query II -explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet'); +explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet'); ---- analyzed_plan :.*GET: 1.* # second one should do 0 query II -explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet'); +explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet'); ---- analyzed_plan :.*GET: 0.* # although the read was cached using read_blob, the parquet reader can read from cache query II -explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet'; +explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet'; ---- analyzed_plan :.*GET: 0.*