Commit e8987c8

Improve the API

1 parent 7ad421c commit e8987c8

File tree

9 files changed: +321 -122 lines changed

README.md

Lines changed: 24 additions & 42 deletions

@@ -108,56 +108,38 @@ select gaggle_list_files('habedi/flickr-8k-dataset-clean') as files;
 
 select gaggle_info('habedi/flickr-8k-dataset-clean') as metadata;
 
--- Read a CSV file directly from local path after download
-select *
-from read_csv_auto('/path/to/downloaded/dataset/file.csv') limit 10;
+-- Read a Parquet file from the local cache using a prepared statement
+-- (no subquery in table function arguments)
+prepare rp as select * from read_parquet(?) limit 10;
+execute rp(gaggle_get_file_path('habedi/flickr-8k-dataset-clean','flickr8k.parquet'));
+
+-- Use the replacement scan to read directly via a kaggle: URL
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
+-- Or glob Parquet files in a dataset directory
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/*.parquet';
 ```
 
-[![Simple Demo 1](https://asciinema.org/a/745806.svg)](https://asciinema.org/a/745806)
-
-#### API Functions
-
-| Function | Description |
-|-----------------------------------------|------------------------------------------|
-| `gaggle_set_credentials(username, key)` | Set Kaggle API credentials |
-| `gaggle_search(query, page, page_size)` | Search for datasets on Kaggle |
-| `gaggle_download(dataset_path)` | Download a dataset and return local path |
-| `gaggle_list_files(dataset_path)` | List files in a dataset (JSON array) |
-| `gaggle_info(dataset_path)` | Get dataset metadata (JSON object) |
-| `gaggle_get_version()` | Get extension version info |
-| `gaggle_clear_cache()` | Clear the local dataset cache |
-| `gaggle_get_cache_info()` | Get cache statistics |
-
-#### Configuration
+```sql
+load 'build/release/extension/gaggle/gaggle.duckdb_extension';
+select gaggle_set_credentials('your-username','your-api-key');
 
-Gaggle can be configured via environment variables:
+-- Prime the cache
+select gaggle_download('habedi/flickr-8k-dataset-clean');
 
-- `KAGGLE_USERNAME` - Your Kaggle username
-- `KAGGLE_KEY` - Your Kaggle API key
-- `GAGGLE_CACHE_DIR` - Directory for caching datasets (default: system cache dir)
-- `GAGGLE_VERBOSE` - Enable verbose logging (default: false)
-- `GAGGLE_HTTP_TIMEOUT` - HTTP timeout in seconds (default: 30)
+-- Replacement scan over a single Parquet file
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
 
-Alternatively, create `~/.kaggle/kaggle.json`:
+-- If the file is nested or the name differs, try a glob:
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/*.parquet';
+-- Or even broader:
+-- select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/*flickr8k*.parquet';
 
-```json
-{
-  "username": "your-username",
-  "key": "your-api-key"
-}
+-- Direct read via file path without a subquery in the table function
+prepare rp as select * from read_parquet(?) limit 10;
+execute rp(gaggle_get_file_path('habedi/flickr-8k-dataset-clean','flickr8k.parquet'));
 ```
 
-##### JSON Parsing
-
-> [!TIP]
-> Gaggle returns JSON data for search results, file lists, and metadata.
-> For advanced JSON parsing, you can optionally load the JSON DuckDB extension:
-> ```sql
-> install json;
-> load json;
-> select * from json_each(gaggle_search('covid-19', 1, 10));
-> ```
-> If the JSON extension is not available, you can still access the raw JSON strings and work with them directly.
+[![Simple Demo 1](https://asciinema.org/a/745806.svg)](https://asciinema.org/a/745806)
 
 ---
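A side note on the prepared-statement pattern this hunk introduces: one `prepare` serves any file resolved from the local cache. A minimal sketch along those lines, where the second filename is purely hypothetical:

```sql
-- One prepared statement, reusable for any cached Parquet file.
prepare read_any as select * from read_parquet(?) limit 10;

execute read_any(gaggle_get_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
-- A second, hypothetical file in the same dataset would read identically:
-- execute read_any(gaggle_get_file_path('habedi/flickr-8k-dataset-clean', 'another_file.parquet'));
```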

ROADMAP.md

Lines changed: 2 additions & 2 deletions

@@ -54,7 +54,7 @@ It outlines features to be implemented and their current status.
 * **Network Optimization**
   * [x] Configurable HTTP timeouts.
   * [ ] Connection pooling for Kaggle API requests.
-  * [ ] Retry logic with exponential backoff.
+  * [x] Retry logic with exponential backoff.
 * **Caching Strategy**
   * [ ] Incremental cache updates.
   * [ ] Background cache synchronization.
@@ -67,7 +67,7 @@ It outlines features to be implemented and their current status.
   * [x] Clear error messages for `NULL` inputs.
   * [ ] Detailed error codes for programmatic error handling.
 * **Resilience**
-  * [ ] Automatic retry on network failures.
+  * [x] Automatic retry on network failures.
   * [ ] Graceful degradation when Kaggle API is unavailable.
   * [ ] Local-only mode for cached datasets.
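For context on the two newly checked items, here is a hedged sketch of how the retry behavior would be driven by the `GAGGLE_HTTP_RETRY_*` variables documented in docs/README.md below; the doubling schedule is an assumption based on the phrase "exponential backoff":

```sql
-- Illustrative configuration only; set these in the shell before starting DuckDB:
--   export GAGGLE_HTTP_RETRY_ATTEMPTS=3   -- retry up to 3 times on HTTP errors
--   export GAGGLE_HTTP_RETRY_DELAY=500    -- first retry after ~500 ms
-- With exponential backoff the waits would then be roughly 500, 1000, 2000 ms.

-- A transient network failure here would be retried automatically
-- instead of failing on the first attempt:
select gaggle_download('habedi/flickr-8k-dataset-clean');
```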

docs/README.md

Lines changed: 34 additions & 8 deletions

@@ -6,14 +6,41 @@ The table below includes the information about all SQL functions exposed by Gagg
 |---|:--------------------------------------------------------|:-----------------|:------------------------------------------------------------------------------------|
 | 1 | `gaggle_set_credentials(username VARCHAR, key VARCHAR)` | `BOOLEAN` | Sets Kaggle API credentials for the session. Returns `true` on success. |
 | 2 | `gaggle_search(query VARCHAR, page INTEGER, page_size INTEGER)` | `VARCHAR (JSON)` | Searches Kaggle for datasets matching the query and returns results as JSON. |
-| 3 | `gaggle_list_files(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Lists all files in a Kaggle dataset (format: 'owner/dataset-name'). |
-| 4 | `gaggle_download(dataset_path VARCHAR)` | `VARCHAR` | Downloads a Kaggle dataset and returns the local cache directory path. |
-| 5 | `gaggle_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns metadata for a Kaggle dataset including size, description, and update info. |
+| 3 | `gaggle_list_files(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Lists all files in a Kaggle dataset (format: 'owner/dataset-name'). |
+| 4 | `gaggle_download(dataset_path VARCHAR)` | `VARCHAR` | Downloads a Kaggle dataset and returns the local cache directory path. |
+| 5 | `gaggle_info(dataset_path VARCHAR)` | `VARCHAR (JSON)` | Returns metadata for a Kaggle dataset including size, description, and update info. |
+| 6 | `gaggle_get_version()` | `VARCHAR (JSON)` | Returns version information for the Gaggle extension. |
+| 7 | `gaggle_last_error()` | `VARCHAR` | Returns the last error message recorded by the extension (empty string if none). |
+| 8 | `gaggle_clear_cache()` | `BOOLEAN` | Clears the local cache directory used by Gaggle. |
+| 9 | `gaggle_get_cache_info()` | `VARCHAR (JSON)` | Returns cache statistics including size and location. |
+| 10 | `gaggle_json_each(json VARCHAR)` | `VARCHAR` | Returns newline-delimited JSON records from a JSON array string. |
+| 11 | `gaggle_get_file_path(dataset_path VARCHAR, filename VARCHAR)` | `VARCHAR` | Resolves and returns the local file path for a file inside a downloaded dataset. |
 
 > [!NOTE]
 > Kaggle credentials can be provided via environment variables (`KAGGLE_USERNAME`, `KAGGLE_KEY`),
 > a `~/.kaggle/kaggle.json` file, or using the `gaggle_set_credentials()` function.
 
+### Configuration
+
+Gaggle can be configured via environment variables:
+
+- `KAGGLE_USERNAME` - Your Kaggle username
+- `KAGGLE_KEY` - Your Kaggle API key
+- `GAGGLE_CACHE_DIR` - Directory for caching datasets (default: system cache dir)
+- `GAGGLE_VERBOSE` - Enable verbose logging (default: false)
+- `GAGGLE_HTTP_TIMEOUT` - HTTP timeout in seconds (default: 30)
+- `GAGGLE_HTTP_RETRY_ATTEMPTS` - Number of retry attempts on HTTP errors (default: 0)
+- `GAGGLE_HTTP_RETRY_DELAY` - Initial retry delay in milliseconds (default: 1000)
+
+Alternatively, create `~/.kaggle/kaggle.json`:
+
+```json
+{
+  "username": "your-username",
+  "key": "your-api-key"
+}
+```
+
 ---
 
 ### Usage Examples
@@ -62,15 +89,14 @@ from read_csv('~/.gaggle_cache/datasets/username/dataset-name/file.csv');
 
 ```sql
 -- Load the extension
-LOAD
-'build/release/extension/gaggle/gaggle.duckdb_extension';
+LOAD 'build/release/extension/gaggle/gaggle.duckdb_extension';
 
 -- Search for a dataset
 select gaggle_search('iris', 1, 10);
 
--- Download and read the dataset
-select *
-from read_csv((select gaggle_download('uciml/iris') || '/iris.csv'));
+-- Download and read the dataset (avoid subqueries in table function args)
+prepare rp as select * from read_csv(?) limit 10;
+execute rp(gaggle_download('uciml/iris') || '/iris.csv');
 ```
 
 ---
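One pairing worth spelling out from the table: the new `gaggle_json_each` (row 10) gives a JSON-extension-free way to break up the arrays that `gaggle_list_files` and `gaggle_search` return. A minimal sketch, assuming only the newline-delimited output the table describes:

```sql
load 'build/release/extension/gaggle/gaggle.duckdb_extension';

-- gaggle_list_files returns a JSON array string; gaggle_json_each
-- re-emits it as newline-delimited JSON, one record per line.
select gaggle_json_each(gaggle_list_files('habedi/flickr-8k-dataset-clean')) as ndjson;

-- Splitting on newlines then yields one row per file record,
-- with no dependency on the core json extension:
select unnest(string_split(
    gaggle_json_each(gaggle_list_files('habedi/flickr-8k-dataset-clean')),
    chr(10)
)) as file_record;
```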

docs/examples/e2_advanced_features.sql

Lines changed: 15 additions & 7 deletions

@@ -9,23 +9,31 @@ load 'build/release/extension/gaggle/gaggle.duckdb_extension';
 select gaggle_set_credentials('your-username', 'your-api-key') as credentials_set;
 
 -- Get path to specific file
-select gaggle_get_file_path('owid/covid-latest-data', 'owid-covid-latest.csv') as file_path;
+select gaggle_get_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet') as file_path;
 
--- Use the file path with DuckDB's read_csv_auto
-select * from read_csv_auto(
-  (select gaggle_get_file_path('owid/covid-latest-data', 'owid-covid-latest.csv'))
-) limit 10;
+-- Use the file path with DuckDB's read_parquet.
+-- DuckDB table functions cannot contain subqueries as arguments,
+-- so use PREPARE/EXECUTE to pass the computed path as a parameter instead.
+prepare rp as select * from read_parquet(?) limit 10;
+execute rp(gaggle_get_file_path('habedi/flickr-8k-dataset-clean', 'flickr8k.parquet'));
 
 -- section 2: List and process multiple files
 select '## List and process dataset files';
 with files as (
-  select gaggle_list_files('owid/covid-latest-data') as files_json
+  select gaggle_list_files('habedi/flickr-8k-dataset-clean') as files_json
 )
 select files_json from files;
 
+-- section 2b: Use replacement scan for direct reads via kaggle: URLs
+select '## Replacement scan - direct reads via kaggle:';
+-- Single file read
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
+-- Glob pattern over Parquet files
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/*.parquet';
+
 -- section 3: Download and verify cache
 select '## Verify dataset is cached';
-select gaggle_download('owid/covid-latest-data') as cached_path;
+select gaggle_download('habedi/flickr-8k-dataset-clean') as cached_path;
 select gaggle_get_cache_info() as cache_status;
 
 -- section 4: Clear cache if needed
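A design note on section 2b: because `kaggle:` URLs resolve through DuckDB's replacement scan, they act like any other table source and compose with ordinary SQL. A hedged sketch, with the materialization step added here for illustration only:

```sql
-- The kaggle: source composes with CTAS, filters, and aggregates
-- exactly as a local file path would.
create table flickr_local as
select * from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';

select count(*) as rows_materialized from flickr_local;
```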

docs/examples/gaggle_usage.sql

Lines changed: 26 additions & 8 deletions

@@ -1,8 +1,26 @@
 -- Gaggle - Kaggle Dataset Extension for DuckDB
 -- Example Usage
 
--- load the extension
+.echo on
+
+-- Optional: configure retry/backoff via environment
+-- export GAGGLE_HTTP_RETRY_ATTEMPTS=3
+-- export GAGGLE_HTTP_RETRY_DELAY=250
+
+-- Load the extension and set credentials
 load 'build/release/extension/gaggle/gaggle.duckdb_extension';
+select gaggle_set_credentials('your-username', 'your-api-key') as credentials_set;
+
+-- Download a dataset and read a file via local path
+select gaggle_download('habedi/flickr-8k-dataset-clean') as flickr_path;
+select * from read_parquet((select gaggle_download('habedi/flickr-8k-dataset-clean') || '/flickr8k.parquet')) limit 5;
+
+-- Read directly via kaggle: URL using replacement scan
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/flickr8k.parquet';
+-- Glob Parquet files in a dataset directory
+select count(*) from 'kaggle:habedi/flickr-8k-dataset-clean/*.parquet';
+
+.echo off
 
 -- set kaggle credentials (or use kaggle_username and kaggle_key env vars, or ~/.kaggle/kaggle.json)
 select gaggle_set_credentials('your-username', 'your-api-key');
@@ -11,21 +29,21 @@ select gaggle_set_credentials('your-username', 'your-api-key');
 select gaggle_get_version();
 
 -- search for datasets
-select * from json_each(gaggle_search('covid-19', 1, 10));
+select * from json_each(gaggle_search('flickr', 1, 10));
 
 -- download a dataset
-select gaggle_download('owid/covid-latest-data');
+select gaggle_download('habedi/flickr-8k-dataset-clean');
 
 -- list files in a dataset
-select * from json_each(gaggle_list_files('owid/covid-latest-data'));
+select * from json_each(gaggle_list_files('habedi/flickr-8k-dataset-clean'));
 
 -- get dataset metadata
-select * from json_each(gaggle_info('owid/covid-latest-data'));
+select * from json_each(gaggle_info('habedi/flickr-8k-dataset-clean'));
 
 -- read a csv file from kaggle dataset directly
--- option 1: using read_csv with the file path
-select * from read_csv_auto(
-  (select gaggle_download('owid/covid-latest-data') || '/owid-covid-latest.csv')
+-- option 1: using read_parquet with the file path
+select * from read_parquet(
+  (select gaggle_download('habedi/flickr-8k-dataset-clean') || '/flickr8k.parquet')
 ) limit 10;
 
 -- clear cache
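A natural companion to this script is `gaggle_last_error()` from the docs/README.md table above. A small sketch; the empty-string-on-success behavior comes from that table, the rest is illustrative:

```sql
-- Check whether the previous Gaggle call recorded an error.
select gaggle_download('habedi/flickr-8k-dataset-clean') as path;

select case
           when gaggle_last_error() = '' then 'ok'
           else 'failed: ' || gaggle_last_error()
       end as download_status;
```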
