From 223347cadf193fa8506a9ff6191496e9dc28460f Mon Sep 17 00:00:00 2001 From: Tom Christian Date: Tue, 8 Jul 2025 15:31:35 -0700 Subject: [PATCH] fix(#177): fixed fixes_to_apply persistence and documented --- README.md | 4 ++++ docs/index-config.md | 17 +++++++++++------ .../src/stac_index/indexer/creator/creator.py | 2 +- scripts/run-with-remote-source.sh | 19 +++++++++++++++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index cb639df..9801b5a 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,13 @@ scripts/run-with-local-http.sh This project includes a convenience script to index and serve a remote STAC catalog. This script will fully index the remote STAC catalog each time it is run. This may not be the most efficient way to meet your needs, but it does help demonstrate some of this project's capabilities. +This script can optionally be called with a comma-separated list of STAC item JSON fixers, invoking the behaviour described [here](./docs/index-config.md#fixes). + ```sh # indexes a public static STAC catalog over HTTPS and runs the API scripts/run-with-remote-source.sh https://esa.pages.eox.at/cubes-and-clouds-catalog/MOOC_Cubes_and_clouds/catalog.json +# indexes and attempts to apply a single fixer if necessary +scripts/run-with-remote-source.sh https://esa.pages.eox.at/cubes-and-clouds-catalog/MOOC_Cubes_and_clouds/catalog.json --fixes_to_apply eo-extension-uri ``` Output includes the following information about the index. diff --git a/docs/index-config.md b/docs/index-config.md index 0589cd0..0488b9d 100644 --- a/docs/index-config.md +++ b/docs/index-config.md @@ -1,15 +1,13 @@ # Index Configuration The indexer requires exactly one of the following arguments -- `--root_catalog_uri` referencing the location of a STAC catalog JSON -- `--manifest_json_uri` referencing the index manifest from a prior indexer run +- `--root_catalog_uri` referencing the location of a STAC catalog JSON. 
+- `--manifest_json_uri` referencing the index manifest from a prior indexer run. -The indexer can optionally accept an argument referencing a JSON index configuration file, which offers greater control over indexer behaviour. The following describes that file's content. +When indexing a new STAC catalog (i.e. not updating an existing index), the indexer can optionally accept an argument referencing a JSON index configuration file, which offers greater control over indexer behaviour. The following describes that file's content. ## Optional Properties -Any number of queryable and sortable STAC properties may be configured. - ### Indexables The indexer requires knowledge of the DuckDB data type that can be used to store queryable or sortable properties. Because properties can be both queryable _and_ sortable this configuration is maintained in the `indexables` property to avoid duplication. @@ -24,6 +22,10 @@ Each queryable and sortable property must include a list of collections for whic Queryables require a `json_schema` property containing a schema that could be used to validate values of this property. This JSON schema is not used directly by the API but is provided to API clients via the `/queryables` endpoints such that a client can validate any value it intends to send as query value for this property. +### Fixes + +The indexer attempts to parse STAC item JSON using [stac-pydantic](https://pypi.org/project/stac-pydantic/). stac-pydantic is not particularly lenient and will reject invalid STAC item JSON, resulting in the STAC item not being indexed and an error in the indexer log. This may be the desired behaviour in some use-cases, but where the STAC item JSON cannot be fixed at its source (for example, because it is not owned or controlled by the indexer's user), it might be preferable to index the invalid JSON anyway. To support this, the index configuration file accepts a `fixes_to_apply` property. This property accepts a list of fixer names that the indexer will attempt to apply to invalid JSON. 
Fixers are defined [in code](../packages/stac-index/src/stac_index/indexer/stac_parser.py) and must exist before being referenced here. The list of available fixers is currently short and may be expanded in future to accommodate common validity problems. + ## Example ```json @@ -52,6 +54,9 @@ Queryables require a `json_schema` property containing a schema that could be us "joplin" ] } - } + }, + "fixes_to_apply": [ + "eo-extension-uri" + ] } ``` diff --git a/packages/stac-index/src/stac_index/indexer/creator/creator.py b/packages/stac-index/src/stac_index/indexer/creator/creator.py index 41130c2..033cb60 100644 --- a/packages/stac-index/src/stac_index/indexer/creator/creator.py +++ b/packages/stac-index/src/stac_index/indexer/creator/creator.py @@ -80,7 +80,6 @@ async def _index_stac_source( self: Self, root_catalog_uri: str, index_config: Optional[IndexConfig] = None, - output_dir: Optional[str] = None, ) -> Tuple[List[IndexingError], str]: _logger.info(f"indexing stac source for load {self._load_id}") self._create_db_objects() @@ -98,6 +97,7 @@ async def _index_stac_source( collection_errors + items_errors, self._export_db_objects( root_catalog_uri=root_catalog_uri, + index_config=index_config, ), ) diff --git a/scripts/run-with-remote-source.sh b/scripts/run-with-remote-source.sh index e85ce16..9e6942d 100755 --- a/scripts/run-with-remote-source.sh +++ b/scripts/run-with-remote-source.sh @@ -10,6 +10,21 @@ if [ "$#" -lt 1 ]; then fi export root_catalog_uri="$1" +shift + +fixes_to_apply="" +while [[ $# -gt 0 ]]; do + case $1 in + --fixes_to_apply) + fixes_to_apply="$2" + shift; shift + ;; + *) + echo "Unknown option $1" + exit 1 + ;; + esac +done if [[ $root_catalog_uri == s3://* ]]; then echo; echo "* Assumes \$AWS_ACCESS_KEY_ID, \$AWS_REGION, \$AWS_SECRET_ACCESS_KEY, and (optionally) \$AWS_SESSION_TOKEN are set for obstore *"; echo @@ -25,9 +40,9 @@ if [ -f $"$tmp_index_path/manifest.json" ]; then unset root_catalog_uri else # No point evaluating this if 
updating an existing index as it will be ignored. - if [ -n "${FIXES_TO_APPLY}" ]; then + if [ -n "$fixes_to_apply" ]; then export tmp_index_config_path=$(mktemp) - fixes_json=$(echo "${FIXES_TO_APPLY}" | sed "s/,\s*/\", \"/g") + fixes_json=$(echo "$fixes_to_apply" | sed "s/,\s*/\", \"/g") echo "{\"fixes_to_apply\": [\"${fixes_json}\"]}" > $tmp_index_config_path fi fi