diff --git a/dvmeta/func.py b/dvmeta/func.py index 7cc72c5..01ebf3a 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -52,7 +52,7 @@ def get_pids(read_dict: dict, config: dict) -> tuple: return empty_dv, write_dict -def check_connection(config: dict) -> bool: +def check_connection(config: dict) -> tuple[bool, bool]: """Check the connection to the dataverse repository. Args: @@ -60,27 +60,36 @@ def check_connection(config: dict) -> bool: auth (bool): Check the connection with authentication Returns: - bool: True if the connection is successful, False otherwise + tuple[bool, bool]: (connected, authenticated); connected is True if the repository is reachable, + authenticated is True only if the request made with the API_KEY succeeded """ - if config.get('API_KEY'): - url = f"{config['BASE_URL']}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1" # noqa: E501 - config['HEADERS'] = {'X-Dataverse-key': config['API_KEY']} - print('Checking the connection to the dataverse repository with authentication...\n') # noqa: E501 - else: - url = f"{config['BASE_URL']}/api/info/version" - config['HEADERS'] = {} - print('Checking the connection to the dataverse repository without authentication...\n') # noqa: E501 + base_url = config.get('BASE_URL') + api_key = config.get('API_KEY') + auth_headers = {'X-Dataverse-key': api_key} if api_key and str(api_key).lower() != 'none' else {} + auth_url = f'{base_url}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1' # noqa: E501 + public_url = f'{base_url}/api/info/version' + try: - with HttpxClient(config) as httpx_client: - response = httpx_client.sync_get(url) + with HttpxClient({**config, 'HEADERS': auth_headers}) as httpx_client: + if auth_headers: + print('Checking the connection to the Dataverse repository with authentication...') + response = httpx_client.sync_get(auth_url) + if response and response.status_code == httpx_client.httpx_success_status: + print(f'Connection to the dataverse repository {config["BASE_URL"]} is successful.\n') + return True, True + print('Your API_KEY is 
invalid. The crawler will now fall back using unauthenticated connection.\n') + + # Attempt to connect to the repository without authentication + response = httpx_client.sync_get(public_url) if response and response.status_code == httpx_client.httpx_success_status: - print(f'Connection to the dataverse repository {config["BASE_URL"]} is successful.\n') # noqa: E501 - return True - print('Your API_KEY is invalid and therefore failed to connect to the dataverse repository. Please check your input.\n') # noqa: E501 - return False + print(f'Unauthenticated connection to the dataverse repository {config["BASE_URL"]} is successful. The script continue crawling.\n') # noqa: E501 + return True, False + print(f'Failed to connect to the dataverse repository {config["BASE_URL"]}.\nExiting...\n') # noqa: E501 + return False, False + except httpx.HTTPStatusError as e: print(f'Failed to connect to the dataverse repository {config["BASE_URL"]}: HTTP Error {e.response.status_code}\n') # noqa: E501 - return False + return False, False def version_type(value: str) -> str: @@ -103,9 +112,7 @@ def version_type(value: str) -> str: if value in valid_special_versions or re.match(r'^\d+(\.\d+)?$', value): return value msg = f'Invalid value for --version: "{value}".\nMust be "draft", "latest", "latest-published", or a version number like "x" or "x.y".' 
# noqa: E501 - raise typer.BadParameter( - msg - ) + raise typer.BadParameter(msg) def validate_spreadsheet(value: bool, dvdfds_metadata: bool) -> bool: diff --git a/dvmeta/main.py b/dvmeta/main.py index 2df929d..bcbee85 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -79,9 +79,6 @@ def main( start_time_obj, start_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time() print(f'Start time: {start_time_display}\n') - # Load the crawler - metadata_crawler = MetaDataCrawler(config) - # Check if either dvdfds_matadata or permission is provided if not dvdfds_matadata and not permission: print( @@ -90,13 +87,22 @@ def main( sys.exit(1) # Check if the authentication token is provided if the permission metadata is requested to be crawled - if permission and config.get('API_KEY') is None: - print('Error: Crawling permission metadata requires API Token. Please provide the API Token.\nExiting...') + if permission and config.get('API_KEY') in (None, 'None'): + print('Error: Crawling permission metadata requires API Token. Please provide the API Token.\nExiting...') sys.exit(1) # Check the connection to the dataverse repository - if not func.check_connection(config): + connection_status, auth_status = func.check_connection(config) + if not connection_status: sys.exit(1) + if not auth_status: + config['API_KEY'] = None + if permission: + print('[WARNING]: Crawling permission metadata requires valid API Token. 
The script will skip crawling permission metadata\n') + permission = False + + # Initialize the crawler + metadata_crawler = MetaDataCrawler(config) # Crawl the collection tree metadata response = metadata_crawler.get_collections_tree(collection_alias) diff --git a/dvmeta/metadatacrawler.py b/dvmeta/metadatacrawler.py index 2d7dca5..0469170 100644 --- a/dvmeta/metadatacrawler.py +++ b/dvmeta/metadatacrawler.py @@ -21,7 +21,7 @@ class MetaDataCrawler: def __init__(self, config: dict) -> None: """Initialize the class with the configuration settings.""" - self.config = config + self.config = self._define_headers(config) self.url_tree = f"{config['BASE_URL']}/api/info/metrics/tree?parentAlias={config['COLLECTION_ALIAS']}" self.http_success_status = 200 self.url_dataverse = f"{config['BASE_URL']}/api/dataverses" @@ -30,7 +30,27 @@ def __init__(self, config: dict) -> None: self.write_dict = {} self.failed_dict = [] self.url = None - self.client = HttpxClient(config) + self.client = HttpxClient(self.config) + + @staticmethod + def _define_headers(config: dict) -> dict: + """Store the HTTP request headers in the configuration. 
+ + Args: + config (dict): Configuration dictionary + + Returns: + dict: The same configuration dictionary, with 'HEADERS' set + """ + headers = {'Accept': 'application/json'} + + api_key = config.get('API_KEY') + if api_key and str(api_key).lower() != 'none': + headers['X-Dataverse-key'] = api_key + + config['HEADERS'] = headers + + return config def _get_dataset_content_url(self, identifier: str) -> str: return f"{self.config['BASE_URL']}/api/datasets/:persistentId/versions/:{self.config['VERSION']}?persistentId={identifier}" # noqa: E501 diff --git a/example.ipynb b/example.ipynb new file mode 100644 index 0000000..ab15bdf --- /dev/null +++ b/example.ipynb @@ -0,0 +1,103 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 1: Setting environment variables\n", + "Replace the values inside the quotes for BASE_URL and API_KEY.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the placeholders with your own values and run this script to create a .env file\n", + "BASE_URL = 'TARGET_REPO_URL' # Base URL of the repository; e.g., \"https://demo.borealisdata.ca/\"\n", + "API_KEY = 'YOUR_API_KEY' # Found in your Dataverse account settings. Optional. Delete this line if you plan not to use it.\n", + "\n", + "\n", + "# Write the .env file\n", + "with open('.env', 'w', encoding='utf-8') as file:\n", + " if locals().get('API_KEY') is None:\n", + " file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n", + " else:\n", + " file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n", + " file.write(f'API_KEY = \"{API_KEY}\"\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 2: Running the command line tool\n", + "The following cell runs the command line tool.\n", + "\n", + "**Configuration**:\n", + "1. Replace the COLLECTION_ALIAS with your desired value. 
See [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Guide:-How-to-find-the-COLLECTION_ALIAS-of-a-Dataverse-collection) for getting your collection alias.\n", + "2. Replace the VERSION with your desired value. It can either be 'latest', 'latest-published' or a version number 'x.y' (like '1.0')\n", + "3. Add the optional flags. See the following table for your reference:\n", + " \n", + "\n", + "| **Option** | **Short** | **Type** | **Description** | **Default** |\n", + "|----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|\n", + "| --auth | -a | TEXT | Authentication token to access the Dataverse repository.
| None |\n", + "| --log
--no-log | -l | | Output a log file.
Use `--no-log` to disable logging. | `log` (unless `--no-log`) |\n", + "| --dvdfds_metadata | -d | | Output a JSON file containing metadata of Dataverses, Datasets, and Data Files. | |\n", + "| --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | |\n", + "| --emptydv | -e | | Output a JSON file that stores all Dataverses which do **not** contain Datasets (though they might have child Dataverses which have Datasets). | |\n", + "| --failed | -f | | Output a JSON file of Dataverses/Datasets that failed to be crawled. | |\n", + "| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. | |\n", + "| --help | | | Show the help message. | |\n", + "\n", + "Example:\n", + "1. Export the metadata of latest version of datasets under collection 'demo' to JSON\n", + "\n", + " `!python3 dvmeta/main.py -c demo -v latest -d`\n", + "\n", + "2. Export the metadata of version 1.0 of all datasets under collection 'demo' to JSON and CSV\n", + "\n", + " `!python3 dvmeta/main.py -c demo -v 1.0 -d -s`\n", + "\n", + "3. Export the metadata and permission metadata of version latest-published of all datasets under collection 'toronto' to JSON and CSV. 
Also export the empty dataverses and the datasets that failed to be crawled\n", + "\n", + " `!python3 dvmeta/main.py -c toronto -v latest-published -d -s -p -e -f`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the command line interface\n", + "# Replace 'COLLECTION_ALIAS' and 'VERSION' with your values\n", + "# Modify the flags as needed referring to the table above\n", + "!python3 dvmeta/main.py -c 'COLLECTION_ALIAS' -v 'VERSION' -d -s -p -e -f" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}