From d0b024de967abc1f5ba43dd06ec862bfca947eda Mon Sep 17 00:00:00 2001
From: Ken Lui <116421546+kenlhlui@users.noreply.github.com>
Date: Mon, 3 Feb 2025 11:01:20 -0500
Subject: [PATCH] 1. Updated handling of connection check. Fall back to an
 unauthenticated connection if the API key is invalid. 2. Moved defining
 headers to the `MetaDataCrawler` class. 3. Added example.ipynb for running
 the crawler on mybinder.org. 4. Updated README, CITATION.cff and
 pyproject.toml.

---
 .../workflows/poetry-export_dependencies.yml |   3 +-
 CITATION.cff                                 |   4 +-
 README.md                                    |  20 ++--
 dvmeta/func.py                               |  45 ++++----
 dvmeta/main.py                               |  18 ++-
 dvmeta/metadatacrawler.py                    |  24 +++-
 example.ipynb                                | 104 ++++++++++++++++++
 pyproject.toml                               |   2 +-
 8 files changed, 182 insertions(+), 38 deletions(-)
 create mode 100644 example.ipynb

diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml
index ded344b..685b151 100644
--- a/.github/workflows/poetry-export_dependencies.yml
+++ b/.github/workflows/poetry-export_dependencies.yml
@@ -38,7 +38,8 @@ jobs:
      - name: Check for changes
        id: check_changes
        run: |
-          if [[ -n "$(git status --porcelain requirements.txt poetry.lock)" ]]; then
+          # Use git diff to check actual content changes
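+          # (git diff --quiet implies --exit-code: non-zero only when tracked file contents differ)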
+          if ! git diff --quiet requirements.txt poetry.lock; then
            echo "changes=true" >> $GITHUB_OUTPUT
          else
            echo "changes=false" >> $GITHUB_OUTPUT
diff --git a/CITATION.cff b/CITATION.cff
index a2f4cff..4c6b7e5 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,10 +1,10 @@
-cff-version: 0.1.1
+cff-version: 1.2.0
 message: "If you use this software, please cite it as below."
 authors:
 - family-names: "Lui"
   given-names: "Lok Hei"
   orcid: "https://orcid.org/0000-0001-5077-1530"
 title: "Dataverse Metadata Crawler"
-version: 0.1.1
+version: 0.1.2
 date-released: 2025-01-28
 url: "https://github.com/scholarsportal/dataverse-metadata-crawler"
diff --git a/README.md b/README.md
index f93d90d..460a188 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue)](https://opensource.org/license/mit)
 [![Dataverse](https://img.shields.io/badge/Dataverse-FFA500?)](https://dataverse.org/)
 [![Code Style: Black](https://img.shields.io/badge/code_style-black-black?)](https://github.com/psf/black)
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/scholarsportal/dataverse-metadata-crawler/main?urlpath=%2Fdoc%2Ftree%2Fexample.ipynb)

 # Dataverse Metadata Crawler
 ![Screencapture of the CLI tool](res/screenshot.png)
@@ -13,12 +14,17 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/
 1. Bulk metadata extraction from Dataverse repositories at any chosen level of collection (top level or selected collection)
 2. JSON & CSV file export options

-## 📦Prerequisites
-1. Git
-2. Python 3.10+
+## ☁️ Installation (Cloud - Slower)
+Click
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/scholarsportal/dataverse-metadata-crawler/main?urlpath=%2Fdoc%2Ftree%2Fexample.ipynb)
+to launch the crawler directly in your web browser. No Git or Python installation is required!

-## ⚙️Installation
+## ⚙️Installation (Local - Better performance)
+### 📦Prerequisites
+1. [Git](https://git-scm.com/)
+2. [Python 3.10+](https://www.python.org/)
+---
 1. Clone the repository
 ```sh
 git clone https://github.com/scholarsportal/dataverse-metadata-crawler.git
 ```
@@ -87,7 +93,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI
 | --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | |
 | --emptydv | -e | | Output a JSON file that stores all Dataverses which do **not** contain Datasets (though they might have child Dataverses which have Datasets). | |
 | --failed | -f | | Output a JSON file of Dataverses/Datasets that failed to be crawled. | |
-| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. | |
+| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets.<br>You may find the spreadsheet column explanation [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Explanation-of--Spreadsheet-Column-Headers). | |
 | --help | | | Show the help message. | |

 ### Examples
@@ -157,7 +163,7 @@ If you use this software in your work, please cite it using the following metada
 APA:
 ```
-Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.1) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
+Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.2) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
 ```

 BibTeX:
@@ -167,7 +173,7 @@ BibTeX:
   month = {jan},
   title = {Dataverse Metadata Crawler},
   url = {https://github.com/scholarsportal/dataverse-metadata-crawler},
-  version = {0.1.1},
+  version = {0.1.2},
   year = {2025}
 }
 ```
diff --git a/dvmeta/func.py b/dvmeta/func.py
index 7cc72c5..01ebf3a 100644
--- a/dvmeta/func.py
+++ b/dvmeta/func.py
@@ -52,7 +52,7 @@ def get_pids(read_dict: dict, config: dict) -> tuple:
     return empty_dv, write_dict


-def check_connection(config: dict) -> bool:
+def check_connection(config: dict) -> tuple[bool, bool]:
     """Check the connection to the dataverse repository.

     Args:
@@ -60,27 +60,36 @@ def check_connection(config: dict) -> bool:
         auth (bool): Check the connection with authentication

     Returns:
-        bool: True if the connection is successful, False otherwise
+        tuple[bool, bool]: True if the connection is successful, and True if
+            the connection is authenticated
     """
-    if config.get('API_KEY'):
-        url = f"{config['BASE_URL']}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1"  # noqa: E501
-        config['HEADERS'] = {'X-Dataverse-key': config['API_KEY']}
-        print('Checking the connection to the dataverse repository with authentication...\n')  # noqa: E501
-    else:
-        url = f"{config['BASE_URL']}/api/info/version"
-        config['HEADERS'] = {}
-        print('Checking the connection to the dataverse repository without authentication...\n')  # noqa: E501
+    base_url = config.get('BASE_URL')
+    api_key = config.get('API_KEY')
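+    # An unset key may arrive as the literal string 'none' (e.g. from a .env placeholder); treat it as absent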
+    auth_headers = {'X-Dataverse-key': api_key} if api_key and api_key.lower() != 'none' else {}
+    auth_url = f'{base_url}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1'  # noqa: E501
+    public_url = f'{base_url}/api/info/version'
+
     try:
         with HttpxClient(config) as httpx_client:
-            response = httpx_client.sync_get(url)
+            if auth_headers:
+                print('Checking the connection to the Dataverse repository with authentication...')
+                response = httpx_client.sync_get(auth_url)
+                if response and response.status_code == httpx_client.httpx_success_status:
+                    print(f'Connection to the dataverse repository {config["BASE_URL"]} is successful.\n')
+                    return True, True
+                print('Your API_KEY is invalid. The crawler will now fall back to an unauthenticated connection.\n')
+
+            # Attempt to connect to the repository without authentication
+            response = httpx_client.sync_get(public_url)
             if response and response.status_code == httpx_client.httpx_success_status:
-                print(f'Connection to the dataverse repository {config["BASE_URL"]} is successful.\n')  # noqa: E501
-                return True
-            print('Your API_KEY is invalid and therefore failed to connect to the dataverse repository. Please check your input.\n')  # noqa: E501
-            return False
+                print(f'Unauthenticated connection to the dataverse repository {config["BASE_URL"]} is successful. The script will continue crawling.\n')  # noqa: E501
+                return True, False
+            print(f'Failed to connect to the dataverse repository {config["BASE_URL"]}.\nExiting...\n')  # noqa: E501
+            return False, False
+
     except httpx.HTTPStatusError as e:
         print(f'Failed to connect to the dataverse repository {config["BASE_URL"]}: HTTP Error {e.response.status_code}\n')  # noqa: E501
-        return False
+        return False, False


 def version_type(value: str) -> str:
@@ -103,9 +112,7 @@ def version_type(value: str) -> str:
     if value in valid_special_versions or re.match(r'^\d+(\.\d+)?$', value):
         return value
     msg = f'Invalid value for --version: "{value}".\nMust be "draft", "latest", "latest-published", or a version number like "x" or "x.y".'  # noqa: E501
-    raise typer.BadParameter(
-        msg
-    )
+    raise typer.BadParameter(msg)


 def validate_spreadsheet(value: bool, dvdfds_metadata: bool) -> bool:
diff --git a/dvmeta/main.py b/dvmeta/main.py
index 2df929d..bcbee85 100644
--- a/dvmeta/main.py
+++ b/dvmeta/main.py
@@ -79,9 +79,6 @@ def main(
     start_time_obj, start_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time()
     print(f'Start time: {start_time_display}\n')

-    # Load the crawler
-    metadata_crawler = MetaDataCrawler(config)
-
     # Check if either dvdfds_matadata or permission is provided
     if not dvdfds_matadata and not permission:
         print(
@@ -90,13 +87,22 @@ def main(
         sys.exit(1)

     # Check if the authentication token is provided if the permission metadata is requested to be crawled
-    if permission and config.get('API_KEY') is None:
-        print('Error: Crawling permission metadata requires API Token. Please provide the API Token.\nExiting...')
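+    # The token may be absent (None) or the literal string 'None' when read from the .env file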
+    if permission and config.get('API_KEY') in (None, 'None'):
+        print('Error: Crawling permission metadata requires an API Token. Please provide the API Token.\nExiting...')
         sys.exit(1)

     # Check the connection to the dataverse repository
-    if not func.check_connection(config):
+    connection_status, auth_status = func.check_connection(config)
+    if not connection_status:
         sys.exit(1)
+    if not auth_status:
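+        # Invalid key: continue unauthenticated; permission metadata cannot be crawled without auth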
+        config['API_KEY'] = None
+        if permission:
+            print('[WARNING]: Crawling permission metadata requires a valid API Token. The script will skip crawling permission metadata.\n')
+            permission = False
+
+    # Initialize the crawler
+    metadata_crawler = MetaDataCrawler(config)

     # Crawl the collection tree metadata
     response = metadata_crawler.get_collections_tree(collection_alias)
diff --git a/dvmeta/metadatacrawler.py b/dvmeta/metadatacrawler.py
index 2d7dca5..0469170 100644
--- a/dvmeta/metadatacrawler.py
+++ b/dvmeta/metadatacrawler.py
@@ -21,7 +21,7 @@ class MetaDataCrawler:

     def __init__(self, config: dict) -> None:
         """Initialize the class with the configuration settings."""
-        self.config = config
+        self.config = self._define_headers(config)
         self.url_tree = f"{config['BASE_URL']}/api/info/metrics/tree?parentAlias={config['COLLECTION_ALIAS']}"
         self.http_success_status = 200
         self.url_dataverse = f"{config['BASE_URL']}/api/dataverses"
@@ -30,7 +30,27 @@ def __init__(self, config: dict) -> None:
         self.write_dict = {}
         self.failed_dict = []
         self.url = None
-        self.client = HttpxClient(config)
+        self.client = HttpxClient(self.config)
+
+    @staticmethod
+    def _define_headers(config: dict) -> dict:
+        """Define the headers for the HTTP request.
+
+        Args:
+            config (dict): Configuration dictionary
+
+        Returns:
+            dict: The configuration dictionary with the request headers stored under 'HEADERS'
+        """
+        headers = {'Accept': 'application/json'}
+
+        api_key = config.get('API_KEY')
+        if api_key and str(api_key).lower() != 'none':
+            headers['X-Dataverse-key'] = api_key
+
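+        # Note: this mutates the passed-in config, which is returned with 'HEADERS' attached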
+        config['HEADERS'] = headers
+
+        return config

     def _get_dataset_content_url(self, identifier: str) -> str:
         return f"{self.config['BASE_URL']}/api/datasets/:persistentId/versions/:{self.config['VERSION']}?persistentId={identifier}"  # noqa: E501
diff --git a/example.ipynb b/example.ipynb
new file mode 100644
index 0000000..6d7092e
--- /dev/null
+++ b/example.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 1: Setting environment variables\n",
+    "Replace the values inside the quotes for BASE_URL and API_KEY.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace the placeholders with your own values and run this cell to create a .env file\n",
+    "BASE_URL = 'TARGET_REPO_URL' # Base URL of the repository; e.g., \"https://demo.borealisdata.ca/\"\n",
+    "API_KEY = 'YOUR_API_KEY' # Found in your Dataverse account settings. Optional. Delete this line if you do not plan to use it.\n",
+    "\n",
+    "\n",
+    "# Write the .env file\n",
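+    "# If you deleted the API_KEY line above, only BASE_URL is written\n",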
+    "with open('.env', 'w', encoding='utf-8') as file:\n",
+    "    if locals().get('API_KEY') is None:\n",
+    "        file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n",
+    "    else:\n",
+    "        file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n",
+    "        file.write(f'API_KEY = \"{API_KEY}\"\\n')\n",
+    "    print('Successfully created the .env file!')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Step 2: Running the command line tool\n",
+    "The following cell runs the command line tool.\n",
+    "\n",
+    "**Configuration**:\n",
+    "1. Replace the COLLECTION_ALIAS with your desired value. See [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Guide:-How-to-find-the-COLLECTION_ALIAS-of-a-Dataverse-collection) for how to find your collection alias.\n",
+    "2. Replace the VERSION with your desired value. It can be 'draft', 'latest', 'latest-published', or a version number like 'x' or 'x.y' (e.g., '1.0').\n",
+    "3. Add the optional flags. See the following table for your reference:\n",
+    "    \n",
+    "\n",
+    "| **Option** | **Short** | **Type** | **Description** | **Default** |\n",
+    "|------------|-----------|----------|-----------------|-------------|\n",
+    "| --auth | -a | TEXT | Authentication token to access the Dataverse repository. | None |\n",
+    "| --log<br>--no-log | -l | | Output a log file.<br>Use `--no-log` to disable logging. | `log` (unless `--no-log`) |\n",
+    "| --dvdfds_metadata | -d | | Output a JSON file containing metadata of Dataverses, Datasets, and Data Files. | |\n",
+    "| --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | |\n",
+    "| --emptydv | -e | | Output a JSON file that stores all Dataverses which do **not** contain Datasets (though they might have child Dataverses which have Datasets). | |\n",
+    "| --failed | -f | | Output a JSON file of Dataverses/Datasets that failed to be crawled. | |\n",
+    "| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. | |\n",
+    "| --help | | | Show the help message. | |\n",
+    "\n",
+    "Examples:\n",
+    "1. Export the metadata of the latest version of datasets under collection 'demo' to JSON\n",
+    "\n",
+    "   `!python3 dvmeta/main.py -c demo -v latest -d`\n",
+    "\n",
+    "2. Export the metadata of version 1.0 of all datasets under collection 'demo' to JSON and CSV\n",
+    "\n",
+    "   `!python3 dvmeta/main.py -c demo -v 1.0 -d -s`\n",
+    "\n",
+    "3. Export the metadata and permission metadata of the latest-published version of all datasets under collection 'toronto' to JSON and CSV. Also export the empty dataverses and the datasets that failed to be crawled\n",
+    "\n",
+    "   `!python3 dvmeta/main.py -c toronto -v latest-published -d -s -p -e -f`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run the command line interface\n",
+    "# Replace 'COLLECTION_ALIAS' and 'VERSION' with your values\n",
+    "# Modify the flags as needed referring to the table above\n",
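+    "# Note: -p requires an API_KEY; if the key is invalid, the crawler falls back and skips permission metadata\n",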
+    "!python3 dvmeta/main.py -c 'COLLECTION_ALIAS' -v 'VERSION' -d -s -p -e -f"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pyproject.toml b/pyproject.toml
index 5f3d38a..d97644c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataverse-metadata-crawler"
-version = "0.1.1"
+version = "0.1.2"
 description = "A Python CLI tool for bulk extracting and exporting metadata from Dataverse repositories' collections to JSON and CSV formats."
 authors = ["Ken Lui "]
 license = "MIT"