diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml
index ded344b..685b151 100644
--- a/.github/workflows/poetry-export_dependencies.yml
+++ b/.github/workflows/poetry-export_dependencies.yml
@@ -38,7 +38,8 @@ jobs:
- name: Check for changes
id: check_changes
run: |
- if [[ -n "$(git status --porcelain requirements.txt poetry.lock)" ]]; then
+ # Use git diff to check actual content changes
+ if ! git diff --quiet requirements.txt poetry.lock; then
echo "changes=true" >> $GITHUB_OUTPUT
else
echo "changes=false" >> $GITHUB_OUTPUT
diff --git a/CITATION.cff b/CITATION.cff
index a2f4cff..4c6b7e5 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,10 +1,10 @@
-cff-version: 0.1.1
+cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Lui"
given-names: "Lok Hei"
orcid: "https://orcid.org/0000-0001-5077-1530"
title: "Dataverse Metadata Crawler"
-version: 0.1.1
+version: 0.1.2
date-released: 2025-01-28
url: "https://github.com/scholarsportal/dataverse-metadata-crawler"
diff --git a/README.md b/README.md
index f93d90d..460a188 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
[](https://opensource.org/license/mit)
[](https://dataverse.org/)
[](https://github.com/psf/black)
+[](https://mybinder.org/v2/gh/scholarsportal/dataverse-metadata-crawler/main?urlpath=%2Fdoc%2Ftree%2Fexample.ipynb)
# Dataverse Metadata Crawler

@@ -13,12 +14,17 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/
1. Bulk metadata extraction from Dataverse repositories at any chosen level of collection (top level or selected collection)
2. JSON & CSV file export options
-## 📦Prerequisites
-1. Git
-2. Python 3.10+
+## ☁️ Installation (Cloud - Slower)
+Click
+[](https://mybinder.org/v2/gh/scholarsportal/dataverse-metadata-crawler/main?urlpath=%2Fdoc%2Ftree%2Fexample.ipynb)
+to launch the crawler directly in your web browser—no Git or Python installation required!
-## ⚙️Installation
+## ⚙️Installation (Local - Better performance)
+### 📦Prerequisites
+1. [Git](https://git-scm.com/)
+2. [Python 3.10+](https://www.python.org/)
+---
1. Clone the repository
```sh
git clone https://github.com/scholarsportal/dataverse-metadata-crawler.git
@@ -87,7 +93,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI
| --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | |
| --emptydv | -e | | Output a JSON file that stores all Dataverses which do **not** contain Datasets (though they might have child Dataverses which have Datasets). | |
| --failed | -f | | Output a JSON file of Dataverses/Datasets that failed to be crawled. | |
-| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. | |
+| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. <br> You may find the spreadsheet column explanation [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Explanation-of--Spreadsheet-Column-Headers). | |
| --help | | | Show the help message. | |
### Examples
@@ -157,7 +163,7 @@ If you use this software in your work, please cite it using the following metada
APA:
```
-Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.1) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
+Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.2) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
```
BibTeX:
@@ -167,7 +173,7 @@ BibTeX:
month = {jan},
title = {Dataverse Metadata Crawler},
url = {https://github.com/scholarsportal/dataverse-metadata-crawler},
- version = {0.1.1},
+ version = {0.1.2},
year = {2025}
}
```
diff --git a/dvmeta/func.py b/dvmeta/func.py
index 7cc72c5..01ebf3a 100644
--- a/dvmeta/func.py
+++ b/dvmeta/func.py
@@ -52,7 +52,7 @@ def get_pids(read_dict: dict, config: dict) -> tuple:
return empty_dv, write_dict
-def check_connection(config: dict) -> bool:
+def check_connection(config: dict) -> tuple[bool, bool]:
"""Check the connection to the dataverse repository.
Args:
@@ -60,27 +60,36 @@ def check_connection(config: dict) -> bool:
auth (bool): Check the connection with authentication
Returns:
- bool: True if the connection is successful, False otherwise
+        tuple[bool, bool]: First flag is True if the connection succeeded;
+            second flag is True if it succeeded with authentication
"""
- if config.get('API_KEY'):
- url = f"{config['BASE_URL']}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1" # noqa: E501
- config['HEADERS'] = {'X-Dataverse-key': config['API_KEY']}
- print('Checking the connection to the dataverse repository with authentication...\n') # noqa: E501
- else:
- url = f"{config['BASE_URL']}/api/info/version"
- config['HEADERS'] = {}
- print('Checking the connection to the dataverse repository without authentication...\n') # noqa: E501
+ base_url = config.get('BASE_URL')
+ api_key = config.get('API_KEY')
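+    # The API key may arrive as the literal string 'None' (e.g., read from .env), so treat that as no key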
+ auth_headers = {'X-Dataverse-key': api_key} if api_key and api_key.lower() != 'none' else {}
+ auth_url = f'{base_url}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1' # noqa: E501
+ public_url = f'{base_url}/api/info/version'
+
try:
with HttpxClient(config) as httpx_client:
- response = httpx_client.sync_get(url)
+ if auth_headers:
+ print('Checking the connection to the Dataverse repository with authentication...')
+ response = httpx_client.sync_get(auth_url)
+ if response and response.status_code == httpx_client.httpx_success_status:
+ print(f'Connection to the dataverse repository {config["BASE_URL"]} is successful.\n')
+ return True, True
+                print('Your API_KEY is invalid. The crawler will now fall back to an unauthenticated connection.\n')
+
+ # Attempt to connect to the repository without authentication
+ response = httpx_client.sync_get(public_url)
if response and response.status_code == httpx_client.httpx_success_status:
- print(f'Connection to the dataverse repository {config["BASE_URL"]} is successful.\n') # noqa: E501
- return True
- print('Your API_KEY is invalid and therefore failed to connect to the dataverse repository. Please check your input.\n') # noqa: E501
- return False
+                print(f'Unauthenticated connection to the dataverse repository {config["BASE_URL"]} is successful. The script will continue crawling.\n') # noqa: E501
+ return True, False
+ print(f'Failed to connect to the dataverse repository {config["BASE_URL"]}.\nExiting...\n') # noqa: E501
+ return False, False
+
except httpx.HTTPStatusError as e:
print(f'Failed to connect to the dataverse repository {config["BASE_URL"]}: HTTP Error {e.response.status_code}\n') # noqa: E501
- return False
+ return False, False
def version_type(value: str) -> str:
@@ -103,9 +112,7 @@ def version_type(value: str) -> str:
if value in valid_special_versions or re.match(r'^\d+(\.\d+)?$', value):
return value
msg = f'Invalid value for --version: "{value}".\nMust be "draft", "latest", "latest-published", or a version number like "x" or "x.y".' # noqa: E501
- raise typer.BadParameter(
- msg
- )
+ raise typer.BadParameter(msg)
def validate_spreadsheet(value: bool, dvdfds_metadata: bool) -> bool:
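A minimal sketch of how the new two-flag return value is meant to be consumed; the call site in dvmeta/main.py (next file) does exactly this, and the config values here are placeholders:

```python
# Illustrative consumption of check_connection's (connected, authenticated) tuple.
# Placeholder values; the real config is assembled from .env settings.
from dvmeta import func

config = {'BASE_URL': 'https://demo.borealisdata.ca', 'API_KEY': 'possibly-invalid-token'}
connected, authenticated = func.check_connection(config)

if not connected:
    raise SystemExit(1)       # no connection at all
if not authenticated:
    config['API_KEY'] = None  # fall back to unauthenticated crawling
```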
diff --git a/dvmeta/main.py b/dvmeta/main.py
index 2df929d..bcbee85 100644
--- a/dvmeta/main.py
+++ b/dvmeta/main.py
@@ -79,9 +79,6 @@ def main(
start_time_obj, start_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time()
print(f'Start time: {start_time_display}\n')
- # Load the crawler
- metadata_crawler = MetaDataCrawler(config)
-
# Check if either dvdfds_matadata or permission is provided
if not dvdfds_matadata and not permission:
print(
@@ -90,13 +87,22 @@ def main(
sys.exit(1)
# Check if the authentication token is provided if the permission metadata is requested to be crawled
- if permission and config.get('API_KEY') is None:
- print('Error: Crawling permission metadata requires API Token. Please provide the API Token.\nExiting...')
+    if permission and (config.get('API_KEY') is None or config.get('API_KEY') == 'None'):
+        print('Error: Crawling permission metadata requires an API Token. Please provide the API Token.\nExiting...')
sys.exit(1)
# Check the connection to the dataverse repository
- if not func.check_connection(config):
+ connection_status, auth_status = func.check_connection(config)
+ if not connection_status:
sys.exit(1)
+ if not auth_status:
+ config['API_KEY'] = None
+ if permission:
+            print('[WARNING]: Crawling permission metadata requires a valid API Token. The script will skip crawling permission metadata.\n')
+ permission = False
+
+ # Initialize the crawler
+ metadata_crawler = MetaDataCrawler(config)
# Crawl the collection tree metadata
response = metadata_crawler.get_collections_tree(collection_alias)
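A note on the parenthesized guard above: Python binds `and` tighter than `or`, so without the parentheses the condition would trigger even when `--permission` was not requested. A short demonstration:

```python
# 'and' binds tighter than 'or', so the parentheses in the guard above matter.
permission, api_key = False, 'None'

unparenthesized = permission and api_key is None or api_key == 'None'    # parses as (permission and ...) or ...
parenthesized = permission and (api_key is None or api_key == 'None')    # only fires when permission is set

print(unparenthesized)  # True  -> would exit although --permission was never requested
print(parenthesized)    # False
```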
diff --git a/dvmeta/metadatacrawler.py b/dvmeta/metadatacrawler.py
index 2d7dca5..0469170 100644
--- a/dvmeta/metadatacrawler.py
+++ b/dvmeta/metadatacrawler.py
@@ -21,7 +21,7 @@ class MetaDataCrawler:
def __init__(self, config: dict) -> None:
"""Initialize the class with the configuration settings."""
- self.config = config
+ self.config = self._define_headers(config)
self.url_tree = f"{config['BASE_URL']}/api/info/metrics/tree?parentAlias={config['COLLECTION_ALIAS']}"
self.http_success_status = 200
self.url_dataverse = f"{config['BASE_URL']}/api/dataverses"
@@ -30,7 +30,27 @@ def __init__(self, config: dict) -> None:
self.write_dict = {}
self.failed_dict = []
self.url = None
- self.client = HttpxClient(config)
+ self.client = HttpxClient(self.config)
+
+ @staticmethod
+    def _define_headers(config: dict) -> dict:
+ """Define the headers for the HTTP request.
+
+ Args:
+ config (dict): Configuration dictionary
+
+ Returns:
+            dict: The configuration dictionary with the 'HEADERS' key set
+ """
+ headers = {'Accept': 'application/json'}
+
+ api_key = config.get('API_KEY')
+ if api_key and str(api_key).lower() != 'none':
+ headers['X-Dataverse-key'] = api_key
+
+ config['HEADERS'] = headers
+
+ return config
def _get_dataset_content_url(self, identifier: str) -> str:
return f"{self.config['BASE_URL']}/api/datasets/:persistentId/versions/:{self.config['VERSION']}?persistentId={identifier}" # noqa: E501
diff --git a/example.ipynb b/example.ipynb
new file mode 100644
index 0000000..6d7092e
--- /dev/null
+++ b/example.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Step 1: Setting environment variables\n",
+ "Replace the values inside the quotes for BASE_URL and API_KEY.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace the placeholders with your own values and run this script to create a .env file\n",
+ "BASE_URL = 'TARGET_REPO_URL' # Base URL of the repository; e.g., \"https://demo.borealisdata.ca/\"\n",
+ "API_KEY = 'YOUR_API_KEY' # Found in your Dataverse account settings. Optional. Delete this line if you plan not to use it.\n",
+ "\n",
+ "\n",
+ "# Write the .env file\n",
+ "with open('.env', 'w', encoding='utf-8') as file:\n",
+ " if locals().get('API_KEY') is None:\n",
+ " file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n",
+ " else:\n",
+ " file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n",
+ " file.write(f'API_KEY = \"{API_KEY}\"\\n')\n",
+ " print('Successfully created the .env file!')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Step 2: Running the command line tool\n",
+ "The following cell runs the comand line tool.\n",
+ "\n",
+ "**Configuration**:\n",
+ "1. Replace the COLLECTION_ALIAS with your desired value. See [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Guide:-How-to-find-the-COLLECTION_ALIAS-of-a-Dataverse-collection) for getting your collection alias.\n",
+ "2. Replace the VERSION with your desired value. It can either be 'latest', 'latest-published' or a version number 'x.y' (like '1.0')\n",
+ "3. Add the optional flags. See the following table for your reference:\n",
+ " \n",
+ "\n",
+ "| **Option** | **Short** | **Type** | **Description** | **Default** |\n",
+ "|----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|\n",
+ "| --auth | -a | TEXT | Authentication token to access the Dataverse repository.
| None |\n",
+ "| --log
--no-log | -l | | Output a log file.
Use `--no-log` to disable logging. | `log` (unless `--no-log`) |\n",
+ "| --dvdfds_metadata | -d | | Output a JSON file containing metadata of Dataverses, Datasets, and Data Files. | |\n",
+ "| --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | |\n",
+ "| --emptydv | -e | | Output a JSON file that stores all Dataverses which do **not** contain Datasets (though they might have child Dataverses which have Datasets). | |\n",
+ "| --failed | -f | | Output a JSON file of Dataverses/Datasets that failed to be crawled. | |\n",
+ "| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. | |\n",
+ "| --help | | | Show the help message. | |\n",
+ "\n",
+ "Example:\n",
+ "1. Export the metadata of latest version of datasets under collection 'demo' to JSON\n",
+ "\n",
+ " `!python3 dvmeta/main.py -c demo -v latest -d`\n",
+ "\n",
+ "2. Export the metadata of version 1.0 of all datasets under collection 'demo' to JSON and CSV\n",
+ "\n",
+ " `!python3 dvmeta/main.py -c demo -v 1.0 -d -s`\n",
+ "\n",
+ "3. Export the metadata and permission metadata of version latest-published of all datasets under collection 'toronto' to JSON and CSV. Also export the empty dataverses and datasets failed to be crawled\n",
+ "\n",
+ " `!python3 dvmeta/main.py -c toronto -v latest-published -d -s -p -e -f`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Run the command line interface\n",
+ "# Replace 'COLLECTION_ALIAS' and 'VERSION' with your values\n",
+ "# Modify the flags as needed referring to the table above\n",
+ "!python3 dvmeta/main.py -c 'COLLECTION_ALIAS' -v 'VERSION' -d -s -p -e -f"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
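Step 1 of the notebook writes a plain `KEY = "value"` style .env file. If you want to sanity-check it outside the notebook, a file in that shape is typically read back with python-dotenv (an assumption for illustration; the crawler's own settings loader may differ):

```python
# Read back the .env written by the notebook's Step 1 (assumes the python-dotenv package).
from dotenv import dotenv_values

env = dotenv_values('.env')  # e.g. {'BASE_URL': 'https://demo.borealisdata.ca/', 'API_KEY': '...'}
print(env.get('BASE_URL'))
print('API key set:', env.get('API_KEY') is not None)
```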
diff --git a/pyproject.toml b/pyproject.toml
index 5f3d38a..d97644c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dataverse-metadata-crawler"
-version = "0.1.1"
+version = "0.1.2"
description = "A Python CLI tool for bulk extracting and exporting metadata from Dataverse repositories' collections to JSON and CSV formats."
authors = ["Ken Lui "]
license = "MIT"