Skip to content

Commit

Permalink
Merge pull request #7 from kenlhlui/feature/mybinder
Browse files Browse the repository at this point in the history
Integrate Feature/mybinder branch
  • Loading branch information
kenlhlui authored Jan 31, 2025
2 parents 2ada64b + 6ce870d commit 1f802b8
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 27 deletions.
45 changes: 26 additions & 19 deletions dvmeta/func.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,35 +52,44 @@ def get_pids(read_dict: dict, config: dict) -> tuple:
return empty_dv, write_dict


def check_connection(config: dict) -> tuple[bool, bool]:
    """Check the connection to the dataverse repository.

    If a usable API key is configured, the authenticated ``mydata`` endpoint is
    probed first. When that probe does not succeed (or no key is configured),
    the public ``/api/info/version`` endpoint is probed instead.

    Args:
        config (dict): Configuration dictionary. ``BASE_URL`` and ``API_KEY`` are read from it.

    Returns:
        tuple[bool, bool]: ``(connected, authenticated)``.
            ``(True, True)``   - the authenticated probe succeeded.
            ``(True, False)``  - only the unauthenticated probe succeeded.
            ``(False, False)`` - no probe succeeded, or an ``httpx.HTTPStatusError`` was raised.
    """
    base_url = config.get('BASE_URL')
    api_key = config.get('API_KEY')
    # A key of None, '' or the literal text 'None' (any case) counts as "no key".
    # str() keeps the check safe for non-string values, matching MetaDataCrawler._define_headers.
    has_api_key = bool(api_key) and str(api_key).lower() != 'none'
    # NOTE(review): auth_headers only decides whether the authenticated probe runs; it is not
    # handed to the client here. Presumably HttpxClient derives its headers from config - confirm.
    auth_headers = {'X-Dataverse-key': api_key} if has_api_key else {}
    auth_url = f'{base_url}/api/mydata/retrieve?role_ids=8&dvobject_types=Dataverse&published_states=Published&per_page=1'  # noqa: E501
    public_url = f'{base_url}/api/info/version'

    try:
        with HttpxClient(config) as httpx_client:
            if auth_headers:
                print('Checking the connection to the Dataverse repository with authentication...')
                response = httpx_client.sync_get(auth_url)
                if response and response.status_code == httpx_client.httpx_success_status:
                    print(f'Connection to the dataverse repository {base_url} is successful.\n')
                    return True, True
                print('Your API_KEY is invalid. The crawler will now fall back using unauthenticated connection.\n')

            # Attempt to connect to the repository without authentication
            response = httpx_client.sync_get(public_url)
            if response and response.status_code == httpx_client.httpx_success_status:
                print(f'Unauthenticated connection to the dataverse repository {base_url} is successful. The script continue crawling.\n')  # noqa: E501
                return True, False
            print(f'Failed to connect to the dataverse repository {base_url}.\nExiting...\n')  # noqa: E501
            return False, False

    except httpx.HTTPStatusError as e:
        print(f'Failed to connect to the dataverse repository {base_url}: HTTP Error {e.response.status_code}\n')  # noqa: E501
        return False, False


def version_type(value: str) -> str:
Expand All @@ -103,9 +112,7 @@ def version_type(value: str) -> str:
if value in valid_special_versions or re.match(r'^\d+(\.\d+)?$', value):
return value
msg = f'Invalid value for --version: "{value}".\nMust be "draft", "latest", "latest-published", or a version number like "x" or "x.y".' # noqa: E501
raise typer.BadParameter(
msg
)
raise typer.BadParameter(msg)


def validate_spreadsheet(value: bool, dvdfds_metadata: bool) -> bool:
Expand Down
18 changes: 12 additions & 6 deletions dvmeta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,6 @@ def main(
start_time_obj, start_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time()
print(f'Start time: {start_time_display}\n')

# Load the crawler
metadata_crawler = MetaDataCrawler(config)

# Check if either dvdfds_matadata or permission is provided
if not dvdfds_matadata and not permission:
print(
Expand All @@ -90,13 +87,22 @@ def main(
sys.exit(1)

# Check if the authentication token is provided if the permission metadata is requested to be crawled
if permission and config.get('API_KEY') is None:
print('Error: Crawling permission metadata requires API Token. Please provide the API Token.\nExiting...')
if permission and config.get('API_KEY') is None or config.get('API_KEY') == 'None':
print('Error: Crawling permission metadata requires API Token. Please provide the API Token.Exiting...')
sys.exit(1)

# Check the connection to the dataverse repository
if not func.check_connection(config):
connection_status, auth_status = func.check_connection(config)
if not connection_status:
sys.exit(1)
if not auth_status:
config['API_KEY'] = None
if permission:
print('[WARNING]: Crawling permission metadata requires valid API Token. The script will skip crawling permission metadata\n')
permission = False

# Initialize the crawler
metadata_crawler = MetaDataCrawler(config)

# Crawl the collection tree metadata
response = metadata_crawler.get_collections_tree(collection_alias)
Expand Down
24 changes: 22 additions & 2 deletions dvmeta/metadatacrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class MetaDataCrawler:

def __init__(self, config: dict) -> None:
"""Initialize the class with the configuration settings."""
self.config = config
self.config = self._define_headers(config)
self.url_tree = f"{config['BASE_URL']}/api/info/metrics/tree?parentAlias={config['COLLECTION_ALIAS']}"
self.http_success_status = 200
self.url_dataverse = f"{config['BASE_URL']}/api/dataverses"
Expand All @@ -30,7 +30,27 @@ def __init__(self, config: dict) -> None:
self.write_dict = {}
self.failed_dict = []
self.url = None
self.client = HttpxClient(config)
self.client = HttpxClient(self.config)

@staticmethod
def _define_headers(config: dict) -> dict:
    """Attach the HTTP request headers to the configuration.

    The headers always contain ``Accept: application/json``. The
    ``X-Dataverse-key`` header is added only when ``API_KEY`` is set to a
    usable value (not ``None``, not empty, and not the literal text 'None'
    in any letter case).

    Args:
        config (dict): Configuration dictionary. It is modified in place: the
            headers are stored under the ``HEADERS`` key.

    Returns:
        dict: The same ``config`` object that was passed in, now carrying ``HEADERS``.
    """
    headers = {'Accept': 'application/json'}

    api_key = config.get('API_KEY')
    # str() guards against non-string values before the case-insensitive comparison.
    if api_key and str(api_key).lower() != 'none':
        headers['X-Dataverse-key'] = api_key

    config['HEADERS'] = headers

    return config

def _get_dataset_content_url(self, identifier: str) -> str:
    """Build the URL for one version of a dataset, addressed by persistent identifier.

    Args:
        identifier (str): Persistent identifier of the dataset; inserted as the
            ``persistentId`` query parameter without any escaping.

    Returns:
        str: URL composed from ``BASE_URL`` and ``VERSION`` in ``self.config``.
    """
    # NOTE(review): VERSION is always prefixed with ':' here - confirm numeric versions are expected in that form.
    return f"{self.config['BASE_URL']}/api/datasets/:persistentId/versions/:{self.config['VERSION']}?persistentId={identifier}"  # noqa: E501
Expand Down
103 changes: 103 additions & 0 deletions example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 1: Setting environment variables\n",
"Replace the values inside the quotes for BASE_URL and API_KEY.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Replace the placeholders with your own values and run this script to create a .env file\n",
"BASE_URL = 'TARGET_REPO_URL' # Base URL of the repository; e.g., \"https://demo.borealisdata.ca/\"\n",
"API_KEY = 'YOUR_API_KEY' # Found in your Dataverse account settings. Optional. Delete this line if you plan not to use it.\n",
"\n",
"\n",
"# Write the .env file\n",
"with open('.env', 'w', encoding='utf-8') as file:\n",
" if locals().get('API_KEY') is None:\n",
" file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n",
" else:\n",
" file.write(f'BASE_URL = \"{BASE_URL}\"\\n')\n",
" file.write(f'API_KEY = \"{API_KEY}\"\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 2: Running the command line tool\n",
"The following cell runs the command line tool.\n",
"\n",
"**Configuration**:\n",
"1. Replace the COLLECTION_ALIAS with your desired value. See [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Guide:-How-to-find-the-COLLECTION_ALIAS-of-a-Dataverse-collection) for getting your collection alias.\n",
"2. Replace the VERSION with your desired value. It can be 'draft', 'latest', 'latest-published', or a version number 'x' or 'x.y' (like '1.0').\n",
"3. Add the optional flags. See the following table for your reference:\n",
" \n",
"\n",
"| **Option** | **Short** | **Type** | **Description** | **Default** |\n",
"|----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|\n",
"| --auth | -a | TEXT | Authentication token to access the Dataverse repository. <br/> | None |\n",
"| --log <br/> --no-log | -l | | Output a log file. <br/> Use `--no-log` to disable logging. | `log` (unless `--no-log`) |\n",
"| --dvdfds_metadata | -d | | Output a JSON file containing metadata of Dataverses, Datasets, and Data Files. | |\n",
"| --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | |\n",
"| --emptydv | -e | | Output a JSON file that stores all Dataverses which do **not** contain Datasets (though they might have child Dataverses which have Datasets). | |\n",
"| --failed | -f | | Output a JSON file of Dataverses/Datasets that failed to be crawled. | |\n",
"| --spreadsheet | -s | | Output a CSV file of the metadata of Datasets. | |\n",
"| --help | | | Show the help message. | |\n",
"\n",
"Example:\n",
"1. Export the metadata of latest version of datasets under collection 'demo' to JSON\n",
"\n",
" `!python3 dvmeta/main.py -c demo -v latest -d`\n",
"\n",
"2. Export the metadata of version 1.0 of all datasets under collection 'demo' to JSON and CSV\n",
"\n",
" `!python3 dvmeta/main.py -c demo -v 1.0 -d -s`\n",
"\n",
"3. Export the metadata and permission metadata of version latest-published of all datasets under collection 'toronto' to JSON and CSV. Also export the empty dataverses and the datasets that failed to be crawled\n",
"\n",
" `!python3 dvmeta/main.py -c toronto -v latest-published -d -s -p -e -f`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the command line interface\n",
"# Replace 'COLLECTION_ALIAS' and 'VERSION' with your values\n",
"# Modify the flags as needed referring to the table above\n",
"!python3 dvmeta/main.py -c 'COLLECTION_ALIAS' -v 'VERSION' -d -s -p -e -f"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 1f802b8

Please sign in to comment.