From 833f8baee91625837608b64e0e4f0e147868f679 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:25:46 -0500 Subject: [PATCH 01/28] Create poetry-export_dependencies.yml Added CI/CD workflow for exporting requirements.txt when poetry.lock/requirements.txt/pyproject.toml is changed. --- .../workflows/poetry-export_dependencies.yml | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 .github/workflows/poetry-export_dependencies.yml diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml new file mode 100644 index 0000000..36306c8 --- /dev/null +++ b/.github/workflows/poetry-export_dependencies.yml @@ -0,0 +1,73 @@ +name: Poetry export requirements.txt +on: + push: + branches: + - '*' # Trigger on any push to any branch + paths: + - 'requirements.txt' + - 'pyproject.toml' + - 'poetry.lock' +jobs: + poetry-export_dependencies: + strategy: + fail-fast: false + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install poetry + uses: abatilo/actions-poetry@v4 + with: + poetry-version: 'latest' + - name: Install the poetry-plugin-export + run: poetry self add poetry-plugin-export + - name: Update poetry lock file + run: poetry lock + - name: Export the project dependencies to requirements.txt + run: | + poetry export -f requirements.txt --output requirements.txt + - name: Get branch name + shell: bash + run: echo "BRANCH_NAME=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV + - name: Check for changes + id: check_changes + run: | + if [[ -n "$(git status --porcelain requirements.txt poetry.lock)" ]]; then + echo "changes=true" >> $GITHUB_OUTPUT + else + echo "changes=false" >> $GITHUB_OUTPUT + fi + - name: Configure Git + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + - name: Commit and push if changed + if: steps.check_changes.outputs.changes == 'true' + run: | + # Pull with rebase to get latest changes + git pull --rebase origin ${{ env.BRANCH_NAME }} + + # Stage and commit changes + git add requirements.txt poetry.lock + git commit -m "chore: update requirements.txt and poetry.lock [skip ci]" + + # Push with retry logic + max_attempts=3 + attempt=1 + while [ $attempt -le $max_attempts ]; do + if git push origin ${{ env.BRANCH_NAME }}; then + break + else + if [ $attempt -eq $max_attempts ]; then + echo "Failed to push after $max_attempts attempts" + exit 1 + fi + echo "Push failed, attempt $attempt of $max_attempts. Pulling and retrying..." + git pull --rebase origin ${{ env.BRANCH_NAME }} + attempt=$((attempt + 1)) + fi + done + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From f23640f9badf95caabe07bcd2145a85fafc5ffcf Mon Sep 17 00:00:00 2001 From: Ken Lui Date: Mon, 20 Jan 2025 18:28:16 -0500 Subject: [PATCH 02/28] 1. Added GitHub Actions workflows for Jekyll deployment and Poetry dependency export 2. 
Updated CITATION.cff & README --- .github/workflows/jekyll-gh-pages.yml | 51 +++++++++++++ .../workflows/poetry-export_dependencies.yml | 73 +++++++++++++++++++ CITATION.cff | 4 +- README.md | 16 ++-- _config.yml | 19 +++++ 5 files changed, 154 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/jekyll-gh-pages.yml create mode 100644 .github/workflows/poetry-export_dependencies.yml create mode 100644 _config.yml diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml new file mode 100644 index 0000000..e31d81c --- /dev/null +++ b/.github/workflows/jekyll-gh-pages.yml @@ -0,0 +1,51 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll with GitHub Pages dependencies preinstalled + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml new file mode 100644 index 0000000..36306c8 --- /dev/null +++ b/.github/workflows/poetry-export_dependencies.yml @@ -0,0 +1,73 @@ +name: Poetry export requirements.txt +on: + push: + branches: + - '*' # Trigger on any push to any branch + paths: + - 'requirements.txt' + - 'pyproject.toml' + - 'poetry.lock' +jobs: + poetry-export_dependencies: + strategy: + fail-fast: false + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install poetry + uses: abatilo/actions-poetry@v4 + with: + poetry-version: 'latest' + - name: Install the poetry-plugin-export + run: poetry self add poetry-plugin-export + - name: Update poetry lock file + run: poetry lock + - name: Export the project dependencies to requirements.txt + run: | + poetry export -f requirements.txt --output requirements.txt + - name: Get branch name + shell: bash + run: echo "BRANCH_NAME=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV + - name: Check for changes + id: check_changes + run: | + if [[ -n "$(git status --porcelain requirements.txt poetry.lock)" ]]; then + echo "changes=true" >> $GITHUB_OUTPUT + else + echo "changes=false" >> $GITHUB_OUTPUT + fi + - name: Configure Git + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + - name: Commit and push if changed + if: steps.check_changes.outputs.changes == 'true' + run: | + # 
Pull with rebase to get latest changes + git pull --rebase origin ${{ env.BRANCH_NAME }} + + # Stage and commit changes + git add requirements.txt poetry.lock + git commit -m "chore: update requirements.txt and poetry.lock [skip ci]" + + # Push with retry logic + max_attempts=3 + attempt=1 + while [ $attempt -le $max_attempts ]; do + if git push origin ${{ env.BRANCH_NAME }}; then + break + else + if [ $attempt -eq $max_attempts ]; then + echo "Failed to push after $max_attempts attempts" + exit 1 + fi + echo "Push failed, attempt $attempt of $max_attempts. Pulling and retrying..." + git pull --rebase origin ${{ env.BRANCH_NAME }} + attempt=$((attempt + 1)) + fi + done + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CITATION.cff b/CITATION.cff index 4700932..97e309c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -4,7 +4,7 @@ authors: - family-names: "Lui" given-names: "Lok Hei" orcid: "https://orcid.org/0000-0001-5077-1530" -title: "Dataverse metadata Crawler" +title: "Dataverse Metadata Crawler" version: 0.1.0 date-released: 2025-01-16 -url: "https://github.com/kenlhlui/dataverse-metadata-crawler-p" \ No newline at end of file +url: "https://github.com/scholarsportal/dataverse-metadata-crawler" diff --git a/README.md b/README.md index cf4b45d..6189d31 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ 2. Change to the project directory ```sh - cd ~/dataverse-metadata-export-p + cd ./dataverse-metadata-crawler ``` 3. Create an environment file (.env) @@ -65,6 +65,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALIAS -v VERSION ``` **Required arguments:** + | **Option** | **Short** | **Type** | **Description** | **Default** | |--------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| | --collection_alias | -c | TEXT | Name of the collection to crawl.
**[required]** | None | @@ -72,6 +73,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI **Optional arguments:** + | **Option** | **Short** | **Type** | **Description** | **Default** | |----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| | --auth | -a | TEXT | Authentication token to access the Dataverse repository.
If | None | @@ -96,6 +98,7 @@ python3 dvmeta/main.py -c demo -v 1.0 -d -s -p -a xxxxxxxx-xxxx-xxxx-xxxx-xxxxxx ``` ## 📂Output Structure + | File | Description | |-------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| | ds_metadata_yyyymmdd-HHMMSS.json | Datasets' their data files' metadata in JSON format. | @@ -145,21 +148,20 @@ If you use this software in your work, please cite it using the following metada APA: ``` -Lui, L. H. (2025). Dataverse metadata Crawler (Version 0.1.0) [Computer software]. https://github.com/kenlhlui/dataverse-metadata-crawler-p +Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.0) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler ``` BibTeX: ``` -@software{Lui_Dataverse_metadata_Crawler_2025, +@software{Lui_Dataverse_Metadata_Crawler_2025, author = {Lui, Lok Hei}, month = jan, -title = {{Dataverse metadata Crawler}}, -url = {https://github.com/kenlhlui/dataverse-metadata-crawler-p}, +title = {{Dataverse Metadata Crawler}}, +url = {https://github.com/scholarsportal/dataverse-metadata-crawler}, version = {0.1.0}, year = {2025} } ``` ## ✍️Authors -Ken Lui - Data Curation Specialist, Map and Data Library, University of Toronto - kenlh.lui@utoronto.ca - +Ken Lui - Data Curation Specialist, Map and Data Library, University of Toronto - [kenlh.lui@utoronto.ca](mailto:kenlh.lui@utoronto.ca) diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..2217569 --- /dev/null +++ b/_config.yml @@ -0,0 +1,19 @@ +# Site settings +title: Dataverse Metadata Crawler +description: A Python CLI tool for extracting and exporting metadata from Dataverse repositories to JSON and CSV formats. +baseurl: "/dataverse-metadata-crawler" # Base URL (leave blank for root deployment) +url: "https://scholarsportal.github.io" # Your GitHub Pages URL + +remote_theme: pages-themes/primer +plugins: +- jekyll-remote-theme # add this line to the plugins list if you already have one +- jekyll-seo-tag # Required by primer theme + +# Markdown settings +markdown: kramdown +kramdown: + input: GFM # Enables GitHub Flavored Markdown (GFM) + +# Build settings +source: ./ +destination: ./_site From 4380e3cedd9cc5832379e3c1af79b59588f72595 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 12:48:03 -0500 Subject: [PATCH 03/28] 1. Updated gitignore. --- .gitignore | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..29e00d5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,177 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# exported_files folder +exported_files/ \ No newline at end of file From d16781c22c45f0f110c40a24302249eaaedddc06 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 13:13:01 -0500 Subject: [PATCH 04/28] Update README.md --- README.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 6189d31..965fe8d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) -[![Licnese: MIT](https://img.shields.io/badge/Licnese-MIT-blue)](https://opensource.org/license/mit) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue)](https://opensource.org/license/mit) [![Dataverse](https://img.shields.io/badge/Dataverse-FFA500?)](https://dataverse.org/) [![Code Style: Black](https://img.shields.io/badge/code_style-black-black?)](https://github.com/psf/black) @@ -7,10 +7,10 @@ ![Screencapture of the CLI tool](res/screenshot.png) ## 📜Description -A Python CLI tool for extracting and exporting metadata from [Dataverse](https://dataverse.org/) repositories. It supports bulk extraction of dataverses, datasets, and data file metadata from any chosen level of dataverse collection (whole Dataverse repository/sub-Dataverse), with flexible export options to JSON and CSV formats. +A Python CLI tool for extracting and exporting metadata from [Dataverse](https://dataverse.org/) repositories. It supports bulk extraction of dataverses, datasets, and data file metadata from any chosen level of dataverse collection (an entire Dataverse repository/sub-Dataverse), with flexible export options to JSON and CSV formats. ## ✨Features -1. Bulk metadata extraction from Dataverse repositories from any chosen level of collection (top level or selected collection) +1. Bulk metadata extraction from Dataverse repositories at any chosen level of collection (top level or selected collection) 2. JSON & CSV file export options ## 📦Prerequisites @@ -38,11 +38,11 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ notepad .env ``` -4. Configure environment file using your text editor at your choice +4. Configure the environment (.env) file using the text editor of your choice. ```sh # .env file BASE_URL = "TARGET_REPO_URL" # e.g., "https://demo.borealisdata.ca/" - API_KEY = "YOUR_API_KEY" # Find in your Dataverse account settings. You may also specify it in the CLI interface (with -a flag) + API_KEY = "YOUR_API_KEY" # Found in your Dataverse account settings. Can also be specified in the CLI interface using the -a flag. ``` 5. Set up virtual environment (recommended) @@ -68,7 +68,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI | **Option** | **Short** | **Type** | **Description** | **Default** | |--------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| -| --collection_alias | -c | TEXT | Name of the collection to crawl.
**[required]** | None | +| --collection_alias | -c | TEXT | The alias of the collection to crawl.
**[required]** | None | | --version | -v | TEXT | The Dataset version to crawl. Options include:
• `draft` - The draft version, if any
• `latest` - Either a draft (if exists) or the latest published version
• `latest-published` - The latest published version
• `x.y` - A specific version
**[required]** | None (required) | @@ -76,7 +76,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI | **Option** | **Short** | **Type** | **Description** | **Default** | |----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| -| --auth | -a | TEXT | Authentication token to access the Dataverse repository.
If | None | +| --auth | -a | TEXT | Authentication token to access the Dataverse repository.
| None | | --log
--no-log | -l | | Output a log file.
Use `--no-log` to disable logging. | `log` (unless `--no-log`) | | --dvdfds_metadata | -d | | Output a JSON file containing metadata of Dataverses, Datasets, and Data Files. | | | --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | | @@ -101,13 +101,13 @@ python3 dvmeta/main.py -c demo -v 1.0 -d -s -p -a xxxxxxxx-xxxx-xxxx-xxxx-xxxxxx | File | Description | |-------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| -| ds_metadata_yyyymmdd-HHMMSS.json | Datasets' their data files' metadata in JSON format. | +| ds_metadata_yyyymmdd-HHMMSS.json | Datasets representation & data files metadata in JSON format. | | empty_dv_yyyymmdd-HHMMSS.json | The id of empty dataverse(s) in list format. | | failed_metadata_uris_yyyymmdd-HHMMSS.json | The URIs (URL) of datasets failed to be downloaded. | | permission_dict_yyyymmdd-HHMMSS.json | The perission metadata of datasets with their dataset id. | | pid_dict_yyyymmdd-HHMMSS.json | Datasets' basic info with hierarchical information dictionary.Only exported if -p (permission) flag is used without -d (metadata) flag. | | pid_dict_dd_yyyymmdd-HHMMSS.json | The Hierarchical information of deaccessioned/draft datasets. | -| ds_metadata_yyyymmdd-HHMMSS.csv | Datasets' their data files' metadata in CSV format. | +| ds_metadata_yyyymmdd-HHMMSS.csv | Datasets and their data files' metadata in CSV format. | | log_yyyymmdd-HHMMSS.txt | Summary of the crawling work. | ```sh @@ -129,8 +129,8 @@ exported_files/ No tests have been written yet. Contributions welcome! ## 💻Development -1. Dependencies managment: [poetry](https://python-poetry.org/) - Update the pyproject.toml dependencies changes -2. Linter: [ruff](https://docs.astral.sh/ruff/) - Linting rules are outlined in the pyproject.toml +1. Dependencies managment: [poetry](https://python-poetry.org/) - Use `poetry` to manage dependencies and reflect changes in the `pyproject.toml` file. +2. Linter: [ruff](https://docs.astral.sh/ruff/) - Follow the linting rules outlined in the `pyproject.toml` file. ## 🙌Contributing 1. Fork the repository @@ -148,18 +148,18 @@ If you use this software in your work, please cite it using the following metada APA: ``` -Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.0) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler +Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.1) [Computer software]. 
https://github.com/scholarsportal/dataverse-metadata-crawler ``` BibTeX: ``` @software{Lui_Dataverse_Metadata_Crawler_2025, -author = {Lui, Lok Hei}, -month = jan, -title = {{Dataverse Metadata Crawler}}, -url = {https://github.com/scholarsportal/dataverse-metadata-crawler}, -version = {0.1.0}, -year = {2025} + author = {Lui, Lok Hei}, + month = {jan}, + title = {Dataverse Metadata Crawler}, + url = {https://github.com/scholarsportal/dataverse-metadata-crawler}, + version = {0.1.1}, + year = {2025} } ``` From d91a1dea5f33392bd381d05a52de4f02b35afc42 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 14:33:50 -0500 Subject: [PATCH 05/28] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 965fe8d..7ddda7b 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI | **Option** | **Short** | **Type** | **Description** | **Default** | |--------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| -| --collection_alias | -c | TEXT | The alias of the collection to crawl.
**[required]** | None | +| --collection_alias | -c | TEXT | The alias of the collection to crawl.
See the guide [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Guide:-How-to-find-the-COLLECTION_ALIAS-of-a-Dataverse-collection) to learn how to find the collection alias.
**[required]** | None | | --version | -v | TEXT | The Dataset version to crawl. Options include:
• `draft` - The draft version, if any
• `latest` - Either a draft (if exists) or the latest published version
• `latest-published` - The latest published version
• `x.y` - A specific version
**[required]** | None (required) | From d2915b781be9e0fd135abebb0cf705d55abc2ee4 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 14:35:16 -0500 Subject: [PATCH 06/28] Update CITATION.cff --- CITATION.cff | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 97e309c..1386c30 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,10 +1,10 @@ -cff-version: 0.1.0 +cff-version: 0.1.1 message: "If you use this software, please cite it as below." authors: - family-names: "Lui" given-names: "Lok Hei" orcid: "https://orcid.org/0000-0001-5077-1530" title: "Dataverse Metadata Crawler" -version: 0.1.0 -date-released: 2025-01-16 +version: 0.1.1 +date-released: 2025-01-23 url: "https://github.com/scholarsportal/dataverse-metadata-crawler" From df17fc0451b7dc46bf48360c6fa7828716ec4667 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:35:42 -0500 Subject: [PATCH 07/28] 1. Updated spreadsheet.py for inclusion of DF_Hierarchy, DF_Tags & DF_Description. 2. Updated res/spreadsheet_order.csv --- dvmeta/spreadsheet.py | 36 ++++++++++++++++++++++++++++++++++-- res/spreadsheet_order.csv | 7 +------ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index 826a8e1..822ff0e 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -177,7 +177,7 @@ def _get_dataset_subjects(dictionary: dict) -> dict: return result_dict @staticmethod - def _get_metadata_blocks(dictionary: dict) -> dict: + def _get_metadata_blocks_usage(dictionary: dict) -> dict: metadata_block_dict = { 'Meta_Geo': 'geospatial', 'Meta_SSHM': 'socialscience', @@ -194,6 +194,33 @@ def _get_metadata_blocks(dictionary: dict) -> dict: return result_dict + @staticmethod + def _get_datafile_meta_usage(dictionary: dict) -> dict: + # Get the use of data file directoryLabel (DF_Hierarchy), + # tags (categories; DF_Tags) & description (DF_Description). 
+ if dictionary.get('data', {}).get('files'): + file_nested_list = jmespath.search('data.files[*]', dictionary) + + # Get the count of directoryLabel if it is not None + directorylabel_count = len([file for file in file_nested_list if file.get('directoryLabel') is not None]) + + # Get the count of categories if it is not None + categories_count = len([ + file for file in file_nested_list + if file.get('dataFile', {}).get('categories') is not None + ]) + + # Get the count of description if it is not None + description_count = len([ + file for file in file_nested_list + if file.get('dataFile', {}).get('description') is not None + ]) + + return {'DF_Hierarchy': directorylabel_count, + 'DF_Tags': categories_count, + 'DF_Description': description_count} + return {'DF_Hierarchy': 0, 'DF_Tags': 0, 'DF_Description': 0} + def _get_spreadsheet_order(self) -> list[str]: with Path(self.spreadsheet_order_file_path).open(encoding='utf-8') as file: return file.read().splitlines() @@ -224,6 +251,11 @@ def make_csv(self, meta_dict: dict) -> tuple[str, str]: holding_list = [] for key, _value in meta_dict.items(): jmespath_dict: dict = jmespath.search(f'{self.search_string}', meta_dict[key]) + + # Get the use of data file hierarchy (folders, DF_Hierarchy), + # file tags (categories; DF_Tags) & description (DF_Description) + jmespath_dict.update(self._get_datafile_meta_usage(meta_dict[key])) + # Get the file size and count jmespath_dict['FileSize'] = self._get_data_files_size(meta_dict[key]) jmespath_dict['FileSize_normalized'] = convert_size(jmespath_dict['FileSize']) @@ -245,7 +277,7 @@ def make_csv(self, meta_dict: dict) -> tuple[str, str]: jmespath_dict.update(self._get_dataset_subjects(jmespath_dict)) # Get the metadata blocks and add them to the result dictionary - jmespath_dict.update(self._get_metadata_blocks(jmespath_dict)) + jmespath_dict.update(self._get_metadata_blocks_usage(jmespath_dict)) # Drop the versionNumber and versionMinorNumber keys from the dictionary jmespath_dict.pop('versionNumber', None) diff --git a/res/spreadsheet_order.csv b/res/spreadsheet_order.csv index a547b43..b5cff61 100644 --- a/res/spreadsheet_order.csv +++ b/res/spreadsheet_order.csv @@ -11,14 +11,11 @@ Version FileCount FileSize FileSize_normalized -FileFormat DataverseSubCollection License RestrictedFiles RequestAcces TermsAccess -TermsUse -Citationrequirements DF_Hierarchy DF_Tags DF_Description @@ -113,6 +110,4 @@ DS_Contrib DS_ContribPlus DS_Curator DS_FileDown -DS_Member -DS_UOFT_Admin -DS_Groups +DS_Member \ No newline at end of file From dbebd7cd4ebac4d0a06a03d27e1213631b37cc46 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:54:01 -0500 Subject: [PATCH 08/28] 1. Added CM_AltURL, CM_Agency, CM_ID, CM_CollectionEnd 2. 
Fixed CM_AuthorAff, CM_TimeEnd --- dvmeta/spreadsheet.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index 822ff0e..80fb291 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -33,8 +33,11 @@ def __init__(self, config: dict) -> None: versionMinorNumber: data.versionMinorNumber, CM_Subtitle: data.metadataBlocks.citation.fields[?typeName==`subtitle`].value|[] CM_AltTitle: data.metadataBlocks.citation.fields[?typeName==`alternativeTitle`].value|[] + CM_AltURL: data.metadataBlocks.citation.fields[?typeName==`alternativeURL`].value|[] + CM_Agency: data.metadataBlocks.citation.fields[?typeName==`otherId`].value|[*]|[].otherIdAgency.value + CM_ID: data.metadataBlocks.citation.fields[?typeName==`otherId`].value|[*]|[].otherIdValue.value CM_Author: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorName.value - CM_ContactAff: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorAffiliation.value + CM_AuthorAff: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorAffiliation.value CM_AuthorID: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorIdentifier.value CM_AuthorIDType: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorIdentifierScheme.value CM_ContactName: data.metadataBlocks.citation.fields[?typeName==`datasetContact`].value|[*]|[].datasetContactName.value @@ -74,8 +77,9 @@ def __init__(self, config: dict) -> None: CM_Depositor: data.metadataBlocks.citation.fields[?typeName==`depositor`].value|[] CM_DepositDate: data.metadataBlocks.citation.fields[?typeName==`dateOfDeposit`].value|[] CM_TimeStart: data.metadataBlocks.citation.fields[?typeName==`timePeriodCovered`].value|[].timePeriodCoveredStart.value - CM_TimeEnd: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionStart.value - CM_CollectionStart: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionEnd.value + CM_TimeEnd: data.metadataBlocks.citation.fields[?typeName==`timePeriodCovered`].value|[].timePeriodCoveredEnd.value + CM_CollectionStart: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionStart.value + CM_CollectionEnd: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionEnd.value CM_DataType: data.metadataBlocks.citation.fields[?typeName==`kindOfData`].value|[] CM_SeriesName: data.metadataBlocks.citation.fields[?typeName==`series`].value|[].seriesName.value CM_SeriesInfo: data.metadataBlocks.citation.fields[?typeName==`series`].value|[].seriesInformation.value From fbbe2b1ec8ce8db6418db304864869b9f428a7c5 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:54:45 -0500 Subject: [PATCH 09/28] Update .gitignore to include test.ipynb --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 29e00d5..56bdf5f 100644 --- a/.gitignore +++ b/.gitignore @@ -174,4 +174,7 @@ cython_debug/ .pypirc # exported_files folder -exported_files/ \ No newline at end of file +exported_files/ + +# test.ipynb +test.ipynb \ No newline at end of file From cb2254c8ad697c88e724433c6f1dd98b10fd8f11 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 23 Jan 2025 17:26:06 -0500 Subject: [PATCH 10/28] 1. 
Modularize the cm_metadata_holding_list creation & make_csv 2. Moved spreadsheet to the last section (to preprare for integrating permission_dict writing). --- dvmeta/main.py | 23 ++++++++++++++--------- dvmeta/spreadsheet.py | 27 +++++++++++++++++++++------ 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/dvmeta/main.py b/dvmeta/main.py index b587516..a0955bd 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -101,7 +101,12 @@ def main( sys.exit(1) # Crawl the collection tree metadata - collections_tree = metadata_crawler.get_collections_tree(collection_alias).json() + response = metadata_crawler.get_collections_tree(collection_alias) + if response is None: + print('Error: Failed to retrieve collections tree. The API request returned None.') + sys.exit(1) + + collections_tree = response.json() # Add collection id and alias to config if collections_tree['status'] == 'OK': @@ -148,7 +153,7 @@ async def main_crawler(): failed_metadata_uris = [] if dvdfds_matadata: # Export dataverse_contents - print('\nCrawling Representation and File metadata of datasets...\n') + print('Crawling Representation and File metadata of datasets...\n') pid_list = list(pid_dict) meta_dict, failed_metadata_uris = await metadata_crawler.get_datasets_meta(pid_list) @@ -190,13 +195,6 @@ async def main_crawler(): } ) - if spreadsheet: - # Export the metadata to a CSV file - csv_file_path, csv_file_checksum = Spreadsheet(config).make_csv(meta_dict) - json_file_checksum_dict.append( - {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum} - ) - if permission: print('\nCrawling Permission metadata of datasets...\n') ds_id_list = [item['ds_id'] for item in pid_dict.values()] @@ -232,6 +230,13 @@ async def main_crawler(): {'type': 'Empty Dataverses', 'path': empty_dv_json, 'checksum': empty_dv_checksum} ) + if spreadsheet: + # Export the metadata to a CSV file + csv_file_path, csv_file_checksum = Spreadsheet(config).make_csv_file(meta_dict) + json_file_checksum_dict.append( + {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum} + ) + return meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten = asyncio.run(main_crawler()) diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index 80fb291..f8fc0c1 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -243,14 +243,14 @@ def _reoder_df_columns(self, df: pd.DataFrame) -> pd.DataFrame: return df[final_column_order] - def make_csv(self, meta_dict: dict) -> tuple[str, str]: - """Create a CSV file from the metadata dictionary. + def _make_cm_meta_holding_list(self, meta_dict: dict) -> list[dict]: + """Create a nested list of metadata dictionaries. Args: - meta_dict (dict): Metadata dictionary + meta_dict (dict): Dataset metadata dictionary. Returns: - tuple[str, str]: Path to the CSV file, Checksum of the CSV file + list[dict]: List of metadata dictionaries (nested) """ holding_list = [] for key, _value in meta_dict.items(): @@ -292,9 +292,24 @@ def make_csv(self, meta_dict: dict) -> tuple[str, str]: holding_list.append(jmespath_dict) - df = pd.DataFrame(holding_list) + return holding_list + + def make_csv_file(self, meta_dict: dict) -> tuple[str, str]: + """Create a CSV file from the nested metadata list. 
+ + Args: + meta_dict (dict): Dataset metadata dictionary + + Returns: + tuple[str, str]: Path to the CSV file, Checksum of the CSV file + """ + # Create a DataFrame from the nested list + + cm_meta_holding_list = self._make_cm_meta_holding_list(meta_dict) + + df = pd.DataFrame(cm_meta_holding_list) - # Reoder the columns in the DataFrame + # Reoder the columns in the DataFrame according to to the preset order (/res/spreadsheet_order.csv) df = self._reoder_df_columns(df) # Create the CSV file From 956664d11887fc6f198eb93b8be657c58829fa0c Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Fri, 24 Jan 2025 18:22:18 -0500 Subject: [PATCH 11/28] 1. Added GitHub Actions workflows for Jekyll deployment and Poetry dependency export 2. Updated CITATION.cff & README 3. Updated merging representation and permission metadata into one dictionary (& JSON file). 4. Updated valid spreadsheet order fileds. --- .github/workflows/jekyll-gh-pages.yml | 51 ++++++++++ .gitignore | 5 +- CITATION.cff | 10 +- README.md | 46 ++++----- _config.yml | 19 ++++ dvmeta/func.py | 72 +++++++++++--- dvmeta/main.py | 134 ++++++++++++++++---------- dvmeta/spreadsheet.py | 73 +++++++++++--- res/spreadsheet_order.csv | 7 +- 9 files changed, 308 insertions(+), 109 deletions(-) create mode 100644 .github/workflows/jekyll-gh-pages.yml create mode 100644 _config.yml diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml new file mode 100644 index 0000000..e31d81c --- /dev/null +++ b/.github/workflows/jekyll-gh-pages.yml @@ -0,0 +1,51 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll with GitHub Pages dependencies preinstalled + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index 29e00d5..56bdf5f 100644 --- a/.gitignore +++ b/.gitignore @@ -174,4 +174,7 @@ cython_debug/ .pypirc # exported_files folder -exported_files/ \ No newline at end of file +exported_files/ + +# test.ipynb +test.ipynb \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff index 4700932..1386c30 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,10 +1,10 @@ -cff-version: 0.1.0 +cff-version: 0.1.1 message: "If you use this software, please cite it as below." 
authors: - family-names: "Lui" given-names: "Lok Hei" orcid: "https://orcid.org/0000-0001-5077-1530" -title: "Dataverse metadata Crawler" -version: 0.1.0 -date-released: 2025-01-16 -url: "https://github.com/kenlhlui/dataverse-metadata-crawler-p" \ No newline at end of file +title: "Dataverse Metadata Crawler" +version: 0.1.1 +date-released: 2025-01-23 +url: "https://github.com/scholarsportal/dataverse-metadata-crawler" diff --git a/README.md b/README.md index cf4b45d..7ddda7b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) -[![Licnese: MIT](https://img.shields.io/badge/Licnese-MIT-blue)](https://opensource.org/license/mit) +[![License: MIT](https://img.shields.io/badge/License-MIT-blue)](https://opensource.org/license/mit) [![Dataverse](https://img.shields.io/badge/Dataverse-FFA500?)](https://dataverse.org/) [![Code Style: Black](https://img.shields.io/badge/code_style-black-black?)](https://github.com/psf/black) @@ -7,10 +7,10 @@ ![Screencapture of the CLI tool](res/screenshot.png) ## 📜Description -A Python CLI tool for extracting and exporting metadata from [Dataverse](https://dataverse.org/) repositories. It supports bulk extraction of dataverses, datasets, and data file metadata from any chosen level of dataverse collection (whole Dataverse repository/sub-Dataverse), with flexible export options to JSON and CSV formats. +A Python CLI tool for extracting and exporting metadata from [Dataverse](https://dataverse.org/) repositories. It supports bulk extraction of dataverses, datasets, and data file metadata from any chosen level of dataverse collection (an entire Dataverse repository/sub-Dataverse), with flexible export options to JSON and CSV formats. ## ✨Features -1. Bulk metadata extraction from Dataverse repositories from any chosen level of collection (top level or selected collection) +1. Bulk metadata extraction from Dataverse repositories at any chosen level of collection (top level or selected collection) 2. JSON & CSV file export options ## 📦Prerequisites @@ -26,7 +26,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ 2. Change to the project directory ```sh - cd ~/dataverse-metadata-export-p + cd ./dataverse-metadata-crawler ``` 3. Create an environment file (.env) @@ -38,11 +38,11 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ notepad .env ``` -4. Configure environment file using your text editor at your choice +4. Configure the environment (.env) file using the text editor of your choice. ```sh # .env file BASE_URL = "TARGET_REPO_URL" # e.g., "https://demo.borealisdata.ca/" - API_KEY = "YOUR_API_KEY" # Find in your Dataverse account settings. You may also specify it in the CLI interface (with -a flag) + API_KEY = "YOUR_API_KEY" # Found in your Dataverse account settings. Can also be specified in the CLI interface using the -a flag. ``` 5. 
Set up virtual environment (recommended) @@ -65,16 +65,18 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALIAS -v VERSION ``` **Required arguments:** + | **Option** | **Short** | **Type** | **Description** | **Default** | |--------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| -| --collection_alias | -c | TEXT | Name of the collection to crawl.
**[required]** | None | +| --collection_alias | -c | TEXT | The alias of the collection to crawl.
See the guide [here](https://github.com/scholarsportal/dataverse-metadata-crawler/wiki/Guide:-How-to-find-the-COLLECTION_ALIAS-of-a-Dataverse-collection) to learn how to find the collection alias.
**[required]** | None | | --version | -v | TEXT | The Dataset version to crawl. Options include:
• `draft` - The draft version, if any
• `latest` - Either a draft (if exists) or the latest published version
• `latest-published` - The latest published version
• `x.y` - A specific version
**[required]** | None (required) | **Optional arguments:** + | **Option** | **Short** | **Type** | **Description** | **Default** | |----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| -| --auth | -a | TEXT | Authentication token to access the Dataverse repository.
If | None | +| --auth | -a | TEXT | Authentication token to access the Dataverse repository.
| None | | --log
--no-log | -l | | Output a log file.
Use `--no-log` to disable logging. | `log` (unless `--no-log`) | | --dvdfds_metadata | -d | | Output a JSON file containing metadata of Dataverses, Datasets, and Data Files. | | | --permission | -p | | Output a JSON file that stores permission metadata for all Datasets in the repository. | | @@ -96,15 +98,16 @@ python3 dvmeta/main.py -c demo -v 1.0 -d -s -p -a xxxxxxxx-xxxx-xxxx-xxxx-xxxxxx ``` ## 📂Output Structure + | File | Description | |-------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| -| ds_metadata_yyyymmdd-HHMMSS.json | Datasets' their data files' metadata in JSON format. | +| ds_metadata_yyyymmdd-HHMMSS.json | Datasets representation & data files metadata in JSON format. | | empty_dv_yyyymmdd-HHMMSS.json | The id of empty dataverse(s) in list format. | | failed_metadata_uris_yyyymmdd-HHMMSS.json | The URIs (URL) of datasets failed to be downloaded. | | permission_dict_yyyymmdd-HHMMSS.json | The perission metadata of datasets with their dataset id. | | pid_dict_yyyymmdd-HHMMSS.json | Datasets' basic info with hierarchical information dictionary.Only exported if -p (permission) flag is used without -d (metadata) flag. | | pid_dict_dd_yyyymmdd-HHMMSS.json | The Hierarchical information of deaccessioned/draft datasets. | -| ds_metadata_yyyymmdd-HHMMSS.csv | Datasets' their data files' metadata in CSV format. | +| ds_metadata_yyyymmdd-HHMMSS.csv | Datasets and their data files' metadata in CSV format. | | log_yyyymmdd-HHMMSS.txt | Summary of the crawling work. | ```sh @@ -126,8 +129,8 @@ exported_files/ No tests have been written yet. Contributions welcome! ## 💻Development -1. Dependencies managment: [poetry](https://python-poetry.org/) - Update the pyproject.toml dependencies changes -2. Linter: [ruff](https://docs.astral.sh/ruff/) - Linting rules are outlined in the pyproject.toml +1. Dependencies managment: [poetry](https://python-poetry.org/) - Use `poetry` to manage dependencies and reflect changes in the `pyproject.toml` file. +2. Linter: [ruff](https://docs.astral.sh/ruff/) - Follow the linting rules outlined in the `pyproject.toml` file. ## 🙌Contributing 1. Fork the repository @@ -145,21 +148,20 @@ If you use this software in your work, please cite it using the following metada APA: ``` -Lui, L. H. (2025). Dataverse metadata Crawler (Version 0.1.0) [Computer software]. https://github.com/kenlhlui/dataverse-metadata-crawler-p +Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.1) [Computer software]. 
https://github.com/scholarsportal/dataverse-metadata-crawler ``` BibTeX: ``` -@software{Lui_Dataverse_metadata_Crawler_2025, -author = {Lui, Lok Hei}, -month = jan, -title = {{Dataverse metadata Crawler}}, -url = {https://github.com/kenlhlui/dataverse-metadata-crawler-p}, -version = {0.1.0}, -year = {2025} +@software{Lui_Dataverse_Metadata_Crawler_2025, + author = {Lui, Lok Hei}, + month = {jan}, + title = {Dataverse Metadata Crawler}, + url = {https://github.com/scholarsportal/dataverse-metadata-crawler}, + version = {0.1.1}, + year = {2025} } ``` ## ✍️Authors -Ken Lui - Data Curation Specialist, Map and Data Library, University of Toronto - kenlh.lui@utoronto.ca - +Ken Lui - Data Curation Specialist, Map and Data Library, University of Toronto - [kenlh.lui@utoronto.ca](mailto:kenlh.lui@utoronto.ca) diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..2217569 --- /dev/null +++ b/_config.yml @@ -0,0 +1,19 @@ +# Site settings +title: Dataverse Metadata Crawler +description: A Python CLI tool for extracting and exporting metadata from Dataverse repositories to JSON and CSV formats. +baseurl: "/dataverse-metadata-crawler" # Base URL (leave blank for root deployment) +url: "https://scholarsportal.github.io" # Your GitHub Pages URL + +remote_theme: pages-themes/primer +plugins: +- jekyll-remote-theme # add this line to the plugins list if you already have one +- jekyll-seo-tag # Required by primer theme + +# Markdown settings +markdown: kramdown +kramdown: + input: GFM # Enables GitHub Flavored Markdown (GFM) + +# Build settings +source: ./ +destination: ./_site diff --git a/dvmeta/func.py b/dvmeta/func.py index 8784d83..759a437 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -1,6 +1,7 @@ """This module contains functions used in the dvmeta package.""" import os import re +from typing import Optional import httpx import jmespath @@ -26,21 +27,21 @@ def get_pids(read_dict: dict, config: dict) -> tuple: write_dict = {} for key, _item in read_dict.items(): result = jmespath.search( - "data[?type=='dataset'].{ds_id: id, protocol: protocol, authority: authority, identifier: identifier, path: path, path_ids: path_ids}", # noqa: E501 + "data[?type=='dataset'].{id: id, protocol: protocol, authority: authority, identifier: identifier, path: path, path_ids: path_ids}", # noqa: E501 read_dict[key], # noqa: PLR1733 ) if result: for item in result: pid = f"{item['protocol']}:{item['authority']}/{item['identifier']}" - ds_id = item['ds_id'] + id = item['id'] path = '/' + item['path'] if item['path'] else None path_ids = item['path_ids'] dict_to_append = { - str(pid): { # pid needs to be converted to string if it's not already + str(id): { # pid needs to be converted to string if it's not already 'collection_alias': config['COLLECTION_ALIAS'], 'collection_id': config['COLLECTION_ID'], 'pid': pid, - 'ds_id': ds_id, + 'id': id, 'path': path, 'path_ids': path_ids, } @@ -160,15 +161,41 @@ def add_path_to_dataverse_contents(des_dict: dict, ref_dict: dict) -> dict: return des_dict -def add_path_info(meta_dict: dict, pid_dict: dict) -> tuple: - """Add path_info to the metadata dictionary.""" - pid_dict_copy = pid_dict.copy() - for key in list(pid_dict_copy.keys()): - if key in meta_dict: - meta_dict[key]['path_info'] = pid_dict_copy[key] - pid_dict_copy.pop(key) +def add_path_info(meta_dict: dict, ds_dict: dict) -> tuple: + """Add path_info to the metadata dictionary, handling nested structures.""" + ds_dict_copy = ds_dict.copy() + for pid_key, pid_value in list(ds_dict_copy.items()): + 
pid_key_str = str(pid_key) + # Traverse the meta_dict to find matching datasetId + for _meta_key, meta_value in meta_dict.items(): + if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId') == int(pid_key_str): + # Add path_info to the appropriate nested dictionary + meta_value['path_info'] = pid_value + # Remove from ds_dict_copy + ds_dict_copy.pop(pid_key) + break - return meta_dict, pid_dict_copy + return meta_dict, ds_dict_copy + + +def add_perrmission_info(meta_dict: dict, permission_dict: Optional[dict] = None) -> tuple: + """Add permission_info to the metadata dictionary, handling nested structures.""" + if isinstance(permission_dict, dict): + permission_dict_copy = permission_dict.copy() + for pid_key, pid_value in list(permission_dict_copy.items()): + pid_key_str = str(pid_key) + # Traverse the meta_dict to find matching datasetId + for _meta_key, meta_value in meta_dict.items(): + if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId') == int(pid_key_str): + # Add path_info to the appropriate nested dictionary + meta_value['permission_info'] = pid_value + # Remove from permission_dict_copy + permission_dict_copy.pop(pid_key) + break + + return meta_dict, permission_dict_copy + + return meta_dict, None def load_env() -> dict: @@ -190,3 +217,24 @@ def load_env() -> dict: else: config['HEADERS'] = {'Accept': 'application/json'} return config + + +def replace_key_with_dataset_id(dictionary: dict) -> dict: + """Replace the top-level key in the dictionary with the value of 'datasetId' in the nested 'data'. + + Args: + dictionary (dict): The original dictionary. + + Returns: + dict: A new dictionary with keys replaced by the value of 'datasetId'. + """ + new_dict = {} + for old_key, value in dictionary.items(): + # Check if the 'data' key exists and has 'id' + if isinstance(value, dict) and value.get('data', {}).get('datasetId'): + new_key = value.get('data', {}).get('datasetId') # Get the value of 'datasetId' + new_dict[new_key] = value # Use it as the new key + else: + # Keep the original key if 'id' is missing + new_dict[old_key] = value + return new_dict diff --git a/dvmeta/main.py b/dvmeta/main.py index b587516..4f307a5 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -101,7 +101,12 @@ def main( sys.exit(1) # Crawl the collection tree metadata - collections_tree = metadata_crawler.get_collections_tree(collection_alias).json() + response = metadata_crawler.get_collections_tree(collection_alias) + if response is None: + print('Error: Failed to retrieve collections tree. 
The API request returned None.') + sys.exit(1) + + collections_tree = response.json() # Add collection id and alias to config if collections_tree['status'] == 'OK': @@ -120,9 +125,10 @@ def main( async def main_crawler(): # Initialize empty dict and list to store metadata - pid_dict = {'pid': []} + ds_dict = {'pid': []} failed_metadata_ids = [] json_file_checksum_dict = [] + permission_dict = {} # Flatten the collections tree collections_tree_flatten = utils.flatten_collection(collections_tree) @@ -140,43 +146,48 @@ async def main_crawler(): # Add path_ids and path to dataverse_contents from collections_tree_flatten dataverse_contents = func.add_path_to_dataverse_contents(dataverse_contents, collections_tree_flatten) - # Get URIs in collections_tree_flatten and append them to pid_dict, and return empty dataverse to empty_dv - empty_dv_dict, pid_dict = func.get_pids(dataverse_contents, config) + # Get URIs in collections_tree_flatten and append them to ds_dict, and return empty dataverse to empty_dv + empty_dv_dict, ds_dict = func.get_pids(dataverse_contents, config) # Optional arguments meta_dict = {} failed_metadata_uris = [] if dvdfds_matadata: # Export dataverse_contents - print('\nCrawling Representation and File metadata of datasets...\n') - pid_list = list(pid_dict) + print('Crawling Representation and File metadata of datasets...\n') + pid_list = [item['pid'] for item in ds_dict.values()] meta_dict, failed_metadata_uris = await metadata_crawler.get_datasets_meta(pid_list) + # Replace the key with the Data #TEMPORARY FIX + meta_dict = func.replace_key_with_dataset_id(meta_dict) + # Add the path_info to the metadata - meta_dict, pid_dict_dd = func.add_path_info(meta_dict, pid_dict) + meta_dict, pid_dict_dd = func.add_path_info(meta_dict, ds_dict) - # Export the metadata to a JSON file - meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict') - json_file_checksum_dict.append( - { - 'type': 'Dataset Metadata (Representation & File)', - 'path': meta_json_file_path, - 'checksum': meta_json_checksum, - } - ) - print( - f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n' - ) + if not permission: # Delay the merging of permission metadata until the permission metadata is crawled - # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file - pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd') - json_file_checksum_dict.append( - { - 'type': 'Hierarchical Information of Datasets(deaccessioned/draft)', - 'path': pid_dict_json, - 'checksum': pid_dict_checksum, - } - ) + # Export the metadata to a JSON file + meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict') + json_file_checksum_dict.append( + { + 'type': 'Dataset Metadata (Representation & File)', + 'path': meta_json_file_path, + 'checksum': meta_json_checksum, + } + ) + print( + f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n' + ) + + # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file + pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd') + json_file_checksum_dict.append( + { + 'type': 'Hierarchical Information of Datasets(deaccessioned/draft)', + 'path': pid_dict_json, + 'checksum': pid_dict_checksum, + } + ) if failed: failed_metadata_uris_json, failed_metadata_uris_checksum = utils.orjson_export( @@ -190,34 +201,29 @@ async 
def main_crawler(): } ) - if spreadsheet: - # Export the metadata to a CSV file - csv_file_path, csv_file_checksum = Spreadsheet(config).make_csv(meta_dict) - json_file_checksum_dict.append( - {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum} - ) - if permission: print('\nCrawling Permission metadata of datasets...\n') - ds_id_list = [item['ds_id'] for item in pid_dict.values()] + ds_id_list = [item['id'] for item in ds_dict.values()] permission_dict, failed_permission_uris = await (metadata_crawler.get_datasets_permissions(ds_id_list)) - permission_json_file_path, permission_json_checksum = utils.orjson_export( - permission_dict, 'permission_dict' - ) - json_file_checksum_dict.append( - { - 'type': 'Dataset Metadata (Permission)', - 'path': permission_json_file_path, - 'checksum': permission_json_checksum, - } - ) - print( - f'Successfully crawled permission metadata for {utils.count_key(permission_dict)} datasets in total.\n' - ) - # Export the pid_dict to a JSON file, if dfdfds_metadata is not provided - if not dvdfds_matadata: - pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict, 'pid_dict') + if not dvdfds_matadata: # Delay the merging of permission metadata until the representation/file metadata is crawled + # Export the permission metadata to a JSON file + permission_json_file_path, permission_json_checksum = utils.orjson_export( + permission_dict, 'permission_dict' + ) + json_file_checksum_dict.append( + { + 'type': 'Dataset Metadata (Permission)', + 'path': permission_json_file_path, + 'checksum': permission_json_checksum, + } + ) + print( + f'Successfully crawled permission metadata for {utils.count_key(permission_dict)} datasets in total.\n' + ) + + # Export the pid_dict to a JSON file, if dfdfds_metadata is not provided + pid_dict_json, pid_dict_checksum = utils.orjson_export(ds_dict, 'pid_dict') json_file_checksum_dict.append( { 'type': 'Hierarchical Information of Datasets', @@ -226,12 +232,35 @@ async def main_crawler(): } ) + # Combine the metadata and permission metadata + if dvdfds_matadata and permission: + if isinstance(permission_dict, dict): + meta_dict = func.add_perrmission_info(meta_dict, permission_dict)[0] + + # Export the metadata to a JSON file + + meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission') + json_file_checksum_dict.append( + { + 'type': 'Dataset Metadata (Representation, File & Permission)', + 'path': meta_json_file_path, + 'checksum': meta_json_checksum, + } + ) + if empty_dv: empty_dv_json, empty_dv_checksum = utils.orjson_export(empty_dv_dict, 'empty_dv') json_file_checksum_dict.append( {'type': 'Empty Dataverses', 'path': empty_dv_json, 'checksum': empty_dv_checksum} ) + if spreadsheet: + # Export the metadata to a CSV file + csv_file_path, csv_file_checksum = Spreadsheet(config).make_csv_file(meta_dict) + json_file_checksum_dict.append( + {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum} + ) + return meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten = asyncio.run(main_crawler()) @@ -255,5 +284,6 @@ async def main_crawler(): failed_metadata_uris, json_file_checksum_dict) + if __name__ == '__main__': app() diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index 826a8e1..f8fc0c1 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -33,8 +33,11 @@ def __init__(self, config: dict) 
-> None: versionMinorNumber: data.versionMinorNumber, CM_Subtitle: data.metadataBlocks.citation.fields[?typeName==`subtitle`].value|[] CM_AltTitle: data.metadataBlocks.citation.fields[?typeName==`alternativeTitle`].value|[] + CM_AltURL: data.metadataBlocks.citation.fields[?typeName==`alternativeURL`].value|[] + CM_Agency: data.metadataBlocks.citation.fields[?typeName==`otherId`].value|[*]|[].otherIdAgency.value + CM_ID: data.metadataBlocks.citation.fields[?typeName==`otherId`].value|[*]|[].otherIdValue.value CM_Author: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorName.value - CM_ContactAff: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorAffiliation.value + CM_AuthorAff: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorAffiliation.value CM_AuthorID: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorIdentifier.value CM_AuthorIDType: data.metadataBlocks.citation.fields[?typeName==`author`].value|[*]|[].authorIdentifierScheme.value CM_ContactName: data.metadataBlocks.citation.fields[?typeName==`datasetContact`].value|[*]|[].datasetContactName.value @@ -74,8 +77,9 @@ def __init__(self, config: dict) -> None: CM_Depositor: data.metadataBlocks.citation.fields[?typeName==`depositor`].value|[] CM_DepositDate: data.metadataBlocks.citation.fields[?typeName==`dateOfDeposit`].value|[] CM_TimeStart: data.metadataBlocks.citation.fields[?typeName==`timePeriodCovered`].value|[].timePeriodCoveredStart.value - CM_TimeEnd: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionStart.value - CM_CollectionStart: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionEnd.value + CM_TimeEnd: data.metadataBlocks.citation.fields[?typeName==`timePeriodCovered`].value|[].timePeriodCoveredEnd.value + CM_CollectionStart: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionStart.value + CM_CollectionEnd: data.metadataBlocks.citation.fields[?typeName==`dateOfCollection`].value|[].dateOfCollectionEnd.value CM_DataType: data.metadataBlocks.citation.fields[?typeName==`kindOfData`].value|[] CM_SeriesName: data.metadataBlocks.citation.fields[?typeName==`series`].value|[].seriesName.value CM_SeriesInfo: data.metadataBlocks.citation.fields[?typeName==`series`].value|[].seriesInformation.value @@ -177,7 +181,7 @@ def _get_dataset_subjects(dictionary: dict) -> dict: return result_dict @staticmethod - def _get_metadata_blocks(dictionary: dict) -> dict: + def _get_metadata_blocks_usage(dictionary: dict) -> dict: metadata_block_dict = { 'Meta_Geo': 'geospatial', 'Meta_SSHM': 'socialscience', @@ -194,6 +198,33 @@ def _get_metadata_blocks(dictionary: dict) -> dict: return result_dict + @staticmethod + def _get_datafile_meta_usage(dictionary: dict) -> dict: + # Get the use of data file directoryLabel (DF_Hierarchy), + # tags (categories; DF_Tags) & description (DF_Description). 
+ if dictionary.get('data', {}).get('files'): + file_nested_list = jmespath.search('data.files[*]', dictionary) + + # Get the count of directoryLabel if it is not None + directorylabel_count = len([file for file in file_nested_list if file.get('directoryLabel') is not None]) + + # Get the count of categories if it is not None + categories_count = len([ + file for file in file_nested_list + if file.get('dataFile', {}).get('categories') is not None + ]) + + # Get the count of description if it is not None + description_count = len([ + file for file in file_nested_list + if file.get('dataFile', {}).get('description') is not None + ]) + + return {'DF_Hierarchy': directorylabel_count, + 'DF_Tags': categories_count, + 'DF_Description': description_count} + return {'DF_Hierarchy': 0, 'DF_Tags': 0, 'DF_Description': 0} + def _get_spreadsheet_order(self) -> list[str]: with Path(self.spreadsheet_order_file_path).open(encoding='utf-8') as file: return file.read().splitlines() @@ -212,18 +243,23 @@ def _reoder_df_columns(self, df: pd.DataFrame) -> pd.DataFrame: return df[final_column_order] - def make_csv(self, meta_dict: dict) -> tuple[str, str]: - """Create a CSV file from the metadata dictionary. + def _make_cm_meta_holding_list(self, meta_dict: dict) -> list[dict]: + """Create a nested list of metadata dictionaries. Args: - meta_dict (dict): Metadata dictionary + meta_dict (dict): Dataset metadata dictionary. Returns: - tuple[str, str]: Path to the CSV file, Checksum of the CSV file + list[dict]: List of metadata dictionaries (nested) """ holding_list = [] for key, _value in meta_dict.items(): jmespath_dict: dict = jmespath.search(f'{self.search_string}', meta_dict[key]) + + # Get the use of data file hierarchy (folders, DF_Hierarchy), + # file tags (categories; DF_Tags) & description (DF_Description) + jmespath_dict.update(self._get_datafile_meta_usage(meta_dict[key])) + # Get the file size and count jmespath_dict['FileSize'] = self._get_data_files_size(meta_dict[key]) jmespath_dict['FileSize_normalized'] = convert_size(jmespath_dict['FileSize']) @@ -245,7 +281,7 @@ def make_csv(self, meta_dict: dict) -> tuple[str, str]: jmespath_dict.update(self._get_dataset_subjects(jmespath_dict)) # Get the metadata blocks and add them to the result dictionary - jmespath_dict.update(self._get_metadata_blocks(jmespath_dict)) + jmespath_dict.update(self._get_metadata_blocks_usage(jmespath_dict)) # Drop the versionNumber and versionMinorNumber keys from the dictionary jmespath_dict.pop('versionNumber', None) @@ -256,9 +292,24 @@ def make_csv(self, meta_dict: dict) -> tuple[str, str]: holding_list.append(jmespath_dict) - df = pd.DataFrame(holding_list) + return holding_list + + def make_csv_file(self, meta_dict: dict) -> tuple[str, str]: + """Create a CSV file from the nested metadata list. 
+ + Args: + meta_dict (dict): Dataset metadata dictionary + + Returns: + tuple[str, str]: Path to the CSV file, Checksum of the CSV file + """ + # Create a DataFrame from the nested list + + cm_meta_holding_list = self._make_cm_meta_holding_list(meta_dict) + + df = pd.DataFrame(cm_meta_holding_list) - # Reoder the columns in the DataFrame + # Reoder the columns in the DataFrame according to to the preset order (/res/spreadsheet_order.csv) df = self._reoder_df_columns(df) # Create the CSV file diff --git a/res/spreadsheet_order.csv b/res/spreadsheet_order.csv index a547b43..b5cff61 100644 --- a/res/spreadsheet_order.csv +++ b/res/spreadsheet_order.csv @@ -11,14 +11,11 @@ Version FileCount FileSize FileSize_normalized -FileFormat DataverseSubCollection License RestrictedFiles RequestAcces TermsAccess -TermsUse -Citationrequirements DF_Hierarchy DF_Tags DF_Description @@ -113,6 +110,4 @@ DS_Contrib DS_ContribPlus DS_Curator DS_FileDown -DS_Member -DS_UOFT_Admin -DS_Groups +DS_Member \ No newline at end of file From e09856538fb042604e8a8f8703765ca50d1acca5 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Sat, 25 Jan 2025 03:13:28 -0500 Subject: [PATCH 12/28] 1. Integrated permission metadata into spreadsheet --- dvmeta/func.py | 4 ++++ dvmeta/spreadsheet.py | 33 ++++++++++++++++++++++++++++++++- res/spreadsheet_order.csv | 5 +++-- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/dvmeta/func.py b/dvmeta/func.py index 759a437..2db69cc 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -192,6 +192,10 @@ def add_perrmission_info(meta_dict: dict, permission_dict: Optional[dict] = None # Remove from permission_dict_copy permission_dict_copy.pop(pid_key) break + for _meta_key, meta_value in meta_dict.items(): + if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId'): + if 'permission_info' not in meta_value: + meta_value['permission_info'] = {'status': 'NA', 'data': []} return meta_dict, permission_dict_copy diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index f8fc0c1..1467136 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -19,6 +19,7 @@ def __init__(self, config: dict) -> None: self.config = config self.search_string = """{ DatasetTitle: data.metadataBlocks.citation.fields[?typeName==`title`].value|[] + DS_Path: path_info.path DatasetPersistentId: data.datasetPersistentId, ID: data.id, DatasetId: data.datasetId, @@ -92,7 +93,14 @@ def __init__(self, config: dict) -> None: CM_OriginSources: data.metadataBlocks.citation.fields[?typeName==`originOfSources`].value|[] CM_CharSources: data.metadataBlocks.citation.fields[?typeName==`characteristicOfSources`].value|[] CM_DocSources: data.metadataBlocks.citation.fields[?typeName==`accessToSources`].value|[] - DataverseSubCollection: path_info.path + DS_Permission: permission_info.data + DS_Collab: length(permission_info.data) + DS_Admin: length(permission_info.data[?_roleAlias=='admin']) + DS_Contrib: length(permission_info.data[?_roleAlias=='contributor']) + DS_ContribPlus: length(permission_info.data[?_roleAlias=='fullContributor']) + DS_Curator: length(permission_info.data[?_roleAlias=='curator']) + DS_FileDown: length(permission_info.data[?_roleAlias=='fileDownloader']) + DS_Member: length(permission_info.data[?_roleAlias=='member']) }""" # noqa: E501 self.csv_file_dir = DirManager().csv_files_dir() self.spreadsheet_order_file_path = Path(DirManager().res_dir) / 'spreadsheet_order.csv' @@ -225,6 +233,24 @@ def 
_get_datafile_meta_usage(dictionary: dict) -> dict: 'DF_Description': description_count} return {'DF_Hierarchy': 0, 'DF_Tags': 0, 'DF_Description': 0} + @staticmethod + def _parse_permission_values(dictionary: dict) -> dict | None: + """Parse the NA value to permission_info.data, if the value is not available.""" + if dictionary.get('permission_info', {}).get('status', {}) == 'NA': + # If the status is NA, set the DS_Permission, DS_Collab, DS_Admin, DS_Contrib + # DS_ContribPlus, DS_Curator, DS_FileDown, DS_Member to NA + return { + 'DS_Permission': False, + 'DS_Collab': 'NA', + 'DS_Admin': 'NA', + 'DS_Contrib': 'NA', + 'DS_ContribPlus': 'NA', + 'DS_Curator': 'NA', + 'DS_FileDown': 'NA', + 'DS_Member': 'NA' + } + return {'DS_Permission': True} + def _get_spreadsheet_order(self) -> list[str]: with Path(self.spreadsheet_order_file_path).open(encoding='utf-8') as file: return file.read().splitlines() @@ -287,6 +313,9 @@ def _make_cm_meta_holding_list(self, meta_dict: dict) -> list[dict]: jmespath_dict.pop('versionNumber', None) jmespath_dict.pop('versionMinorNumber', None) + # Update the permission info if the status is NA + jmespath_dict.update(self._parse_permission_values(meta_dict[key]) or {}) + # Last step: Turn the lists in the dictionary into strings jmespath_dict = {key: list_to_string(value) if isinstance(value, list) else value for key, value in jmespath_dict.items()} @@ -307,6 +336,8 @@ def make_csv_file(self, meta_dict: dict) -> tuple[str, str]: cm_meta_holding_list = self._make_cm_meta_holding_list(meta_dict) + + df = pd.DataFrame(cm_meta_holding_list) # Reoder the columns in the DataFrame according to to the preset order (/res/spreadsheet_order.csv) diff --git a/res/spreadsheet_order.csv b/res/spreadsheet_order.csv index b5cff61..b784c9a 100644 --- a/res/spreadsheet_order.csv +++ b/res/spreadsheet_order.csv @@ -1,7 +1,8 @@ DatasetTitle DatasetURL -DatasetPersistentId +DS_Path ID +DatasetPersistentId DatasetId VersionState LastUpdateTime @@ -11,7 +12,6 @@ Version FileCount FileSize FileSize_normalized -DataverseSubCollection License RestrictedFiles RequestAcces @@ -101,6 +101,7 @@ Meta_Astro Meta_LS Meta_Journal Meta_CWF +DS_Permission DS_Collab DS_Collab_In DS_Collab_Ex From 00d74b310626428a45e006b6d5ce6768c4506e5a Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Sat, 25 Jan 2025 03:18:13 -0500 Subject: [PATCH 13/28] 1. Added TermsOfUse field. 
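For context, the new column is filled by the same jmespath multiselect hash that drives the rest of the spreadsheet. Below is a minimal sketch of that lookup against an invented payload; the sample values are illustrative only and are not taken from any repository.

```python
# Toy payload: shows how `TermsOfUse: data.termsOfUse` style entries in the
# search string resolve against a dataset-version JSON document.
import jmespath

sample = {
    'data': {
        'termsOfUse': 'CC0 waiver applies.',
        'termsOfAccess': 'Contact the depositor for restricted files.',
    }
}

expression = '{TermsOfUse: data.termsOfUse, TermsAccess: data.termsOfAccess}'
print(jmespath.search(expression, sample))
# {'TermsOfUse': 'CC0 waiver applies.',
#  'TermsAccess': 'Contact the depositor for restricted files.'}
```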
--- dvmeta/spreadsheet.py | 3 ++- res/spreadsheet_order.csv | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index 1467136..39ee74d 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -28,8 +28,9 @@ def __init__(self, config: dict) -> None: ReleaseTime: data.releaseTime, CreateTime: data.createTime, License: data.license.name - TermsAccess: data.termsOfAccess + TermsOfUse: data.termsOfUse RequestAcces: data.fileAccessRequest + TermsAccess: data.termsOfAccess versionNumber: data.versionNumber, versionMinorNumber: data.versionMinorNumber, CM_Subtitle: data.metadataBlocks.citation.fields[?typeName==`subtitle`].value|[] diff --git a/res/spreadsheet_order.csv b/res/spreadsheet_order.csv index b784c9a..4e90cbf 100644 --- a/res/spreadsheet_order.csv +++ b/res/spreadsheet_order.csv @@ -14,6 +14,7 @@ FileSize FileSize_normalized License RestrictedFiles +TermsOfUse RequestAcces TermsAccess DF_Hierarchy From cf968f578e82d9c62c70bc82d1c4bc86e3d2d7ab Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Mon, 27 Jan 2025 10:56:06 -0500 Subject: [PATCH 14/28] 1. Foramting changes --- dvmeta/spreadsheet.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dvmeta/spreadsheet.py b/dvmeta/spreadsheet.py index 39ee74d..d954e44 100644 --- a/dvmeta/spreadsheet.py +++ b/dvmeta/spreadsheet.py @@ -337,8 +337,6 @@ def make_csv_file(self, meta_dict: dict) -> tuple[str, str]: cm_meta_holding_list = self._make_cm_meta_holding_list(meta_dict) - - df = pd.DataFrame(cm_meta_holding_list) # Reoder the columns in the DataFrame according to to the preset order (/res/spreadsheet_order.csv) From 1d4591db1371dfceaa79d5a101d142ec01f7a514 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:13:20 -0500 Subject: [PATCH 15/28] 1. 
Unify the use of `datasetId` acorss reading native API & search API --- dvmeta/func.py | 8 ++++---- dvmeta/main.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dvmeta/func.py b/dvmeta/func.py index 2db69cc..ba489e5 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -27,13 +27,13 @@ def get_pids(read_dict: dict, config: dict) -> tuple: write_dict = {} for key, _item in read_dict.items(): result = jmespath.search( - "data[?type=='dataset'].{id: id, protocol: protocol, authority: authority, identifier: identifier, path: path, path_ids: path_ids}", # noqa: E501 + "data[?type=='dataset'].{datasetId: id, protocol: protocol, authority: authority, identifier: identifier, path: path, path_ids: path_ids}", # noqa: E501 read_dict[key], # noqa: PLR1733 ) if result: for item in result: pid = f"{item['protocol']}:{item['authority']}/{item['identifier']}" - id = item['id'] + id = item['datasetId'] path = '/' + item['path'] if item['path'] else None path_ids = item['path_ids'] dict_to_append = { @@ -41,7 +41,7 @@ def get_pids(read_dict: dict, config: dict) -> tuple: 'collection_alias': config['COLLECTION_ALIAS'], 'collection_id': config['COLLECTION_ID'], 'pid': pid, - 'id': id, + 'datasetId': id, 'path': path, 'path_ids': path_ids, } @@ -234,7 +234,7 @@ def replace_key_with_dataset_id(dictionary: dict) -> dict: """ new_dict = {} for old_key, value in dictionary.items(): - # Check if the 'data' key exists and has 'id' + # Check if the 'data' key exists and has 'datasetId' if isinstance(value, dict) and value.get('data', {}).get('datasetId'): new_key = value.get('data', {}).get('datasetId') # Get the value of 'datasetId' new_dict[new_key] = value # Use it as the new key diff --git a/dvmeta/main.py b/dvmeta/main.py index 4f307a5..a24d4e0 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -203,7 +203,7 @@ async def main_crawler(): if permission: print('\nCrawling Permission metadata of datasets...\n') - ds_id_list = [item['id'] for item in ds_dict.values()] + ds_id_list = [item['datasetId'] for item in ds_dict.values()] permission_dict, failed_permission_uris = await (metadata_crawler.get_datasets_permissions(ds_id_list)) if not dvdfds_matadata: # Delay the merging of permission metadata until the representation/file metadata is crawled From 6abf1a1933d1156f61a8448b37c33f1638663c78 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:25:19 -0500 Subject: [PATCH 16/28] 1. Changed path_ids to pathIds. 2. Changed collection_alias to CollectionAlias 3. 
Changed pid to datasetPersistentId --- dvmeta/func.py | 18 +++++++++--------- dvmeta/main.py | 6 +++--- dvmeta/utils.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dvmeta/func.py b/dvmeta/func.py index ba489e5..52347bd 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -27,7 +27,7 @@ def get_pids(read_dict: dict, config: dict) -> tuple: write_dict = {} for key, _item in read_dict.items(): result = jmespath.search( - "data[?type=='dataset'].{datasetId: id, protocol: protocol, authority: authority, identifier: identifier, path: path, path_ids: path_ids}", # noqa: E501 + "data[?type=='dataset'].{datasetId: id, protocol: protocol, authority: authority, identifier: identifier, path: path, pathIds: pathIds}", # noqa: E501 read_dict[key], # noqa: PLR1733 ) if result: @@ -35,15 +35,15 @@ def get_pids(read_dict: dict, config: dict) -> tuple: pid = f"{item['protocol']}:{item['authority']}/{item['identifier']}" id = item['datasetId'] path = '/' + item['path'] if item['path'] else None - path_ids = item['path_ids'] + path_ids = item['pathIds'] dict_to_append = { str(id): { # pid needs to be converted to string if it's not already - 'collection_alias': config['COLLECTION_ALIAS'], - 'collection_id': config['COLLECTION_ID'], - 'pid': pid, + 'CollectionAlias': config['COLLECTION_ALIAS'], + 'CollectionID': config['COLLECTION_ID'], + 'datasetPersistentId': pid, 'datasetId': id, 'path': path, - 'path_ids': path_ids, + 'pathIds': path_ids, } } write_dict.update(dict_to_append) @@ -140,7 +140,7 @@ def count_files_size(read_dict: dict) -> tuple: def add_path_to_dataverse_contents(des_dict: dict, ref_dict: dict) -> dict: - """Add path_ids and path to dataverse_contents from collections_tree_flatten. + """Add pathIds and path to dataverse_contents from collections_tree_flatten. Args: des_dict (dict): Dictionary containing the metadata of datasets @@ -154,10 +154,10 @@ def add_path_to_dataverse_contents(des_dict: dict, ref_dict: dict) -> dict: if value['data']: for item in value['data']: item.update({'path': ref_dict[key]['path']}) - item.update({'path_ids': ref_dict[key]['path_ids']}) + item.update({'pathIds': ref_dict[key]['pathIds']}) else: value['data'].append({'path': ref_dict[key]['path']}) - value['data'].append({'path_ids': ref_dict[key]['path_ids']}) + value['data'].append({'pathIds': ref_dict[key]['pathIds']}) return des_dict diff --git a/dvmeta/main.py b/dvmeta/main.py index a24d4e0..20499c0 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -125,7 +125,7 @@ def main( async def main_crawler(): # Initialize empty dict and list to store metadata - ds_dict = {'pid': []} + ds_dict = {'datasetPersistentId': []} failed_metadata_ids = [] json_file_checksum_dict = [] permission_dict = {} @@ -143,7 +143,7 @@ async def main_crawler(): print('Getting basic metadata of datasets in across dataverses (incl. 
all children)...\n') dataverse_contents, failed_dataverse_contents = await metadata_crawler.get_dataverse_contents(collection_id_list) - # Add path_ids and path to dataverse_contents from collections_tree_flatten + # Add pathIds and path to dataverse_contents from collections_tree_flatten dataverse_contents = func.add_path_to_dataverse_contents(dataverse_contents, collections_tree_flatten) # Get URIs in collections_tree_flatten and append them to ds_dict, and return empty dataverse to empty_dv @@ -155,7 +155,7 @@ async def main_crawler(): if dvdfds_matadata: # Export dataverse_contents print('Crawling Representation and File metadata of datasets...\n') - pid_list = [item['pid'] for item in ds_dict.values()] + pid_list = [item['datasetPersistentId'] for item in ds_dict.values()] meta_dict, failed_metadata_uris = await metadata_crawler.get_datasets_meta(pid_list) # Replace the key with the Data #TEMPORARY FIX diff --git a/dvmeta/utils.py b/dvmeta/utils.py index e2c1ab1..732cef3 100644 --- a/dvmeta/utils.py +++ b/dvmeta/utils.py @@ -169,7 +169,7 @@ def loop_item(dictionary_data, path_name='', path_ids=[]): current_path_ids = path_ids + [item['id']] - new_item['path_ids'] = current_path_ids + new_item['pathIds'] = current_path_ids new_item['path'] = f"{path_name}/{item['name']}" if path_name else item['name'] new_item.pop('children', None) write_dict[item['id']] = new_item From bde6f3e4ccac4b07b8024165509c695ad4b957bc Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Mon, 27 Jan 2025 23:01:54 -0500 Subject: [PATCH 17/28] 1. Updated CLI description 2. Updated minor syntax --- dvmeta/main.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dvmeta/main.py b/dvmeta/main.py index 20499c0..89389f1 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -64,15 +64,13 @@ def main( False, '--spreadsheet', '-s', help='Output a CSV file of the metadata of datasets' ), ): - """A command line utility that crawls a dataverse repository, extracting metadata for dataverses, datasets, and permissions, and then stores it in JSON format.""" - # Load the environment variables #! This need to be modified as it nullifies the auth token provided by the user + """A Python CLI tool for extracting and exporting metadata from Dataverse repositories to JSON and CSV formats.""" + # Load the environment variables config: dict = func.load_env() config['COLLECTION_ALIAS'] = collection_alias config['VERSION'] = version - config['API_KEY'] = ( - auth if auth else config['API_KEY'] - ) # Reassign the API_KEY and replace it specified in the .env file + config['API_KEY'] = (auth if auth else config['API_KEY']) # Reassign the API_KEY and replace it specified in the .env file, if provided in the CLI interface # Check if -s flag is provided without -d flag func.validate_spreadsheet(spreadsheet, dvdfds_matadata) From 4d0ade56b2d052034aaacb9f968bcf3df4a58603 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Mon, 27 Jan 2025 23:49:58 -0500 Subject: [PATCH 18/28] 1. Hotfix for failed_metadata_dict parsing, handling error request. 
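In rough terms, the crawler now distinguishes three outcomes per request instead of two. A simplified sketch of the classification follows; it is not the module itself, and the `[url, 'Error']` sentinel shape is the one introduced in the diff below.

```python
# Simplified: a 200 response is stored under its persistent ID, any other
# HTTP response is recorded by URL with its status code, and a transport
# error surfaces as the [url, 'Error'] sentinel returned by the client.
import httpx

HTTP_SUCCESS = 200

def classify(item, dataset_meta: dict, failed: dict) -> None:
    if isinstance(item, httpx.Response) and item.status_code == HTTP_SUCCESS and item.json():
        pid = item.json().get('data', {}).get('datasetPersistentId')
        dataset_meta[pid] = item.json()
    elif isinstance(item, httpx.Response):
        failed[str(item.url)] = item.status_code
    elif isinstance(item, list):  # the [url, 'Error'] sentinel
        failed[item[0]] = item[1]
```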
--- dvmeta/httpxclient.py | 6 +++--- dvmeta/metadatacrawler.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dvmeta/httpxclient.py b/dvmeta/httpxclient.py index c353684..8812173 100644 --- a/dvmeta/httpxclient.py +++ b/dvmeta/httpxclient.py @@ -63,7 +63,7 @@ async def __aexit__(self, await self.async_client.aclose() self.sync_client.close() - async def _async_semaphore_client(self, url: str) -> httpx.Response | None: + async def _async_semaphore_client(self, url: str) -> httpx.Response | list[str]: """Asynchronous HTTP client with semaphore. Args: @@ -79,9 +79,9 @@ async def _async_semaphore_client(self, url: str) -> httpx.Response | None: # print(f'HTTP request Error for {url}: {response.status_code}') return response return response - except (httpx.HTTPStatusError, httpx.RequestError) as exc: + except (httpx.HTTPStatusError, httpx.RequestError): # print(f'HTTP request Error for {url}: {exc}') - return None + return [url, 'Error'] def sync_get(self, url: str) -> httpx.Response | None: """Synchronous GET request. diff --git a/dvmeta/metadatacrawler.py b/dvmeta/metadatacrawler.py index 481ca14..2d7dca5 100644 --- a/dvmeta/metadatacrawler.py +++ b/dvmeta/metadatacrawler.py @@ -100,8 +100,10 @@ async def get_datasets_meta(self, id_list: list) -> tuple[dict, dict]: if item and item.status_code == self.http_success_status and item.json(): dataset_persistent_idd = item.json().get('data').get('datasetPersistentId') dataset_meta[dataset_persistent_idd] = item.json() - else: + elif item and item.status_code != self.http_success_status: failed_dataset_meta[str(item.url)] = item.status_code + elif isinstance(item, list): + failed_dataset_meta[item[0]] = item[1] return dataset_meta, failed_dataset_meta From dc6629da0b5bd839f93e2f71f42ae293b438e0f6 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:28:34 -0500 Subject: [PATCH 19/28] 1. Added dummy value to the meta_dict even if permission flag is not enabled. This is to prevent error when making csv, without specify -p flag (jmespath length error) 2. Revamped the above logic. 
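The underlying issue: the spreadsheet's `length(permission_info.data)` columns fail when `permission_info` was never attached. A minimal standalone illustration (not project code) of the failure and of how the `{'status': 'NA', 'data': []}` placeholder avoids it:

```python
# length() in jmespath only accepts strings, arrays, and objects, so a missing
# permission_info key (null) raises JMESPathTypeError; an empty placeholder
# list simply yields 0.
import jmespath
from jmespath.exceptions import JMESPathTypeError

with_placeholder = {'permission_info': {'status': 'NA', 'data': []}}
print(jmespath.search('length(permission_info.data)', with_placeholder))  # 0

try:
    jmespath.search('length(permission_info.data)', {})  # no permission_info at all
except JMESPathTypeError as err:
    print(f'Without the placeholder: {err}')
```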
--- dvmeta/func.py | 20 +++++++---------- dvmeta/main.py | 61 +++++++++++++++++++------------------------------- 2 files changed, 31 insertions(+), 50 deletions(-) diff --git a/dvmeta/func.py b/dvmeta/func.py index 52347bd..7cc72c5 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -178,28 +178,24 @@ def add_path_info(meta_dict: dict, ds_dict: dict) -> tuple: return meta_dict, ds_dict_copy -def add_perrmission_info(meta_dict: dict, permission_dict: Optional[dict] = None) -> tuple: +def add_permission_info(meta_dict: dict, permission_dict: Optional[dict] = None) -> dict: """Add permission_info to the metadata dictionary, handling nested structures.""" if isinstance(permission_dict, dict): - permission_dict_copy = permission_dict.copy() - for pid_key, pid_value in list(permission_dict_copy.items()): + for pid_key, pid_value in list(permission_dict.items()): pid_key_str = str(pid_key) # Traverse the meta_dict to find matching datasetId for _meta_key, meta_value in meta_dict.items(): if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId') == int(pid_key_str): # Add path_info to the appropriate nested dictionary meta_value['permission_info'] = pid_value - # Remove from permission_dict_copy - permission_dict_copy.pop(pid_key) + # Remove from permission_dict + permission_dict.pop(pid_key) break - for _meta_key, meta_value in meta_dict.items(): - if isinstance(meta_value, dict) and meta_value.get('data', {}).get('datasetId'): - if 'permission_info' not in meta_value: - meta_value['permission_info'] = {'status': 'NA', 'data': []} - - return meta_dict, permission_dict_copy + for _meta_key, meta_value in meta_dict.items(): + if 'permission_info' not in meta_value: + meta_value['permission_info'] = {'status': 'NA', 'data': []} - return meta_dict, None + return meta_dict def load_env() -> dict: diff --git a/dvmeta/main.py b/dvmeta/main.py index 89389f1..d1bc468 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -162,30 +162,15 @@ async def main_crawler(): # Add the path_info to the metadata meta_dict, pid_dict_dd = func.add_path_info(meta_dict, ds_dict) - if not permission: # Delay the merging of permission metadata until the permission metadata is crawled - - # Export the metadata to a JSON file - meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict') - json_file_checksum_dict.append( - { - 'type': 'Dataset Metadata (Representation & File)', - 'path': meta_json_file_path, - 'checksum': meta_json_checksum, - } - ) - print( - f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n' - ) - - # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file - pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd') - json_file_checksum_dict.append( - { - 'type': 'Hierarchical Information of Datasets(deaccessioned/draft)', - 'path': pid_dict_json, - 'checksum': pid_dict_checksum, - } - ) + # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file + pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd') + json_file_checksum_dict.append( + { + 'type': 'Hierarchical Information of Datasets(deaccessioned/draft)', + 'path': pid_dict_json, + 'checksum': pid_dict_checksum, + } + ) if failed: failed_metadata_uris_json, failed_metadata_uris_checksum = utils.orjson_export( @@ -230,21 +215,21 @@ async def main_crawler(): } ) - # Combine the metadata and permission metadata - if dvdfds_matadata 
and permission: - if isinstance(permission_dict, dict): - meta_dict = func.add_perrmission_info(meta_dict, permission_dict)[0] - - # Export the metadata to a JSON file + # Combine the metadata and permission metadata, if both are provided + # Else write dummy permission metadata to the metadata + meta_dict = func.add_permission_info(meta_dict, permission_dict if isinstance(permission_dict, dict) and permission_dict else None) + + # Export the metadata to a JSON file + meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission') + json_file_checksum_dict.append( + { + 'type': 'Dataset Metadata (Representation, File & Permission)', + 'path': meta_json_file_path, + 'checksum': meta_json_checksum, + } + ) - meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission') - json_file_checksum_dict.append( - { - 'type': 'Dataset Metadata (Representation, File & Permission)', - 'path': meta_json_file_path, - 'checksum': meta_json_checksum, - } - ) + print(f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n') if empty_dv: empty_dv_json, empty_dv_checksum = utils.orjson_export(empty_dv_dict, 'empty_dv') From 94470462d20c50a9a15f0bc8603588bb0a5dcd61 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:33:05 -0500 Subject: [PATCH 20/28] 1. Fixed prompt output for ds_meta --- dvmeta/main.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dvmeta/main.py b/dvmeta/main.py index d1bc468..f12cbf3 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -219,17 +219,17 @@ async def main_crawler(): # Else write dummy permission metadata to the metadata meta_dict = func.add_permission_info(meta_dict, permission_dict if isinstance(permission_dict, dict) and permission_dict else None) - # Export the metadata to a JSON file - meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission') - json_file_checksum_dict.append( - { - 'type': 'Dataset Metadata (Representation, File & Permission)', - 'path': meta_json_file_path, - 'checksum': meta_json_checksum, - } - ) - - print(f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n') + if meta_dict: + # Export the metadata to a JSON file + meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission') + json_file_checksum_dict.append( + { + 'type': 'Dataset Metadata (Representation, File & Permission)', + 'path': meta_json_file_path, + 'checksum': meta_json_checksum, + } + ) + print(f'Successfully crawled {utils.count_key(meta_dict)} metadata of dataset representation and file in total.\n') if empty_dv: empty_dv_json, empty_dv_checksum = utils.orjson_export(empty_dv_dict, 'empty_dv') From ce65c09f33ebe5801248227c518277b120d75318 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:35:00 -0500 Subject: [PATCH 21/28] 1. Updated name of ds_meta. 
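`utils.orjson_export()` itself does not appear in this patch set, so the sketch below is only a guess at its shape, inferred from call sites that expect a `(path, checksum)` tuple and from the timestamped filenames listed in the README; the directory layout and hashing choice are assumptions.

```python
# Hypothetical export helper: serialize the dict with orjson, write it to a
# timestamped file, and return the path plus a checksum, roughly matching how
# main.py consumes the return value.
import hashlib
from datetime import datetime
from pathlib import Path

import orjson

def orjson_export_sketch(data: dict, prefix: str,
                         out_dir: str = 'exported_files/json_files') -> tuple[str, str]:
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    file_path = Path(out_dir) / f'{prefix}_{stamp}.json'
    payload = orjson.dumps(data, option=orjson.OPT_INDENT_2)
    file_path.write_bytes(payload)
    return str(file_path), hashlib.sha256(payload).hexdigest()
```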
--- dvmeta/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvmeta/main.py b/dvmeta/main.py index f12cbf3..2df929d 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -221,7 +221,7 @@ async def main_crawler(): if meta_dict: # Export the metadata to a JSON file - meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'meta_dict_with_permission') + meta_json_file_path, meta_json_checksum = utils.orjson_export(meta_dict, 'ds_metadata') json_file_checksum_dict.append( { 'type': 'Dataset Metadata (Representation, File & Permission)', From c30bcced7b4d3ac25708a3a64cd81daee9875869 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 00:36:13 -0500 Subject: [PATCH 22/28] 1. Updated README for exported_files section & clearer instructions --- README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7ddda7b..988b39d 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ cd ./dataverse-metadata-crawler ``` -3. Create an environment file (.env) +3. Create an environment file (`.env`) ```sh touch .env # For Unix/MacOS nano .env # or vim .env, or your preferred editor @@ -38,12 +38,17 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ notepad .env ``` -4. Configure the environment (.env) file using the text editor of your choice. +4. Configure the environment (`.env`) file using the text editor of your choice. ```sh # .env file - BASE_URL = "TARGET_REPO_URL" # e.g., "https://demo.borealisdata.ca/" + BASE_URL = "TARGET_REPO_URL" # Base URL of the repository; e.g., "https://demo.borealisdata.ca/" API_KEY = "YOUR_API_KEY" # Found in your Dataverse account settings. Can also be specified in the CLI interface using the -a flag. ``` + Your `.env` file should look like this: + ```sh + BASE_URL = "https://demo.borealisdata.ca/" + API_KEY = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXX" + ``` 5. Set up virtual environment (recommended) ```sh @@ -115,10 +120,10 @@ exported_files/ ├── json_files/ │ └── ds_metadata_yyyymmdd-HHMMSS.json # With -d flag enabled │ └── empty_dv_yyyymmdd-HHMMSS.json # With -e flag enabled -│ └── failed_metadata_uris_yyyymmdd-HHMMSS.json -│ └── permission_dict_yyyymmdd-HHMMSS.json # With -p flag enabled -│ └── pid_dict_yyyymmdd-HHMMSS.json # Only exported if -p flag is used without -d flag -│ └── pid_dict_dd_yyyymmdd-HHMMSS.json # Hierarchical information of deaccessioned/draft datasets +│ └── failed_metadata_uris_yyyymmdd-HHMMSS.json # With -f flag enabled +│ └── permission_dict_yyyymmdd-HHMMSS.json # With only -p flag enabled +│ └── pid_dict_yyyymmdd-HHMMSS.json # With only -p flag enabled +│ └── pid_dict_dd_yyyymmdd-HHMMSS.json # Hierarchical information of deaccessioned/draft datasets. ├── csv_files/ │ └── ds_metadata_yyyymmdd-HHMMSS.csv # with -s flag enabled └── logs_files/ From 9421e33bc3415263626413daa1e91f5c2fee3efb Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:07:31 -0500 Subject: [PATCH 23/28] 1. Updated README: adding Disclaimer section. 
--- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 988b39d..f93d90d 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,10 @@ exported_files/ └── log_yyyymmdd-HHMMSS.txt # Exported by default, without specifying --no-log ``` +## ⚠️Disclaimer +> [!WARNING] +> To retrieve data about unpublished datasets or information that is not available publicly (e.g. collaborators/permissions), you will need to have necessary access rights. **Please note that any publication or use of non-publicly available data may require review by a Research Ethics Board**. + ## ✅Tests No tests have been written yet. Contributions welcome! From f2078352ce4f592144743025b7638420e0304e41 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Mon, 20 Jan 2025 18:28:16 -0500 Subject: [PATCH 24/28] 1. Added GitHub Actions workflows for Jekyll deployment and Poetry dependency export 2. Updated CITATION.cff & README --- .github/workflows/jekyll-gh-pages.yml | 51 +++++++++++++ .../workflows/poetry-export_dependencies.yml | 73 +++++++++++++++++++ CITATION.cff | 4 +- README.md | 16 ++-- _config.yml | 19 +++++ 5 files changed, 154 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/jekyll-gh-pages.yml create mode 100644 .github/workflows/poetry-export_dependencies.yml create mode 100644 _config.yml diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml new file mode 100644 index 0000000..e31d81c --- /dev/null +++ b/.github/workflows/jekyll-gh-pages.yml @@ -0,0 +1,51 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Deploy Jekyll with GitHub Pages dependencies preinstalled + +on: + # Runs on pushes targeting the default branch + push: + branches: ["main"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v5 + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml new file mode 100644 index 0000000..36306c8 --- /dev/null +++ b/.github/workflows/poetry-export_dependencies.yml @@ -0,0 +1,73 @@ +name: Poetry export requirements.txt +on: + push: + branches: + - '*' # Trigger on any push to any branch + paths: + - 'requirements.txt' + - 'pyproject.toml' + - 'poetry.lock' +jobs: + poetry-export_dependencies: + strategy: + fail-fast: false + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install poetry + uses: abatilo/actions-poetry@v4 + with: + poetry-version: 'latest' + - name: Install the poetry-plugin-export + run: poetry self add poetry-plugin-export + - name: Update poetry lock file + run: poetry lock + - name: Export the project dependencies to requirements.txt + run: | + poetry export -f requirements.txt --output requirements.txt + - name: Get branch name + shell: bash + run: echo "BRANCH_NAME=${GITHUB_REF#refs/heads/}" >> $GITHUB_ENV + - name: Check for changes + id: check_changes + run: | + if [[ -n "$(git status --porcelain requirements.txt poetry.lock)" ]]; then + echo "changes=true" >> $GITHUB_OUTPUT + else + echo "changes=false" >> $GITHUB_OUTPUT + fi + - name: Configure Git + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + - name: Commit and push if changed + if: steps.check_changes.outputs.changes == 'true' + run: | + # Pull with rebase to get latest changes + git pull --rebase origin ${{ env.BRANCH_NAME }} + + # Stage and commit changes + git add requirements.txt poetry.lock + git commit -m "chore: update requirements.txt and poetry.lock [skip ci]" + + # Push with retry logic + max_attempts=3 + attempt=1 + while [ $attempt -le $max_attempts ]; do + if git push origin ${{ env.BRANCH_NAME }}; then + break + else + if [ $attempt -eq $max_attempts ]; then + echo "Failed to push after $max_attempts attempts" + exit 1 + fi + echo "Push failed, attempt $attempt of $max_attempts. Pulling and retrying..." 
+ git pull --rebase origin ${{ env.BRANCH_NAME }} + attempt=$((attempt + 1)) + fi + done + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CITATION.cff b/CITATION.cff index 4700932..97e309c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -4,7 +4,7 @@ authors: - family-names: "Lui" given-names: "Lok Hei" orcid: "https://orcid.org/0000-0001-5077-1530" -title: "Dataverse metadata Crawler" +title: "Dataverse Metadata Crawler" version: 0.1.0 date-released: 2025-01-16 -url: "https://github.com/kenlhlui/dataverse-metadata-crawler-p" \ No newline at end of file +url: "https://github.com/scholarsportal/dataverse-metadata-crawler" diff --git a/README.md b/README.md index cf4b45d..6189d31 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ 2. Change to the project directory ```sh - cd ~/dataverse-metadata-export-p + cd ./dataverse-metadata-crawler ``` 3. Create an environment file (.env) @@ -65,6 +65,7 @@ A Python CLI tool for extracting and exporting metadata from [Dataverse](https:/ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALIAS -v VERSION ``` **Required arguments:** + | **Option** | **Short** | **Type** | **Description** | **Default** | |--------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| | --collection_alias | -c | TEXT | Name of the collection to crawl.
**[required]** | None | @@ -72,6 +73,7 @@ python3 dvmeta/main.py [-a AUTH] [-l] [-d] [-p] [-f] [-e] [-s] -c COLLECTION_ALI **Optional arguments:** + | **Option** | **Short** | **Type** | **Description** | **Default** | |----------------------|-----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------| | --auth | -a | TEXT | Authentication token to access the Dataverse repository.
If | None | @@ -96,6 +98,7 @@ python3 dvmeta/main.py -c demo -v 1.0 -d -s -p -a xxxxxxxx-xxxx-xxxx-xxxx-xxxxxx ``` ## 📂Output Structure + | File | Description | |-------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------| | ds_metadata_yyyymmdd-HHMMSS.json | Datasets' their data files' metadata in JSON format. | @@ -145,21 +148,20 @@ If you use this software in your work, please cite it using the following metada APA: ``` -Lui, L. H. (2025). Dataverse metadata Crawler (Version 0.1.0) [Computer software]. https://github.com/kenlhlui/dataverse-metadata-crawler-p +Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.0) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler ``` BibTeX: ``` -@software{Lui_Dataverse_metadata_Crawler_2025, +@software{Lui_Dataverse_Metadata_Crawler_2025, author = {Lui, Lok Hei}, month = jan, -title = {{Dataverse metadata Crawler}}, -url = {https://github.com/kenlhlui/dataverse-metadata-crawler-p}, +title = {{Dataverse Metadata Crawler}}, +url = {https://github.com/scholarsportal/dataverse-metadata-crawler}, version = {0.1.0}, year = {2025} } ``` ## ✍️Authors -Ken Lui - Data Curation Specialist, Map and Data Library, University of Toronto - kenlh.lui@utoronto.ca - +Ken Lui - Data Curation Specialist, Map and Data Library, University of Toronto - [kenlh.lui@utoronto.ca](mailto:kenlh.lui@utoronto.ca) diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..2217569 --- /dev/null +++ b/_config.yml @@ -0,0 +1,19 @@ +# Site settings +title: Dataverse Metadata Crawler +description: A Python CLI tool for extracting and exporting metadata from Dataverse repositories to JSON and CSV formats. 
+baseurl: "/dataverse-metadata-crawler" # Base URL (leave blank for root deployment) +url: "https://scholarsportal.github.io" # Your GitHub Pages URL + +remote_theme: pages-themes/primer +plugins: +- jekyll-remote-theme # add this line to the plugins list if you already have one +- jekyll-seo-tag # Required by primer theme + +# Markdown settings +markdown: kramdown +kramdown: + input: GFM # Enables GitHub Flavored Markdown (GFM) + +# Build settings +source: ./ +destination: ./_site From cbd2fab18402c6e18dad2029bb8ee092752214ae Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:38:47 -0500 Subject: [PATCH 25/28] Update poetry-export_dependencies.yml --- .github/workflows/poetry-export_dependencies.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml index 36306c8..d84f5b6 100644 --- a/.github/workflows/poetry-export_dependencies.yml +++ b/.github/workflows/poetry-export_dependencies.yml @@ -1,12 +1,15 @@ name: Poetry export requirements.txt on: - push: + pull_request: branches: - - '*' # Trigger on any push to any branch + - 'main' # Trigger only on pull requests made to the main branch paths: - 'requirements.txt' - 'pyproject.toml' - 'poetry.lock' + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + jobs: poetry-export_dependencies: strategy: From 45b55c0d468f18033c8650e7e460217cf2d53abe Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:43:40 -0500 Subject: [PATCH 26/28] Update poetry-export_dependencies.yml --- .github/workflows/poetry-export_dependencies.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml index 28a6383..da392ed 100644 --- a/.github/workflows/poetry-export_dependencies.yml +++ b/.github/workflows/poetry-export_dependencies.yml @@ -1,7 +1,6 @@ name: Poetry export requirements.txt on: - - pull_request: + pull_request_target: branches: - 'main' # Trigger only on pull requests made to the main branch From d95ac7bb875987c18e312e8becd17e2c9a4e8b3e Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:53:08 -0500 Subject: [PATCH 27/28] Update poetry-export_dependencies.yml --- .github/workflows/poetry-export_dependencies.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/poetry-export_dependencies.yml b/.github/workflows/poetry-export_dependencies.yml index da392ed..11bdb7e 100644 --- a/.github/workflows/poetry-export_dependencies.yml +++ b/.github/workflows/poetry-export_dependencies.yml @@ -1,6 +1,6 @@ name: Poetry export requirements.txt on: - pull_request_target: + push: branches: - 'main' # Trigger only on pull requests made to the main branch From 5a0bf38338444409026371ad03902c3f06345a48 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Tue, 28 Jan 2025 16:55:33 -0500 Subject: [PATCH 28/28] Update pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3b2d991..5f3d38a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "dataverse-metadata-crawler" -version = "0.1.0" -description = "A Python CLI tool for extracting and exporting 
metadata from Dataverse repositories to JSON and CSV formats." +version = "0.1.1" +description = "A Python CLI tool for bulk extracting and exporting metadata from Dataverse repositories' collections to JSON and CSV formats." authors = ["Ken Lui "] license = "MIT" readme = "README.md"