From 1cfbd7402e1621f0a2659875966c7883daa0450e Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 6 Feb 2025 19:10:54 -0500 Subject: [PATCH 1/8] Update jekyll-gh-pages.yml 1. Added build page if README.md has update. --- .github/workflows/jekyll-gh-pages.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml index 2af8ea0..8062833 100644 --- a/.github/workflows/jekyll-gh-pages.yml +++ b/.github/workflows/jekyll-gh-pages.yml @@ -5,6 +5,8 @@ on: # Runs on pushes targeting the default branch push: branches: ["main"] + paths: + - 'README.md' # Allows you to run this workflow manually from the Actions tab workflow_dispatch: From 38410e8fe3ecfacb29f52a708b179dd4c0571f63 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 6 Feb 2025 21:20:37 -0500 Subject: [PATCH 2/8] Feature/deaccession ds count in log (#12) * 1. Added detection of deaccessioned/draft datasets info into log. * Update log_template.txt 1. Deleted a space. --- dvmeta/log_generation.py | 3 +++ dvmeta/main.py | 6 ++++-- res/log_template.txt | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/dvmeta/log_generation.py b/dvmeta/log_generation.py index 435ad98..bb1b5d4 100644 --- a/dvmeta/log_generation.py +++ b/dvmeta/log_generation.py @@ -15,6 +15,7 @@ def write_to_log( # noqa: PLR0913 meta_dict: dict, collections_tree_flatten: dict, failed_metadata_ids: dict, + pid_dict_dd: dict, json_file_checksum_dict: dict, ) -> None: """Write the crawl log to a file. 
@@ -27,6 +28,7 @@ def write_to_log( # noqa: PLR0913 meta_dict (dict): Metadata dictionary collections_tree_flatten (dict): Flattened collections tree failed_metadata_ids (dict): Dictionary of failed metadata IDs + pid_dict_dd (dict): Dictionary of deacessioned/draft datasets json_file_checksum_dict (dict): Dictionary of JSON file checksums Returns: @@ -39,6 +41,7 @@ def write_to_log( # noqa: PLR0913 elapsed_time=elapsed_time, meta_dict=utils.count_key(meta_dict), collections_tree_flatten=utils.count_key(collections_tree_flatten), + pid_dict_dd=utils.count_key(pid_dict_dd), failed_metadata_ids=utils.count_key(failed_metadata_ids), file_num=count_files_size(meta_dict)[0], file_size=count_files_size(meta_dict)[1], diff --git a/dvmeta/main.py b/dvmeta/main.py index bcbee85..ae45e4b 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -156,6 +156,7 @@ async def main_crawler(): # Optional arguments meta_dict = {} failed_metadata_uris = [] + pid_dict_dd = {} if dvdfds_matadata: # Export dataverse_contents print('Crawling Representation and File metadata of datasets...\n') @@ -250,9 +251,9 @@ async def main_crawler(): {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum} ) - return meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten + return meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten - meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten = asyncio.run(main_crawler()) + meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten = asyncio.run(main_crawler()) # End time end_time_obj, end_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time() @@ -270,6 +271,7 @@ async def main_crawler(): elapsed_time, meta_dict, collections_tree_flatten, + pid_dict_dd, failed_metadata_uris, json_file_checksum_dict) diff --git a/res/log_template.txt b/res/log_template.txt index 
d94118b..7386bd1 100644 --- a/res/log_template.txt +++ b/res/log_template.txt @@ -11,6 +11,7 @@ Execution time: {{ elapsed_time }} Total number of dataset crawled from the collection: {{ meta_dict }} Number of dataverses (at all levels/depths) crawled from the collection: {{ collections_tree_flatten }} +Number of deaccessioned/draft datasets detected (include draft if chose latest-published option but the dataset never published): {{ pid_dict_dd }} Number of datasets failed to be crawled: {{ failed_metadata_ids }} Total number of files in the collection: {{ file_num }} @@ -24,4 +25,4 @@ Item path: {{ item.path }} Item checksum (SHA-256): {{ item.checksum }} {% endfor %} {% endif %} ---- End of Log --- \ No newline at end of file +--- End of Log --- From ea53f79aa2ff297ae312cd02352cd792afa5e866 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 6 Feb 2025 21:21:34 -0500 Subject: [PATCH 3/8] 1. Added `replace_key_with_dataset_id` function to delete deaccession dataset from the failed uris metadata dict, avoiding confusion. (#13) --- dvmeta/func.py | 21 +++++++++++++++++++++ dvmeta/main.py | 3 +++ 2 files changed, 24 insertions(+) diff --git a/dvmeta/func.py b/dvmeta/func.py index 01ebf3a..ede53e9 100644 --- a/dvmeta/func.py +++ b/dvmeta/func.py @@ -245,3 +245,24 @@ def replace_key_with_dataset_id(dictionary: dict) -> dict: # Keep the original key if 'id' is missing new_dict[old_key] = value return new_dict + + +def rm_dd_from_failed_uris(failed_uris: dict, pid_dict_dd: dict) -> dict: + """Remove the deaccessioned datasets from the failed_uris dictionary. 
+ + Args: + failed_uris (dict): Dictionary containing the failed URIs + pid_dict_dd (dict): Dictionary containing the deaccessioned datasets metadata + + Returns: + dict: Dictionary containing the failed URIs without the deaccessioned datasets + """ + # Get the datasetPersistentId from the pid_dict_dd + dd_pids = [v['datasetPersistentId'] for v in pid_dict_dd.values()] + + # Loop through the dd_pids, and remove the item if it contains the pid in the key of the failed_uris + keys_to_remove = [k for k in failed_uris if any(pid in k for pid in dd_pids)] + for k in keys_to_remove: + failed_uris.pop(k) + + return failed_uris diff --git a/dvmeta/main.py b/dvmeta/main.py index ae45e4b..9888d2a 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -169,6 +169,9 @@ async def main_crawler(): # Add the path_info to the metadata meta_dict, pid_dict_dd = func.add_path_info(meta_dict, ds_dict) + # Remove the deaccessioned/draft datasets from the pid_dict_dd for the failed_metadata_uris + failed_metadata_uris = func.rm_dd_from_failed_uris(failed_metadata_uris, pid_dict_dd) + # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd') json_file_checksum_dict.append( From 956147cc863237d7cf7f6db22efe3054aa8bd878 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 6 Feb 2025 21:25:39 -0500 Subject: [PATCH 4/8] Hot fix for the log generation part. 
--- dvmeta/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvmeta/main.py b/dvmeta/main.py index 9888d2a..eb39350 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -274,8 +274,8 @@ async def main_crawler(): elapsed_time, meta_dict, collections_tree_flatten, - pid_dict_dd, failed_metadata_uris, + pid_dict_dd, json_file_checksum_dict) From b26391e410f5abfce470adbf4bf3290421016742 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 6 Feb 2025 21:49:35 -0500 Subject: [PATCH 5/8] 1. Updated log_template to remove empty dict being listed in the log 'Files saved' section. --- res/log_template.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/res/log_template.txt b/res/log_template.txt index 7386bd1..f28c9bd 100644 --- a/res/log_template.txt +++ b/res/log_template.txt @@ -19,10 +19,10 @@ Total size of files in the collection: {{ file_size }} bytes {% if json_file_checksum_dict %} Files saved: -{% for item in json_file_checksum_dict %} +{% for item in json_file_checksum_dict %}{% if item.path %} Item type: {{ item.type }} Item path: {{ item.path }} Item checksum (SHA-256): {{ item.checksum }} -{% endfor %} +{% endif %}{% endfor %} {% endif %} --- End of Log --- From db65e0c519851f95927e610f89d4651278412529 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Thu, 6 Feb 2025 21:52:11 -0500 Subject: [PATCH 6/8] 1. 
Formatting prompts --- dvmeta/log_generation.py | 2 +- dvmeta/main.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dvmeta/log_generation.py b/dvmeta/log_generation.py index bb1b5d4..c069622 100644 --- a/dvmeta/log_generation.py +++ b/dvmeta/log_generation.py @@ -53,7 +53,7 @@ def write_to_log( # noqa: PLR0913 with Path(log_file_path).open('w', encoding='utf-8') as file: file.write(rendered) - return print(f'The crawl log is saved at: {log_file_path}') + return print(f'The crawl log is saved at: {log_file_path}\n') def read_template() -> str: diff --git a/dvmeta/main.py b/dvmeta/main.py index eb39350..874694d 100644 --- a/dvmeta/main.py +++ b/dvmeta/main.py @@ -278,6 +278,7 @@ async def main_crawler(): pid_dict_dd, json_file_checksum_dict) + print('✅ Crawling process completed successfully.\n') if __name__ == '__main__': app() From d3435f076a1d280ed2395dd4ca72cfae066336d1 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Fri, 7 Feb 2025 00:53:48 -0500 Subject: [PATCH 7/8] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ced81a1..cfff746 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ If you use this software in your work, please cite it using the following metada APA: ``` -Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.3) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler +Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.4) [Computer software]. 
https://github.com/scholarsportal/dataverse-metadata-crawler ``` BibTeX: @@ -173,7 +173,7 @@ BibTeX: month = {jan}, title = {Dataverse Metadata Crawler}, url = {https://github.com/scholarsportal/dataverse-metadata-crawler}, - version = {0.1.3}, + version = {0.1.4}, year = {2025} } ``` From eb8ed2e96ec0678e27e10160cbb49339bf486bf6 Mon Sep 17 00:00:00 2001 From: Ken Lui <116421546+kenlhlui@users.noreply.github.com> Date: Fri, 7 Feb 2025 00:54:23 -0500 Subject: [PATCH 8/8] Update CITATION.cff --- CITATION.cff | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index dc2d401..41e857f 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,10 +1,10 @@ -cff-version: 0.1.3 +cff-version: 0.1.4 message: "If you use this software, please cite it as below." authors: - family-names: "Lui" given-names: "Lok Hei" orcid: "https://orcid.org/0000-0001-5077-1530" title: "Dataverse Metadata Crawler" -version: 0.1.3 -date-released: 2025-02-04 +version: 0.1.4 +date-released: 2025-02-07 url: "https://github.com/scholarsportal/dataverse-metadata-crawler"