diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml
index 2af8ea0..8062833 100644
--- a/.github/workflows/jekyll-gh-pages.yml
+++ b/.github/workflows/jekyll-gh-pages.yml
@@ -5,6 +5,8 @@ on:
   # Runs on pushes targeting the default branch
   push:
     branches: ["main"]
+    paths:
+      - 'README.md'
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
diff --git a/CITATION.cff b/CITATION.cff
index dc2d401..41e857f 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,10 +1,10 @@
-cff-version: 0.1.3
+cff-version: 0.1.4
 message: "If you use this software, please cite it as below."
 authors:
 - family-names: "Lui"
   given-names: "Lok Hei"
   orcid: "https://orcid.org/0000-0001-5077-1530"
 title: "Dataverse Metadata Crawler"
-version: 0.1.3
-date-released: 2025-02-04
+version: 0.1.4
+date-released: 2025-02-07
 url: "https://github.com/scholarsportal/dataverse-metadata-crawler"
diff --git a/README.md b/README.md
index ced81a1..cfff746 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,7 @@ If you use this software in your work, please cite it using the following metada
 
 APA:
 ```
-Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.3) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
+Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.4) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
 ```
 
 BibTeX:
@@ -173,7 +173,7 @@ BibTeX:
   month = {jan},
   title = {Dataverse Metadata Crawler},
   url = {https://github.com/scholarsportal/dataverse-metadata-crawler},
-  version = {0.1.3},
+  version = {0.1.4},
   year = {2025}
 }
 ```
diff --git a/dvmeta/func.py b/dvmeta/func.py
index 01ebf3a..ede53e9 100644
--- a/dvmeta/func.py
+++ b/dvmeta/func.py
@@ -245,3 +245,24 @@ def replace_key_with_dataset_id(dictionary: dict) -> dict:
             # Keep the original key if 'id' is missing
             new_dict[old_key] = value
     return new_dict
+
+
+def rm_dd_from_failed_uris(failed_uris: dict, pid_dict_dd: dict) -> dict:
+    """Remove the deaccessioned datasets from the failed_uris dictionary.
+
+    Args:
+        failed_uris (dict): Dictionary containing the failed URIs
+        pid_dict_dd (dict): Dictionary containing the deaccessioned datasets metadata
+
+    Returns:
+        dict: Dictionary containing the failed URIs without the deaccessioned datasets
+    """
+    # Get the datasetPersistentId from the pid_dict_dd
+    dd_pids = [v['datasetPersistentId'] for v in pid_dict_dd.values()]
+
+    # Loop through the dd_pids, and remove the item if it contains the pid in the key of the failed_uris
+    keys_to_remove = [k for k in failed_uris if any(pid in k for pid in dd_pids)]
+    for k in keys_to_remove:
+        failed_uris.pop(k)
+
+    return failed_uris
diff --git a/dvmeta/log_generation.py b/dvmeta/log_generation.py
index 435ad98..c069622 100644
--- a/dvmeta/log_generation.py
+++ b/dvmeta/log_generation.py
@@ -15,6 +15,7 @@ def write_to_log(  # noqa: PLR0913
     meta_dict: dict,
     collections_tree_flatten: dict,
     failed_metadata_ids: dict,
+    pid_dict_dd: dict,
     json_file_checksum_dict: dict,
 ) -> None:
     """Write the crawl log to a file.
@@ -27,6 +28,7 @@ def write_to_log(  # noqa: PLR0913
         meta_dict (dict): Metadata dictionary
         collections_tree_flatten (dict): Flattened collections tree
         failed_metadata_ids (dict): Dictionary of failed metadata IDs
+        pid_dict_dd (dict): Dictionary of deaccessioned/draft datasets
         json_file_checksum_dict (dict): Dictionary of JSON file checksums
 
     Returns:
@@ -39,6 +41,7 @@ def write_to_log(  # noqa: PLR0913
         elapsed_time=elapsed_time,
         meta_dict=utils.count_key(meta_dict),
         collections_tree_flatten=utils.count_key(collections_tree_flatten),
+        pid_dict_dd=utils.count_key(pid_dict_dd),
         failed_metadata_ids=utils.count_key(failed_metadata_ids),
         file_num=count_files_size(meta_dict)[0],
         file_size=count_files_size(meta_dict)[1],
@@ -50,7 +53,7 @@ def write_to_log(  # noqa: PLR0913
     with Path(log_file_path).open('w', encoding='utf-8') as file:
         file.write(rendered)
 
-    return print(f'The crawl log is saved at: {log_file_path}')
+    return print(f'The crawl log is saved at: {log_file_path}\n')
 
 
 def read_template() -> str:
diff --git a/dvmeta/main.py b/dvmeta/main.py
index bcbee85..874694d 100644
--- a/dvmeta/main.py
+++ b/dvmeta/main.py
@@ -156,6 +156,7 @@ async def main_crawler():
         # Optional arguments
         meta_dict = {}
         failed_metadata_uris = []
+        pid_dict_dd = {}
         if dvdfds_matadata:
             # Export dataverse_contents
             print('Crawling Representation and File metadata of datasets...\n')
@@ -168,6 +169,9 @@ async def main_crawler():
             # Add the path_info to the metadata
             meta_dict, pid_dict_dd = func.add_path_info(meta_dict, ds_dict)
 
+            # Remove the deaccessioned/draft datasets recorded in pid_dict_dd from failed_metadata_uris
+            failed_metadata_uris = func.rm_dd_from_failed_uris(failed_metadata_uris, pid_dict_dd)
+
             # Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file
             pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd')
             json_file_checksum_dict.append(
@@ -250,9 +254,9 @@ async def main_crawler():
                 {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum}
             )
 
-        return meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten
+        return meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten
 
-    meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten = asyncio.run(main_crawler())
+    meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten = asyncio.run(main_crawler())
 
     # End time
     end_time_obj, end_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time()
@@ -271,8 +275,10 @@ async def main_crawler():
         meta_dict,
         collections_tree_flatten,
         failed_metadata_uris,
+        pid_dict_dd,
         json_file_checksum_dict)
 
+    print('✅ Crawling process completed successfully.\n')
 
 if __name__ == '__main__':
     app()
diff --git a/res/log_template.txt b/res/log_template.txt
index d94118b..f28c9bd 100644
--- a/res/log_template.txt
+++ b/res/log_template.txt
@@ -11,6 +11,7 @@ Execution time: {{ elapsed_time }}
 
 Total number of dataset crawled from the collection: {{ meta_dict }}
 Number of dataverses (at all levels/depths) crawled from the collection: {{ collections_tree_flatten }}
+Number of deaccessioned/draft datasets detected (includes drafts if the latest-published option was chosen but the dataset was never published): {{ pid_dict_dd }}
 Number of datasets failed to be crawled: {{ failed_metadata_ids }}
 
 Total number of files in the collection: {{ file_num }}
@@ -18,10 +19,10 @@ Total size of files in the collection: {{ file_size }} bytes
 
 {% if json_file_checksum_dict %}
 Files saved:
-{% for item in json_file_checksum_dict %}
+{% for item in json_file_checksum_dict %}{% if item.path %}
 Item type: {{ item.type }}
 Item path: {{ item.path }}
 Item checksum (SHA-256): {{ item.checksum }}
-{% endfor %}
+{% endif %}{% endfor %}
 {% endif %}
---- End of Log ---
\ No newline at end of file
+--- End of Log ---
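Reviewer note: below is a minimal, illustrative sketch (not part of the patch) of how the new func.rm_dd_from_failed_uris helper is intended to behave, assuming dvmeta.func is importable from the repository root. The URIs, PIDs, and the URI-to-error-message shape of failed_uris are hypothetical stand-ins, not values taken from the codebase.

# Hypothetical deaccessioned dataset PID and two failed request URIs (illustration only).
from dvmeta import func  # assumes the package layout in this repository

failed_uris = {
    'https://demo.dataverse.org/api/datasets/:persistentId/?persistentId=doi:10.5072/FK2/AAAAAA': 'request failed',
    'https://demo.dataverse.org/api/datasets/:persistentId/?persistentId=doi:10.5072/FK2/BBBBBB': 'request failed',
}
pid_dict_dd = {
    '101': {'datasetPersistentId': 'doi:10.5072/FK2/AAAAAA'},  # deaccessioned/draft dataset detected earlier
}

# The AAAAAA entry is dropped because its failure is expected for a deaccessioned dataset;
# only the BBBBBB entry remains and is counted in the crawl log as a genuine failure.
remaining = func.rm_dd_from_failed_uris(failed_uris, pid_dict_dd)
print(remaining)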