Release/v0.1.4 (#14)
* Update jekyll-gh-pages.yml

1. Added page build trigger when README.md is updated.

* Feature/deaccession ds count in log (#12)

* 1. Added detection of deaccessioned/draft datasets info to the log.

* Update log_template.txt

1. Deleted a space.

* 1. Added `replace_key_with_dataset_id` function to delete deaccessioned datasets from the failed URIs metadata dict, avoiding confusion. (#13)

* Hot fix for the log generation part.

* 1. Updated log_template so empty dicts are no longer listed in the log's 'Files saved' section.

* 1. Formatting prompts

* Update README.md

* Update CITATION.cff
kenlhlui authored Feb 7, 2025
1 parent a46c2a4 commit 49e7e44
Showing 7 changed files with 44 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/jekyll-gh-pages.yml
@@ -5,6 +5,8 @@ on:
# Runs on pushes targeting the default branch
push:
branches: ["main"]
paths:
- 'README.md'

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
6 changes: 3 additions & 3 deletions CITATION.cff
@@ -1,10 +1,10 @@
cff-version: 0.1.3
cff-version: 0.1.4
message: "If you use this software, please cite it as below."
authors:
- family-names: "Lui"
given-names: "Lok Hei"
orcid: "https://orcid.org/0000-0001-5077-1530"
title: "Dataverse Metadata Crawler"
version: 0.1.3
date-released: 2025-02-04
version: 0.1.4
date-released: 2025-02-07
url: "https://github.com/scholarsportal/dataverse-metadata-crawler"
4 changes: 2 additions & 2 deletions README.md
@@ -163,7 +163,7 @@ If you use this software in your work, please cite it using the following metadata

APA:
```
Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.3) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
Lui, L. H. (2025). Dataverse Metadata Crawler (Version 0.1.4) [Computer software]. https://github.com/scholarsportal/dataverse-metadata-crawler
```

BibTeX:
@@ -173,7 +173,7 @@ BibTeX:
month = {jan},
title = {Dataverse Metadata Crawler},
url = {https://github.com/scholarsportal/dataverse-metadata-crawler},
version = {0.1.3},
version = {0.1.4},
year = {2025}
}
```
21 changes: 21 additions & 0 deletions dvmeta/func.py
@@ -245,3 +245,24 @@ def replace_key_with_dataset_id(dictionary: dict) -> dict:
# Keep the original key if 'id' is missing
new_dict[old_key] = value
return new_dict


def rm_dd_from_failed_uris(failed_uris: dict, pid_dict_dd: dict) -> dict:
"""Remove the deaccessioned datasets from the failed_uris dictionary.
Args:
failed_uris (dict): Dictionary containing the failed URIs
pid_dict_dd (dict): Dictionary containing the deaccessioned datasets metadata
Returns:
dict: Dictionary containing the failed URIs without the deaccessioned datasets
"""
# Get the datasetPersistentId from the pid_dict_dd
dd_pids = [v['datasetPersistentId'] for v in pid_dict_dd.values()]

# Loop through the dd_pids, and remove the item if it contains the pid in the key of the failed_uris
keys_to_remove = [k for k in failed_uris if any(pid in k for pid in dd_pids)]
for k in keys_to_remove:
failed_uris.pop(k)

return failed_uris
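
For illustration, here is a minimal usage sketch of the new `rm_dd_from_failed_uris` helper. The import path, URIs, persistent IDs, and dictionary shapes below are assumptions made for the example, not values taken from the crawler:

```python
from dvmeta.func import rm_dd_from_failed_uris  # import path assumed

# Hypothetical failed URIs keyed by request URL (shape assumed for illustration)
failed_uris = {
    'https://demo.dataverse.org/api/datasets/:persistentId/?persistentId=doi:10.5072/FK2/AAAAAA': 'HTTP 404',
    'https://demo.dataverse.org/api/datasets/:persistentId/?persistentId=doi:10.5072/FK2/BBBBBB': 'HTTP 403',
}

# Hypothetical deaccessioned/draft dataset metadata
pid_dict_dd = {
    '101': {'datasetPersistentId': 'doi:10.5072/FK2/AAAAAA'},
}

# The URI that belongs to the deaccessioned dataset is dropped;
# genuinely failed URIs stay in the dictionary.
cleaned = rm_dd_from_failed_uris(failed_uris, pid_dict_dd)
assert 'doi:10.5072/FK2/BBBBBB' in next(iter(cleaned))
assert len(cleaned) == 1
```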
5 changes: 4 additions & 1 deletion dvmeta/log_generation.py
@@ -15,6 +15,7 @@ def write_to_log( # noqa: PLR0913
meta_dict: dict,
collections_tree_flatten: dict,
failed_metadata_ids: dict,
pid_dict_dd: dict,
json_file_checksum_dict: dict,
) -> None:
"""Write the crawl log to a file.
@@ -27,6 +28,7 @@ def write_to_log( # noqa: PLR0913
meta_dict (dict): Metadata dictionary
collections_tree_flatten (dict): Flattened collections tree
failed_metadata_ids (dict): Dictionary of failed metadata IDs
pid_dict_dd (dict): Dictionary of deaccessioned/draft datasets
json_file_checksum_dict (dict): Dictionary of JSON file checksums
Returns:
@@ -39,6 +41,7 @@ def write_to_log( # noqa: PLR0913
elapsed_time=elapsed_time,
meta_dict=utils.count_key(meta_dict),
collections_tree_flatten=utils.count_key(collections_tree_flatten),
pid_dict_dd=utils.count_key(pid_dict_dd),
failed_metadata_ids=utils.count_key(failed_metadata_ids),
file_num=count_files_size(meta_dict)[0],
file_size=count_files_size(meta_dict)[1],
@@ -50,7 +53,7 @@ def write_to_log( # noqa: PLR0913
with Path(log_file_path).open('w', encoding='utf-8') as file:
file.write(rendered)

return print(f'The crawl log is saved at: {log_file_path}')
return print(f'The crawl log is saved at: {log_file_path}\n')


def read_template() -> str:
10 changes: 8 additions & 2 deletions dvmeta/main.py
@@ -156,6 +156,7 @@ async def main_crawler():
# Optional arguments
meta_dict = {}
failed_metadata_uris = []
pid_dict_dd = {}
if dvdfds_matadata:
# Export dataverse_contents
print('Crawling Representation and File metadata of datasets...\n')
@@ -168,6 +169,9 @@ async def main_crawler():
# Add the path_info to the metadata
meta_dict, pid_dict_dd = func.add_path_info(meta_dict, ds_dict)

# Remove the deaccessioned/draft datasets (pid_dict_dd) from the failed_metadata_uris
failed_metadata_uris = func.rm_dd_from_failed_uris(failed_metadata_uris, pid_dict_dd)

# Export the updated pid_dict_dd (Which contains deaccessioned/draft datasets) to a JSON file
pid_dict_json, pid_dict_checksum = utils.orjson_export(pid_dict_dd, 'pid_dict_dd')
json_file_checksum_dict.append(
@@ -250,9 +254,9 @@
{'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum}
)

return meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten
return meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten

meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten = asyncio.run(main_crawler())
meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten = asyncio.run(main_crawler())

# End time
end_time_obj, end_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time()
@@ -271,8 +275,10 @@
meta_dict,
collections_tree_flatten,
failed_metadata_uris,
pid_dict_dd,
json_file_checksum_dict)

print('✅ Crawling process completed successfully.\n')

if __name__ == '__main__':
app()
7 changes: 4 additions & 3 deletions res/log_template.txt
@@ -11,17 +11,18 @@ Execution time: {{ elapsed_time }}

Total number of dataset crawled from the collection: {{ meta_dict }}
Number of dataverses (at all levels/depths) crawled from the collection: {{ collections_tree_flatten }}
Number of deaccessioned/draft datasets detected (includes drafts if the latest-published option was chosen but the dataset was never published): {{ pid_dict_dd }}
Number of datasets failed to be crawled: {{ failed_metadata_ids }}

Total number of files in the collection: {{ file_num }}
Total size of files in the collection: {{ file_size }} bytes

{% if json_file_checksum_dict %}
Files saved:
{% for item in json_file_checksum_dict %}
{% for item in json_file_checksum_dict %}{% if item.path %}
Item type: {{ item.type }}
Item path: {{ item.path }}
Item checksum (SHA-256): {{ item.checksum }}
{% endfor %}
{% endif %}{% endfor %}
{% endif %}
--- End of Log ---
--- End of Log ---
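
For context, here is a minimal sketch of how the new `{% if item.path %}` guard keeps empty dicts out of the 'Files saved' section. It assumes the template is rendered with Jinja2 (which the `{% ... %}` syntax suggests); the sample items are made up:

```python
from jinja2 import Template

# Made-up entries: one real file record plus an empty dict of the kind
# that previously produced a blank 'Files saved' entry.
items = [
    {'type': 'Dataset Metadata CSV', 'path': 'exported_files/ds_metadata.csv', 'checksum': 'abc123'},
    {},
]

snippet = Template(
    '{% for item in json_file_checksum_dict %}{% if item.path %}\n'
    'Item type: {{ item.type }}\n'
    'Item path: {{ item.path }}\n'
    'Item checksum (SHA-256): {{ item.checksum }}\n'
    '{% endif %}{% endfor %}'
)

# Only the CSV entry is rendered; the empty dict is skipped.
print(snippet.render(json_file_checksum_dict=items))
```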
