Feature/deaccession ds count in log (#10)
* 1. Added detection of deaccessioned/draft dataset info to the log.

* Update log_template.txt

1. Deleted a space.
kenlhlui authored Feb 7, 2025
1 parent 1cfbd74 commit ca606db
Showing 3 changed files with 9 additions and 3 deletions.
3 changes: 3 additions & 0 deletions dvmeta/log_generation.py
@@ -15,6 +15,7 @@ def write_to_log( # noqa: PLR0913
     meta_dict: dict,
     collections_tree_flatten: dict,
     failed_metadata_ids: dict,
+    pid_dict_dd: dict,
     json_file_checksum_dict: dict,
 ) -> None:
     """Write the crawl log to a file.
@@ -27,6 +28,7 @@ def write_to_log( # noqa: PLR0913
         meta_dict (dict): Metadata dictionary
         collections_tree_flatten (dict): Flattened collections tree
         failed_metadata_ids (dict): Dictionary of failed metadata IDs
+        pid_dict_dd (dict): Dictionary of deaccessioned/draft datasets
         json_file_checksum_dict (dict): Dictionary of JSON file checksums

     Returns:
@@ -39,6 +41,7 @@ def write_to_log( # noqa: PLR0913
         elapsed_time=elapsed_time,
         meta_dict=utils.count_key(meta_dict),
         collections_tree_flatten=utils.count_key(collections_tree_flatten),
+        pid_dict_dd=utils.count_key(pid_dict_dd),
         failed_metadata_ids=utils.count_key(failed_metadata_ids),
         file_num=count_files_size(meta_dict)[0],
         file_size=count_files_size(meta_dict)[1],
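For context, write_to_log reduces each dictionary to a count with utils.count_key before it reaches the template, so the new pid_dict_dd argument surfaces in the log as a single number. A minimal sketch of how that count lands in the rendered line, assuming count_key simply returns the number of entries (the helper body and sample dict below are assumptions, not this repository's code):

import jinja2

def count_key(d: dict) -> int:
    # Assumed behavior of utils.count_key: the number of entries in the dict.
    return len(d)

# Hypothetical record of one deaccessioned dataset, keyed by persistent ID.
pid_dict_dd = {'doi:10.5072/FK2/EXAMPLE': 'DEACCESSIONED'}

line = jinja2.Template('Number of deaccessioned/draft datasets detected: {{ pid_dict_dd }}')
print(line.render(pid_dict_dd=count_key(pid_dict_dd)))  # -> ... detected: 1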
6 changes: 4 additions & 2 deletions dvmeta/main.py
@@ -156,6 +156,7 @@ async def main_crawler():
     # Optional arguments
     meta_dict = {}
     failed_metadata_uris = []
+    pid_dict_dd = {}
     if dvdfds_matadata:
         # Export dataverse_contents
         print('Crawling Representation and File metadata of datasets...\n')
@@ -250,9 +251,9 @@ async def main_crawler():
             {'type': 'Dataset Metadata CSV', 'path': csv_file_path, 'checksum': csv_file_checksum}
         )

-        return meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten
+        return meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten

-    meta_dict, json_file_checksum_dict, failed_metadata_uris, collections_tree_flatten = asyncio.run(main_crawler())
+    meta_dict, json_file_checksum_dict, failed_metadata_uris, pid_dict_dd, collections_tree_flatten = asyncio.run(main_crawler())

     # End time
     end_time_obj, end_time_display = utils.Timestamp().get_current_time(), utils.Timestamp().get_display_time()
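Since main_crawler now returns five values instead of four, the unpacking after asyncio.run has to grow in lockstep, or Python raises ValueError (too many values to unpack). A toy illustration of the pattern, with stand-in values rather than the real crawler:

import asyncio

async def crawler():
    # Stand-ins for meta_dict, json_file_checksum_dict, failed_metadata_uris,
    # pid_dict_dd, and collections_tree_flatten.
    return {}, {}, [], {}, {}

meta, checksums, failed, dd, tree = asyncio.run(crawler())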
@@ -270,6 +271,7 @@ async def main_crawler():
         elapsed_time,
         meta_dict,
         collections_tree_flatten,
         failed_metadata_uris,
+        pid_dict_dd,
         json_file_checksum_dict)

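The diff initializes pid_dict_dd but does not show where it gets filled. A hypothetical sketch of the detection step, assuming the Dataverse native API's versionState values (DRAFT, RELEASED, DEACCESSIONED); the helper name and dict shape are illustrative, not this repository's code:

def collect_deaccessioned_or_draft(datasets: list[dict]) -> dict:
    # Keep datasets whose latest version is deaccessioned, or a draft that
    # was never published (the latest-published case called out in the log).
    pid_dict_dd = {}
    for ds in datasets:
        state = ds.get('latestVersion', {}).get('versionState')
        if state in ('DEACCESSIONED', 'DRAFT'):
            pid_dict_dd[ds.get('persistentUrl', ds.get('id'))] = state
    return pid_dict_dd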
3 changes: 2 additions & 1 deletion res/log_template.txt
@@ -11,6 +11,7 @@ Execution time: {{ elapsed_time }}

 Total number of dataset crawled from the collection: {{ meta_dict }}
 Number of dataverses (at all levels/depths) crawled from the collection: {{ collections_tree_flatten }}
+Number of deaccessioned/draft datasets detected (includes drafts when the latest-published option is chosen but the dataset was never published): {{ pid_dict_dd }}
 Number of datasets failed to be crawled: {{ failed_metadata_ids }}

 Total number of files in the collection: {{ file_num }}
@@ -24,4 +25,4 @@ Item path: {{ item.path }}
 Item checksum (SHA-256): {{ item.checksum }}
 {% endfor %}
 {% endif %}
---- End of Log --- 
+--- End of Log ---
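For reference, the checksum recorded per item is a standard SHA-256 file digest; a minimal, self-contained way to compute one in Python (the chunked read is a common pattern, not necessarily how this project does it):

import hashlib

def file_sha256(path: str) -> str:
    # Stream the file in chunks so large exports are not read into memory at once.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()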
