From ff8016c283e24b019016d39988e270f8333f5168 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Sat, 21 Oct 2023 14:07:37 +0200 Subject: [PATCH 1/5] Raise RuntimeError in _remove_jobs_helper re-arrange early exit --- pyiron_base/project/generic.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/pyiron_base/project/generic.py b/pyiron_base/project/generic.py index 276d32989..3dfa2c51f 100644 --- a/pyiron_base/project/generic.py +++ b/pyiron_base/project/generic.py @@ -1593,23 +1593,22 @@ def _remove_jobs_helper(self, recursive=False, progress=True): """ if not isinstance(recursive, bool): raise ValueError("recursive must be a boolean") - if not self.db.view_mode: - job_id_lst = self.get_job_ids(recursive=recursive) - if progress and len(job_id_lst) > 0: - job_id_lst = tqdm(job_id_lst) - for job_id in job_id_lst: - if job_id not in self.get_job_ids(recursive=recursive): - continue - else: - try: - self.remove_job(job_specifier=job_id) - state.logger.debug("Remove job with ID {0} ".format(job_id)) - except (IndexError, Exception): - state.logger.debug( - "Could not remove job with ID {0} ".format(job_id) - ) - else: - raise EnvironmentError("copy_to: is not available in Viewermode !") + if self.db.view_mode: + raise RuntimeError("copy_to: is not available in Viewermode !") + job_id_lst = self.get_job_ids(recursive=recursive) + if progress and len(job_id_lst) > 0: + job_id_lst = tqdm(job_id_lst) + for job_id in job_id_lst: + if job_id not in self.get_job_ids(recursive=recursive): + continue + else: + try: + self.remove_job(job_specifier=job_id) + state.logger.debug("Remove job with ID {0} ".format(job_id)) + except (IndexError, Exception): + state.logger.debug( + "Could not remove job with ID {0} ".format(job_id) + ) def _remove_files(self, pattern="*"): """ From d2e518ec975030fa7775541b069f1b4d81ed55ad Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Sat, 21 Oct 2023 15:05:05 +0200 Subject: [PATCH 2/5] Inline get_jobs to IsDatabase --- pyiron_base/database/filetable.py | 3 ++- pyiron_base/database/generic.py | 38 +++++++++++++++++++++++++++++++ pyiron_base/database/jobtable.py | 38 +------------------------------ pyiron_base/project/generic.py | 4 +--- 4 files changed, 42 insertions(+), 41 deletions(-) diff --git a/pyiron_base/database/filetable.py b/pyiron_base/database/filetable.py index e2c7bec5f..341daa50e 100644 --- a/pyiron_base/database/filetable.py +++ b/pyiron_base/database/filetable.py @@ -314,11 +314,12 @@ def get_items_dict(self, item_dict, return_all_columns=True): else: return [{"id": i} for i in df_dict["id"].values()] - def get_jobs(self, project=None, recursive=True, columns=None): + def _get_jobs(self, user, sql_query, project=None, recursive=True, columns=None): """ Get jobs as dictionary from filetable Args: + user/sql_query: ignored for compat with IsDatabase project (str/ None): path to the project recursive (boolean): recursively iterate over all sub projects columns (list/ None): list of columns to return diff --git a/pyiron_base/database/generic.py b/pyiron_base/database/generic.py index fb9f49ff1..c315c2a05 100644 --- a/pyiron_base/database/generic.py +++ b/pyiron_base/database/generic.py @@ -317,6 +317,31 @@ def get_db_columns(self): """ return self.get_table_headings() + @abstractmethod + def _get_jobs(self, sql_query, user, project_path, recursive=True, columns=None): + pass + + def get_jobs(self, sql_query, user, project_path, recursive=True, columns=None): + """ + Internal function to return the jobs as dictionary rather than a pandas.Dataframe + + Args: + sql_query (str): SQL query to enter a more specific request + user (str): username of the user whoes user space should be searched + project_path (str): root_path - this is in contrast to the project_path in GenericPath + recursive (bool): search subprojects [True/False] + columns (list): by default only the columns ['id', 'project'] are selected, but the user can select a subset + of ['id', 'status', 'chemicalformula', 'job', 'subjob', 'project', 'projectpath', 'timestart', + 'timestop', 'totalcputime', 'computer', 'hamilton', 'hamversion', 'parentid', 'masterid'] + + Returns: + dict: columns are used as keys and point to a list of the corresponding values + """ + if columns is None: + columns = ["id", "project"] + return self._get_jobs( + sql_query, user, project_path, recursive, columns + ) class ConnectionWatchDog(Thread): """ @@ -1033,6 +1058,19 @@ def delete_item(self, item_id): else: raise PermissionError("Not avilable in viewer mode.") + # IsDatabase impl' + def _get_jobs(self, sql_query, user, project_path, recursive=True, columns=None): + df = self.job_table( + sql_query=sql_query, + user=user, + project_path=project_path, + recursive=recursive, + columns=columns, + ) + if len(df) == 0: + return {key: list() for key in columns} + return df.to_dict(orient="list") + # Shortcut def get_item_by_id(self, item_id): """ diff --git a/pyiron_base/database/jobtable.py b/pyiron_base/database/jobtable.py index cf379bd12..dbae32f56 100644 --- a/pyiron_base/database/jobtable.py +++ b/pyiron_base/database/jobtable.py @@ -20,41 +20,6 @@ __date__ = "Sep 1, 2017" -def get_jobs(database, sql_query, user, project_path, recursive=True, columns=None): - """ - Internal function to return the jobs as dictionary rather than a pandas.Dataframe - - Args: - database (DatabaseAccess): Database object - sql_query (str): SQL query to enter a more specific request - user (str): username of the user whoes user space should be searched - project_path (str): root_path - this is in contrast to the project_path in GenericPath - recursive (bool): search subprojects [True/False] - columns (list): by default only the columns ['id', 'project'] are selected, but the user can select a subset - of ['id', 'status', 'chemicalformula', 'job', 'subjob', 'project', 'projectpath', 'timestart', - 'timestop', 'totalcputime', 'computer', 'hamilton', 'hamversion', 'parentid', 'masterid'] - - Returns: - dict: columns are used as keys and point to a list of the corresponding values - """ - if not isinstance(database, FileTable): - if columns is None: - columns = ["id", "project"] - df = database.job_table( - sql_query=sql_query, - user=user, - project_path=project_path, - recursive=recursive, - columns=columns, - ) - if len(df) == 0: - return {key: list() for key in columns} - return df.to_dict(orient="list") - else: - return database.get_jobs( - project=project_path, recursive=recursive, columns=columns - ) - def get_job_ids(database, sql_query, user, project_path, recursive=True): """ @@ -71,8 +36,7 @@ def get_job_ids(database, sql_query, user, project_path, recursive=True): list: a list of job IDs """ if not isinstance(database, FileTable): - return get_jobs( - database=database, + return database.get_jobs( sql_query=sql_query, user=user, project_path=project_path, diff --git a/pyiron_base/project/generic.py b/pyiron_base/project/generic.py index 3dfa2c51f..383967adc 100644 --- a/pyiron_base/project/generic.py +++ b/pyiron_base/project/generic.py @@ -25,7 +25,6 @@ from pyiron_base.database.jobtable import ( get_job_ids, get_job_id, - get_jobs, set_job_status, get_child_ids, get_job_working_directory, @@ -434,8 +433,7 @@ def get_jobs(self, recursive=True, columns=None): Returns: dict: columns are used as keys and point to a list of the corresponding values """ - return get_jobs( - database=self.db, + return self.db.get_jobs( sql_query=self.sql_query, user=self.user, project_path=self.project_path, From c3b1e403fa1b09126e4d84810ad390ae34801f02 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Sat, 21 Oct 2023 15:24:31 +0200 Subject: [PATCH 3/5] Inline get_job_ids to IsDatabase --- pyiron_base/database/filetable.py | 13 ------------- pyiron_base/database/generic.py | 21 +++++++++++++++++++++ pyiron_base/database/jobtable.py | 25 ------------------------- pyiron_base/project/generic.py | 4 +--- 4 files changed, 22 insertions(+), 41 deletions(-) diff --git a/pyiron_base/database/filetable.py b/pyiron_base/database/filetable.py index 341daa50e..2b4026d3b 100644 --- a/pyiron_base/database/filetable.py +++ b/pyiron_base/database/filetable.py @@ -351,19 +351,6 @@ def _get_jobs(self, user, sql_query, project=None, recursive=True, columns=None) ].tolist() # ToDo: Check difference of tolist and to_list return dictionary - def get_job_ids(self, project=None, recursive=True): - """ - Get job IDs from filetable - - Args: - project (str/ None): path to the project - recursive (boolean): recursively iterate over all sub projects - - Returns: - list/ None: list of job IDs - """ - return self.get_jobs(project=project, recursive=recursive, columns=["id"])["id"] - def get_job_id(self, job_specifier, project=None): """ Get job ID from filetable diff --git a/pyiron_base/database/generic.py b/pyiron_base/database/generic.py index c315c2a05..af0eabaff 100644 --- a/pyiron_base/database/generic.py +++ b/pyiron_base/database/generic.py @@ -343,6 +343,27 @@ def get_jobs(self, sql_query, user, project_path, recursive=True, columns=None): sql_query, user, project_path, recursive, columns ) + def get_job_ids(self, sql_query, user, project_path, recursive=True): + """ + Return the job IDs matching a specific query + + Args: + database (DatabaseAccess): Database object + sql_query (str): SQL query to enter a more specific request + user (str): username of the user whoes user space should be searched + project_path (str): root_path - this is in contrast to the project_path in GenericPath + recursive (bool): search subprojects [True/False] + + Returns: + list: a list of job IDs + """ + return self.get_jobs( + sql_query=sql_query, + user=user, + project_path=project_path, + recursive=recursive, + )["id"] + class ConnectionWatchDog(Thread): """ Helper class that closes idle connections after a given timeout. diff --git a/pyiron_base/database/jobtable.py b/pyiron_base/database/jobtable.py index dbae32f56..adccc4632 100644 --- a/pyiron_base/database/jobtable.py +++ b/pyiron_base/database/jobtable.py @@ -21,31 +21,6 @@ -def get_job_ids(database, sql_query, user, project_path, recursive=True): - """ - Return the job IDs matching a specific query - - Args: - database (DatabaseAccess): Database object - sql_query (str): SQL query to enter a more specific request - user (str): username of the user whoes user space should be searched - project_path (str): root_path - this is in contrast to the project_path in GenericPath - recursive (bool): search subprojects [True/False] - - Returns: - list: a list of job IDs - """ - if not isinstance(database, FileTable): - return database.get_jobs( - sql_query=sql_query, - user=user, - project_path=project_path, - recursive=recursive, - )["id"] - else: - return database.get_job_ids(project=project_path, recursive=recursive) - - def get_child_ids(database, sql_query, user, project_path, job_specifier, status=None): """ Get the childs for a specific job diff --git a/pyiron_base/project/generic.py b/pyiron_base/project/generic.py index 383967adc..2d385fd87 100644 --- a/pyiron_base/project/generic.py +++ b/pyiron_base/project/generic.py @@ -23,7 +23,6 @@ from pyiron_base.database.filetable import FileTable from pyiron_base.state import state from pyiron_base.database.jobtable import ( - get_job_ids, get_job_id, set_job_status, get_child_ids, @@ -451,8 +450,7 @@ def get_job_ids(self, recursive=True): Returns: list: a list of job IDs """ - return get_job_ids( - database=self.db, + return self.db.get_job_ids( sql_query=self.sql_query, user=self.user, project_path=self.project_path, From d61c6e71c2938c41736a034acc92a897a0d9a826 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 23 Oct 2023 13:27:20 +0200 Subject: [PATCH 4/5] Move delete_item to IsDatabase --- pyiron_base/database/filetable.py | 2 +- pyiron_base/database/generic.py | 26 +++++++++++++++++++++++++- tests/database/test_database_access.py | 6 ++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/pyiron_base/database/filetable.py b/pyiron_base/database/filetable.py index 2b4026d3b..756d98b39 100644 --- a/pyiron_base/database/filetable.py +++ b/pyiron_base/database/filetable.py @@ -140,7 +140,7 @@ def add_item_dict(self, par_dict): ).reset_index(drop=True) return int(par_dict_merged["id"]) - def delete_item(self, item_id): + def _delete_item(self, item_id): """ Delete Item from database diff --git a/pyiron_base/database/generic.py b/pyiron_base/database/generic.py index af0eabaff..1d9f4d62c 100644 --- a/pyiron_base/database/generic.py +++ b/pyiron_base/database/generic.py @@ -12,6 +12,7 @@ import numpy as np import re import os +from typing import Union, Iterable from datetime import datetime from pyiron_base.utils.deprecate import deprecate import pandas @@ -244,6 +245,29 @@ def _items_update(self, par_dict, item_ids): for i_id in item_ids: self._item_update(par_dict=par_dict, item_id=i_id) + @abstractmethod + def _delete_item(self, item_id): + pass + + def _delete_items(self, item_ids): + raise NotImplementedError() + + def delete_item(self, item_id: Union[int, Iterable[int]]): + """ + Delete database entry for job with given id. + + Args: + item_id (int, iterable): job id to delete or iterable there of + """ + if not isinstance(item_id, Iterable): + self._delete_item(item_id) + else: + try: + self._delete_items(item_id) + except NotImplementedError: + for i in item_id: + self._delete_item(i) + def set_job_status(self, status, job_id): """ Set status of a job or multiple jobs if job_id is iterable. @@ -1057,7 +1081,7 @@ def _item_update(self, par_dict, item_id): else: raise PermissionError("Not avilable in viewer mode.") - def delete_item(self, item_id): + def _delete_item(self, item_id): """ Delete Item from database diff --git a/tests/database/test_database_access.py b/tests/database/test_database_access.py index b7cf2bb2a..7a993b88d 100644 --- a/tests/database/test_database_access.py +++ b/tests/database/test_database_access.py @@ -189,10 +189,8 @@ def test_delete_item(self): par_dict = self.add_items("BO") key = par_dict["id"] self.database.delete_item(key) - self.assertRaises( - Exception, self.database.delete_item, [key] - ) # use only str or int - # self.assertRaises(Exception, self.database.get_item_by_id, key) # ensure item does not exist anymore + self.assertRaises(Exception, self.database.get_item_by_id, key, + "Item still exists after delete_item!") def test_get_item_by_id(self): """ From 36c311ad10cc5f7ed3a986ddf95a5520a0dcd4c0 Mon Sep 17 00:00:00 2001 From: Marvin Poul Date: Mon, 23 Oct 2023 13:45:51 +0200 Subject: [PATCH 5/5] Allow to customize bulk delete behavior --- pyiron_base/jobs/job/generic.py | 27 ++++++++++++++++++++++++ pyiron_base/jobs/master/generic.py | 7 ++++++ pyiron_base/project/generic.py | 34 +++++++++++++++++------------- 3 files changed, 53 insertions(+), 15 deletions(-) diff --git a/pyiron_base/jobs/job/generic.py b/pyiron_base/jobs/job/generic.py index 82c836ca2..9fe91b4fe 100644 --- a/pyiron_base/jobs/job/generic.py +++ b/pyiron_base/jobs/job/generic.py @@ -10,6 +10,7 @@ import os import posixpath import signal +import shutil import warnings from pyiron_base.state import state @@ -1470,6 +1471,32 @@ def _reload_update_master(self, project, master_id): self._logger.info("busy master: {} {}".format(master_id, self.get_job_id())) del self + @staticmethod + def _bulk_remove_jobs(project, job_df, progress): + """ + """ + def del_files(row): + if progress is not None: + progress.update(1) + project_path, project, job = row[["projectpath", "project", "job"]] + if project_path is not None: + base_path = os.path.join(project_path, project, job) + else: + base_path = os.path.join(project, job) + if os.path.isfile(base_path + ".h5"): + os.remove(base_path + ".h5") + shutil.rmtree(base_path + "_hdf5", ignore_errors=True) + + project.db.delete_item(job_df.id) + # if job doesn't match subjob, it's stored inside another job's HDF5 file and will be delete from there. + n_total = len(job_df) + job_df = job_df.query("job == subjob.str.slice(1, None)") + # sub jobs won't have their files deleted, so advance progress bar on its own + n_sub = n_total - len(job_df) + if progress is not None: + progress.update(n_sub) + job_df.apply(del_files, axis="columns") + class GenericError(object): def __init__(self, job): diff --git a/pyiron_base/jobs/master/generic.py b/pyiron_base/jobs/master/generic.py index 39b4b3803..9aa532598 100644 --- a/pyiron_base/jobs/master/generic.py +++ b/pyiron_base/jobs/master/generic.py @@ -504,6 +504,13 @@ def _init_child_job(self, parent): """ self.ref_job = parent + @staticmethod + def _bulk_remove_jobs(project, job_df, progress): + for job_id in job_df["id"]: + job = project.load(job_id) + job.child_project.remove(enable=True) + + super()._bulk_remove_jobs(project, job_df, progress) def get_function_from_string(function_str): """ diff --git a/pyiron_base/project/generic.py b/pyiron_base/project/generic.py index 2d385fd87..ad038b3ca 100644 --- a/pyiron_base/project/generic.py +++ b/pyiron_base/project/generic.py @@ -34,7 +34,7 @@ from pyiron_base.utils.deprecate import deprecate from pyiron_base.jobs.job.util import _special_symbol_replacements, _get_safe_job_name from pyiron_base.interfaces.has_groups import HasGroups -from pyiron_base.jobs.job.jobtype import JobType, JobTypeChoice, JobFactory +from pyiron_base.jobs.job.jobtype import JOB_CLASS_DICT, JobType, JobTypeChoice, JobFactory from pyiron_base.jobs.job.extension.server.queuestatus import ( queue_delete_job, queue_is_empty, @@ -1591,20 +1591,24 @@ def _remove_jobs_helper(self, recursive=False, progress=True): raise ValueError("recursive must be a boolean") if self.db.view_mode: raise RuntimeError("copy_to: is not available in Viewermode !") - job_id_lst = self.get_job_ids(recursive=recursive) - if progress and len(job_id_lst) > 0: - job_id_lst = tqdm(job_id_lst) - for job_id in job_id_lst: - if job_id not in self.get_job_ids(recursive=recursive): - continue - else: - try: - self.remove_job(job_specifier=job_id) - state.logger.debug("Remove job with ID {0} ".format(job_id)) - except (IndexError, Exception): - state.logger.debug( - "Could not remove job with ID {0} ".format(job_id) - ) + job_df = self.job_table( + recursive=recursive, + columns=["id", "hamilton", "parentid", "masterid", "projectpath", "project", "job", "subjob"] + ) + job_id_lst = job_df["id"] + parents = set(job_df.parentid.dropna()) + masters = set(job_df.masterid.dropna()) + if not (parents.issubset(job_id_lst) and masters.issubset(job_id_lst)): + assert False, "Somehow sort out out of project jobs" + + progress = tqdm(total=len(job_id_lst)) if progress else None + for hamilton, sub_df in job_df.groupby("hamilton"): + try: + job_class = JobType.convert_str_to_class(JOB_CLASS_DICT, hamilton) + except ValueError: # if job class is not registered in JOB_CLASS_DICT use generic routine + from pyiron_base.jobs.job.generic import GenericJob + job_class = GenericJob + job_class._bulk_remove_jobs(self, sub_df, progress) def _remove_files(self, pattern="*"): """