From 977274c7f59e67eddf0f7730b59ee68cd9bf8a94 Mon Sep 17 00:00:00 2001 From: Joachim Gabler Date: Thu, 24 Oct 2024 16:37:06 +0200 Subject: [PATCH 1/3] BF: CS-719 do full valgrind test on master branch (9.0.1) // fixed issues in category cache --- .../daemons/qmaster/sge_sched_job_category.cc | 11 +++---- source/libs/cull/cull_multitype.cc | 4 +-- source/libs/sched/sge_select_queue.cc | 32 +++++++++++++------ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/source/daemons/qmaster/sge_sched_job_category.cc b/source/daemons/qmaster/sge_sched_job_category.cc index 263a74a8b..72da10829 100644 --- a/source/daemons/qmaster/sge_sched_job_category.cc +++ b/source/daemons/qmaster/sge_sched_job_category.cc @@ -155,7 +155,7 @@ sge_delete_job_category(lListElem *job) { /* First part */ auto *cat = static_cast(lGetRef(job, JB_category)); - if (CATEGORY_LIST && cat) { + if (CATEGORY_LIST != nullptr && cat != nullptr) { u_long32 rc = lGetUlong(cat, CT_refcount); if (rc > 1) { lSetUlong(cat, CT_refcount, --rc); @@ -176,7 +176,6 @@ sge_delete_job_category(lListElem *job) { } lSetRef(job, JB_category, nullptr); - DRETURN(0); } @@ -281,15 +280,15 @@ sge_category_count() { *******************************************************************************/ int sge_reset_job_category() { - lListElem *cat; DENTER(TOP_LAYER); + lListElem *cat; for_each_rw (cat, CATEGORY_LIST) { - const lListElem *cache; - - for_each_ep(cache, lGetList(cat, CT_cache)) { + lListElem *cache; + for_each_rw(cache, lGetList(cat, CT_cache)) { int *range = (int *) lGetRef(cache, CCT_pe_job_slots); sge_free(&range); + lSetRef(cache, CCT_pe_job_slots, nullptr); } lSetUlong(cat, CT_rejected, 0); diff --git a/source/libs/cull/cull_multitype.cc b/source/libs/cull/cull_multitype.cc index 546978624..f43cf1f53 100644 --- a/source/libs/cull/cull_multitype.cc +++ b/source/libs/cull/cull_multitype.cc @@ -1462,13 +1462,13 @@ lRef lGetPosRef(const lListElem *ep, int pos) { /****** cull/multitype/lGetRef() ********************************************** * NAME -* lGetRef() -- Returns the character for a field name +* lGetRef() -- Returns the reference for a field name * * SYNOPSIS * lRef lGetRef(const lListElem *ep, int name) * * FUNCTION -* Returns the character for a field name +* Returns the reference for a field name * * INPUTS * const lListElem *ep - element diff --git a/source/libs/sched/sge_select_queue.cc b/source/libs/sched/sge_select_queue.cc index f2c60901e..1155e97cc 100755 --- a/source/libs/sched/sge_select_queue.cc +++ b/source/libs/sched/sge_select_queue.cc @@ -100,14 +100,19 @@ enum { }; typedef struct { - lListElem *category; /* ref to the category */ - lListElem *cache; /* ref to the cache object in th category */ - bool use_category; /* if true: use the category - with immediate dispatch runs only and only if there is more than a single job of that category - prevents 'skip_host_list' and 'skip_queue_list' be used with reservation */ - bool mod_category; /* if true: update the category with new messages, queues, and hosts */ - u_long32 *possible_pe_slots; /* stores all possible slots settings for a pe job with ranges */ - bool is_pe_slots_rev; /* if it is true, the possible_pe_slots are stored in the category */ + lListElem *category; /* ref to the category */ + lListElem *cache; /* ref to the cache object in the category */ + bool use_category; /* if true: use the category + * with immediate dispatch runs only and only if there is more than a single job of that category + * prevents 'skip_host_list' and 'skip_queue_list' be used with reservation + */ + bool mod_category; /* if true: update the category with new messages, queues, and hosts */ + u_long32 *possible_pe_slots; /* stores all possible slots settings for a pe job with ranges + * it is stored in the job category cache, attribute CCT_pe_job_slots + * and is *not* freed with the category_use_t object + * (unless is_pe_slots_rev == false which means: it is not referenced in the cache) + */ + bool is_pe_slots_rev; /* if it is true, the possible_pe_slots are stored in the category */ } category_use_t; static void @@ -882,7 +887,6 @@ parallel_reservation_max_time_slots(sge_assignment_t *best, int *available_slots static dispatch_t parallel_maximize_slots_pe(sge_assignment_t *best, int *available_slots) { - int min_slots, max_slots; int max_pe_slots; int first, last; @@ -893,7 +897,7 @@ parallel_maximize_slots_pe(sge_assignment_t *best, int *available_slots) const char *pe_name = best->pe_name; bool is_first = true; int old_logging = 0; - category_use_t use_category; + category_use_t use_category{}; u_long32 max_slotsp = 0; int current = 0; int match_current = 0; @@ -953,6 +957,9 @@ parallel_maximize_slots_pe(sge_assignment_t *best, int *available_slots) } if (max_slotsp == 0) { DPRINTF("no slots in PE %s available for job " sge_u32"\n", pe_name, best->job_id); + if (!use_category.is_pe_slots_rev) { + sge_free(&(use_category.possible_pe_slots)); + } DRETURN(DISPATCH_NEVER_CAT); } @@ -3712,6 +3719,7 @@ add_pe_slots_to_category(category_use_t *use_category, u_long32 *max_slotsp, lLi use_category->is_pe_slots_rev = true; } else { use_category->is_pe_slots_rev = false; + } } else { use_category->is_pe_slots_rev = true; @@ -3749,8 +3757,10 @@ static void fill_category_use_t(const sge_assignment_t *a, category_use_t *use_c use_category->category = (lListElem *)lGetRef(job, JB_category); if (use_category->category != nullptr) { + // the category cache is stored in the job category object and contains an element per pe use_category->cache = lGetElemStrRW(lGetList(use_category->category, CT_cache), CCT_pe_name, pe_name); if (use_category->cache == nullptr) { + // there is no cache yet, create it use_category->cache = lCreateElem(CCT_Type); lSetString(use_category->cache, CCT_pe_name, pe_name); @@ -3758,6 +3768,7 @@ static void fill_category_use_t(const sge_assignment_t *a, category_use_t *use_c lSetList(use_category->cache, CCT_ignore_hosts, lCreateList(nullptr, CTI_Type)); lSetList(use_category->cache, CCT_job_messages, lCreateList(nullptr, MES_Type)); + // store the cache in the job category if (lGetList(use_category->category, CT_cache) == nullptr) { lSetList(use_category->category, CT_cache, lCreateList("pe_cache", CCT_Type)); } @@ -3769,6 +3780,7 @@ static void fill_category_use_t(const sge_assignment_t *a, category_use_t *use_c use_category->use_category = a->start == DISPATCH_TIME_NOW && lGetUlong(use_category->category, CT_refcount) > MIN_JOBS_IN_CATEGORY; } else { + // we have a job without category (does this case exist at all?) use_category->cache = nullptr; use_category->mod_category = false; use_category->use_category = false; From 7899822ccde8fb74f84ffe1f230ec6541f824221 Mon Sep 17 00:00:00 2001 From: Joachim Gabler Date: Fri, 25 Oct 2024 14:50:15 +0200 Subject: [PATCH 2/3] BF: CS-719 do full valgrind test on master branch (9.0.1) (#20) // fixed issues in binding code --- source/libs/sgeobj/sge_binding.cc | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/source/libs/sgeobj/sge_binding.cc b/source/libs/sgeobj/sge_binding.cc index 5f36dc6e7..f5420d253 100644 --- a/source/libs/sgeobj/sge_binding.cc +++ b/source/libs/sgeobj/sge_binding.cc @@ -2599,7 +2599,7 @@ static int get_core_id_from_logical_core_number_solaris(const int** matrix, /* ---------------------------------------------------------------------------*/ #if defined(OGE_HWLOC) || defined(BINDING_SOLARIS) -bool get_linear_automatic_socket_core_list_and_account(const int amount, +bool get_linear_automatic_socket_core_list_and_account(const int amount, int** list_of_sockets, int* samount, int** list_of_cores, int* camount, char** topo_by_job, int* topo_by_job_length) { @@ -2620,12 +2620,13 @@ bool get_linear_automatic_socket_core_list_and_account(const int amount, int i; /* get the topology which could be used by the job */ - tmp_topo_busy = (char *) calloc(logical_used_topology_length, sizeof(char)); - memcpy(tmp_topo_busy, logical_used_topology, logical_used_topology_length*sizeof(char)); + tmp_topo_busy = sge_strdup(nullptr, logical_used_topology); + if (tmp_topo_busy == nullptr) { + return false; + } /* 1. Find all free sockets and try to fit the request on them */ - if (get_free_sockets(tmp_topo_busy, logical_used_topology_length, &sockets, - &sockets_size)) { + if (get_free_sockets(tmp_topo_busy, logical_used_topology_length, &sockets, &sockets_size)) { /* there are free sockets: use them */ for (i = 0; i < sockets_size && used_cores < amount; i++) { @@ -2975,8 +2976,10 @@ bool get_striding_first_socket_first_core_and_account(const int amount, const in /* temporary accounting string -> account on this and when eventually successful then copy this string back to global topo_busy string */ - tmp_topo_busy = (char *) calloc(logical_used_topology_length + 1, sizeof(char)); - memcpy(tmp_topo_busy, logical_used_topology, logical_used_topology_length*sizeof(char)); + tmp_topo_busy = sge_strdup(nullptr, logical_used_topology); + if (tmp_topo_busy == nullptr) { + return false; + } DPRINTF("start_at_socket: %d, start_at_core: %d\n", start_at_socket, start_at_core); @@ -3007,7 +3010,7 @@ bool get_striding_first_socket_first_core_and_account(const int amount, const in /* check if we found the socket and core we want to start searching */ if (sc != start_at_socket || cc != start_at_core) { - /* could't find the start socket and start core */ + /* couldn't find the start socket and start core */ sge_free(&tmp_topo_busy); DRETURN(false); } @@ -3080,15 +3083,13 @@ static bool create_topology_used_per_job(char** accounted_topology, int* account (*accounted_topology_length) = logical_used_topology_length; /* copy string of current topology in use */ - (*accounted_topology) = (char *)calloc(logical_used_topology_length+1, sizeof(char)); - if ((*accounted_topology) == nullptr) { + *accounted_topology = strdup(logical_used_topology); + if (*accounted_topology == nullptr) { /* out of memory */ return false; } - memcpy((*accounted_topology), logical_used_topology, sizeof(char)*logical_used_topology_length); - - /* revert all accounting from other jobs */ + /* revert all accounting from other jobs */ for (i = 0; i < logical_used_topology_length; i++) { if ((*accounted_topology)[i] == 'c') { (*accounted_topology)[i] = 'C'; From 0ef1705e186c1b85530e19a67fb408b9b2544e4e Mon Sep 17 00:00:00 2001 From: Joachim Gabler Date: Sat, 26 Oct 2024 09:37:14 +0200 Subject: [PATCH 3/3] BF: CS-719 do full valgrind test on master branch (9.0.1) (#20) // additional cleanup in binding code --- source/libs/sgeobj/sge_binding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/sgeobj/sge_binding.cc b/source/libs/sgeobj/sge_binding.cc index f5420d253..b0d5f39ef 100644 --- a/source/libs/sgeobj/sge_binding.cc +++ b/source/libs/sgeobj/sge_binding.cc @@ -3083,7 +3083,7 @@ static bool create_topology_used_per_job(char** accounted_topology, int* account (*accounted_topology_length) = logical_used_topology_length; /* copy string of current topology in use */ - *accounted_topology = strdup(logical_used_topology); + *accounted_topology = sge_strdup(nullptr, logical_used_topology); if (*accounted_topology == nullptr) { /* out of memory */ return false;