Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BF: CS-802 sge_qmaster core dump when processing qstat request #32

Merged
merged 1 commit into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ if (NOT CMAKE_BUILD_TYPE)
"Debug"
CACHE STRING "select build type between Debug and Release" FORCE)
endif ()
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DENABLE_DEBUG_CHECKS")

# 3RD_PARTY OPTIONS
option(WITH_OS_3RDPARTY "Use 3rdparty libraries provided by OS packages" OFF)
Expand Down
2 changes: 1 addition & 1 deletion source/daemons/qmaster/sge_c_gdi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ sge_c_gdi_get_in_worker(gdi_object_t *ao, sge_gdi_packet_class_t *packet, sge_gd

/*
* DIRTY HACK: The "ok" message should be removed from the answer list
* 05/21/2007 qualitiy was ANSWER_QUALITY_INFO but this results in "ok"
* 05/21/2007 quality was ANSWER_QUALITY_INFO but this results in "ok"
* messages on qconf side
*/
snprintf(SGE_EVENT, SGE_EVENT_SIZE, SFNMAX, MSG_GDI_OKNL);
Expand Down
10 changes: 10 additions & 0 deletions source/daemons/qmaster/sge_thread_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ sge_reader_main(void *arg) {
component_set_thread_id(thread_id);
DPRINTF(SFN "(%d) started\n", thread_name, thread_id);

// this thread will use the READER data store
ocs::DataStore::select_active_ds(ocs::DataStore::Id::READER);

// init monitoring
cl_thread_func_startup(thread_config);
sge_monitor_init(p_monitor, thread_config->thread_name, GDI_EXT, RT_WARNING, RT_ERROR);
Expand Down Expand Up @@ -199,6 +202,13 @@ sge_reader_main(void *arg) {
is_only_read_request = false;
}

#if defined (ENABLE_DEBUG_CHECKS)
if (!is_only_read_request) {
CRITICAL("reader thread tries to execute write request");
abort();
}
#endif

/*
* acquire the correct lock
*/
Expand Down
2 changes: 1 addition & 1 deletion source/libs/gdi/sge_gdi_packet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ sge_gdi_task_create(sge_gdi_packet_class_t *packet, lList **answer_list, u_long3
* NOTES
* MT-NOTE: sge_gdi_task_free() is MT safe as long as the structure
* passed to this function is not accessed by more than one
* thread simultaniously.
* thread simultaneously.
*
* SEE ALSO
* gdi/request_internal/sge_gdi_task_create()
Expand Down
2 changes: 2 additions & 0 deletions source/libs/gdi/sge_gdi_packet_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#include "gdi/sge_gdi_packet_type.h"
#include "uti/sge_tq.h"

#include "sge_gdi_packet_type.h"

extern sge_tq_queue_t *GlobalRequestQueue;
extern sge_tq_queue_t *ReaderRequestQueue;
extern sge_tq_queue_t *ReaderWaitingRequestQueue;
Expand Down
5 changes: 5 additions & 0 deletions source/libs/gdi/sge_gdi_packet_pb_cull.cc
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,11 @@ sge_gdi_packet_pack_task(sge_gdi_packet_class_t *packet, sge_gdi_task_class_t *t
DRETURN(ret);
error_with_mapping:
ret = sge_gdi_map_pack_errors(pack_ret, answer_list);
if (task->do_select_pack_simultaneous) {
// data_list references a master list
// avoid it being freed when the packet/task gets freed
task->data_list = nullptr;
}
DRETURN(ret);
}

28 changes: 27 additions & 1 deletion source/libs/sgeobj/ocs_DataStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <pthread.h>

#include "uti/sge_log.h"
#include "uti/sge_rmon_macros.h"

#include "basis_types.h"
Expand Down Expand Up @@ -118,10 +119,35 @@ namespace ocs {
* @return pointer to the master list; never nullptr.
*/
lList **
DataStore::get_master_list_rw(sge_object_type type) {
DataStore::get_master_list_rw(sge_object_type type, bool for_read) {
DENTER(DATA_STORE_LAYER);
GET_SPECIFIC(obj_thread_local_t, obj_state, obj_state_init, obj_state_key);

#if defined (ENABLE_DEBUG_CHECKS)
auto ds_id = obj_state->ds_id;
const char *thread_name = component_get_thread_name();
if (thread_name != nullptr) {
if (strcmp(thread_name, "worker") == 0 && ds_id != DataStore::Id::GLOBAL) {
CRITICAL("Worker thread is trying to access data store %d for list %d", ds_id, type);
abort();
}

if (strcmp(thread_name, "reader") == 0) {
if (ds_id != DataStore::Id::READER) {
CRITICAL("Reader thread is trying to access data store %d for list %d", ds_id, type);
abort();
}
// @todo enable once CS-825 is fixed
#if 0
if (!for_read) {
CRITICAL("Reader thread is trying to get master list with write access");
abort();
}
#endif
}
}
#endif

lList **ret;
ret = &(obj_thread_shared.data_store[obj_state->ds_id].master_list[type]);
#ifdef OBSERVE
Expand Down
4 changes: 2 additions & 2 deletions source/libs/sgeobj/ocs_DataStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace ocs {
select_active_ds(ocs::DataStore::Id ds_id);

static lList **
get_master_list_rw(sge_object_type type);
get_master_list_rw(sge_object_type type, bool for_read = false);

/**
* Returns a master list (read-only access) from the data store that is currently active for the calling thread
Expand All @@ -50,7 +50,7 @@ namespace ocs {
*/
static inline const lList **
get_master_list(sge_object_type type) {
return const_cast<const lList **>(ocs::DataStore::get_master_list_rw(type));
return const_cast<const lList **>(ocs::DataStore::get_master_list_rw(type, true));
}

static void
Expand Down
Loading