Skip to content

Commit b7d7230

Browse files
committed
Fix the data server
Refactor the data server code so it is easier to read. The difficulty made it easy to make mistakes. Ensure we do not treat all publish/lookup operations as having "first-read" persistence. Track data in list format so that it can be removed when necessary (e.g., on read) and make it clearer when all info has been removed so that the data object can be released. Correctly implement the purge operation. Signed-off-by: Ralph Castain <[email protected]>
1 parent 3deb4c3 commit b7d7230

20 files changed

+1530
-1009
lines changed

.github/workflows/dvm.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,11 @@ jobs:
7676
prterun -n 4 ./openpmix/master/examples/pubi
7777
if: ${{ true }}
7878
timeout-minutes: 5
79+
80+
- name: Run pub-lookup stress test
81+
run: |
82+
export PATH=$RUNNER_TEMP/prteinstall/bin:${PATH}
83+
export LD_LIBRARY_PATH=$RUNNER_TEMP/prteinstall/lib:${LD_LIBRARY_PATH}
84+
prterun -n 5 ./openpmix/master/examples/pubstress 30
85+
if: ${{ true }}
86+
timeout-minutes: 10

src/mca/state/base/state_base_fns.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
#include "src/mca/rmaps/rmaps_types.h"
4040
#include "src/rml/rml.h"
4141
#include "src/prted/pmix/pmix_server_internal.h"
42-
#include "src/runtime/prte_data_server.h"
42+
#include "src/runtime/data_server/prte_data_server.h"
4343
#include "src/runtime/prte_globals.h"
4444
#include "src/runtime/prte_wait.h"
4545
#include "src/threads/pmix_threads.h"

src/mca/state/base/state_base_options.c

+1-2
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
* and Technology (RIST). All rights reserved.
66
* Copyright (c) 2020 IBM Corporation. All rights reserved.
77
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
8-
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
8+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
99
* $COPYRIGHT$
1010
*
1111
* Additional copyrights may follow
@@ -40,7 +40,6 @@
4040
#include "src/mca/rmaps/rmaps_types.h"
4141
#include "src/rml/rml.h"
4242
#include "src/prted/pmix/pmix_server_internal.h"
43-
#include "src/runtime/prte_data_server.h"
4443
#include "src/runtime/prte_globals.h"
4544
#include "src/runtime/prte_wait.h"
4645
#include "src/threads/pmix_threads.h"

src/mca/state/dvm/state_dvm.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
#include "src/mca/ras/base/base.h"
4242
#include "src/mca/rmaps/base/base.h"
4343
#include "src/rml/rml.h"
44-
#include "src/runtime/prte_data_server.h"
44+
#include "src/runtime/data_server/prte_data_server.h"
4545
#include "src/runtime/prte_quit.h"
4646
#include "src/runtime/prte_wait.h"
4747
#include "src/threads/pmix_threads.h"

src/mca/state/prted/state_prted.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
55
* Copyright (c) 2020-2021 IBM Corporation. All rights reserved.
66
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
7-
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
7+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
88
* $COPYRIGHT$
99
*
1010
* Additional copyrights may follow
@@ -28,7 +28,7 @@
2828
#include "src/mca/rmaps/rmaps_types.h"
2929
#include "src/rml/rml.h"
3030
#include "src/prted/pmix/pmix_server_internal.h"
31-
#include "src/runtime/prte_data_server.h"
31+
#include "src/runtime/data_server/prte_data_server.h"
3232
#include "src/runtime/prte_quit.h"
3333
#include "src/threads/pmix_threads.h"
3434
#include "src/util/pmix_output.h"

src/pmix/pmix-internal.h

+1-11
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Copyright (c) 2019 Research Organization for Information Science
77
* and Technology (RIST). All rights reserved.
88
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
9-
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
9+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
1010
* $COPYRIGHT$
1111
*
1212
* Additional copyrights may follow
@@ -50,16 +50,6 @@ typedef struct {
5050
} prte_pmix_app_t;
5151
PMIX_CLASS_DECLARATION(prte_pmix_app_t);
5252

53-
/* define a caddy for pointing to pmix_info_t that
54-
* are to be included in an answer */
55-
typedef struct {
56-
pmix_list_item_t super;
57-
pmix_proc_t source;
58-
pmix_info_t *info;
59-
pmix_persistence_t persistence;
60-
} prte_ds_info_t;
61-
PMIX_CLASS_DECLARATION(prte_ds_info_t);
62-
6353
/* define another caddy for putting statically defined
6454
* pmix_info_t objects on a list */
6555
typedef struct {

src/pmix/pmix.c

+1-9
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* All rights reserved.
1010
* Copyright (c) 2016-2020 Cisco Systems, Inc. All rights reserved
1111
* Copyright (c) 2020 IBM Corporation. All rights reserved.
12-
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
12+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
1313
* $COPYRIGHT$
1414
*
1515
* Additional copyrights may follow
@@ -504,14 +504,6 @@ static void ades(prte_pmix_app_t *p)
504504
}
505505
PMIX_CLASS_INSTANCE(prte_pmix_app_t, pmix_list_item_t, acon, ades);
506506

507-
static void dsicon(prte_ds_info_t *p)
508-
{
509-
PMIX_PROC_CONSTRUCT(&p->source);
510-
p->info = NULL;
511-
p->persistence = PMIX_PERSIST_INVALID;
512-
}
513-
PRTE_EXPORT PMIX_CLASS_INSTANCE(prte_ds_info_t, pmix_list_item_t, dsicon, NULL);
514-
515507
static void infoitmcon(prte_info_item_t *p)
516508
{
517509
PMIX_INFO_CONSTRUCT(&p->info);

src/prted/pmix/pmix_server.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
#include "src/mca/ras/base/ras_private.h"
6969
#include "src/rml/rml_contact.h"
7070
#include "src/rml/rml.h"
71-
#include "src/runtime/prte_data_server.h"
71+
#include "src/runtime/data_server/prte_data_server.h"
7272
#include "src/runtime/prte_globals.h"
7373
#include "src/threads/pmix_threads.h"
7474
#include "src/util/name_fns.h"

src/prted/pmix/pmix_server_pub.c

+71-33
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
* All rights reserved.
1919
* Copyright (c) 2014-2016 Research Organization for Information Science
2020
* and Technology (RIST). All rights reserved.
21-
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
21+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
2222
* $COPYRIGHT$
2323
*
2424
* Additional copyrights may follow
@@ -40,7 +40,7 @@
4040
#include "src/mca/errmgr/errmgr.h"
4141
#include "src/rml/rml_contact.h"
4242
#include "src/rml/rml.h"
43-
#include "src/runtime/prte_data_server.h"
43+
#include "src/runtime/data_server/prte_data_server.h"
4444
#include "src/runtime/prte_globals.h"
4545
#include "src/threads/pmix_threads.h"
4646
#include "src/util/name_fns.h"
@@ -371,17 +371,51 @@ pmix_status_t pmix_server_unpublish_fn(const pmix_proc_t *proc, char **keys,
371371
{
372372
pmix_server_req_t *req;
373373
int ret;
374-
uint8_t cmd = PRTE_PMIX_UNPUBLISH_CMD;
374+
uint8_t cmd;
375375
size_t m, n;
376376
pmix_status_t rc;
377377

378+
// check for a "purge" command
379+
if (NULL == keys) {
380+
/* create the caddy */
381+
req = PMIX_NEW(pmix_server_req_t);
382+
pmix_asprintf(&req->operation, "PURGE: %s:%d", __FILE__, __LINE__);
383+
req->opcbfunc = cbfunc;
384+
req->cbdata = cbdata;
385+
386+
/* load the command */
387+
cmd = PRTE_PMIX_PURGE_PROC_CMD;
388+
if (PRTE_SUCCESS != (ret = PMIx_Data_pack(NULL, &req->msg, &cmd, 1, PMIX_UINT8))) {
389+
PRTE_ERROR_LOG(ret);
390+
PMIX_RELEASE(req);
391+
return PMIX_ERR_PACK_FAILURE;
392+
}
393+
394+
/* pack the name of the requestor */
395+
if (PMIX_SUCCESS
396+
!= (rc = PMIx_Data_pack(NULL, &req->msg, (pmix_proc_t *) proc, 1, PMIX_PROC))) {
397+
PMIX_ERROR_LOG(rc);
398+
PMIX_RELEASE(req);
399+
return rc;
400+
}
401+
402+
/* thread-shift so we can store the tracker */
403+
prte_event_set(prte_event_base, &(req->ev), -1, PRTE_EV_WRITE, execute, req);
404+
PMIX_POST_OBJECT(req);
405+
prte_event_active(&(req->ev), PRTE_EV_WRITE, 1);
406+
407+
return PRTE_SUCCESS;
408+
}
409+
410+
378411
/* create the caddy */
379412
req = PMIX_NEW(pmix_server_req_t);
380413
pmix_asprintf(&req->operation, "UNPUBLISH: %s:%d", __FILE__, __LINE__);
381414
req->opcbfunc = cbfunc;
382415
req->cbdata = cbdata;
383416

384417
/* load the command */
418+
cmd = PRTE_PMIX_UNPUBLISH_CMD;
385419
if (PRTE_SUCCESS != (ret = PMIx_Data_pack(NULL, &req->msg, &cmd, 1, PMIX_UINT8))) {
386420
PRTE_ERROR_LOG(ret);
387421
PMIX_RELEASE(req);
@@ -456,14 +490,14 @@ void pmix_server_keyval_client(int status, pmix_proc_t *sender,
456490
pmix_server_req_t *req = NULL;
457491
pmix_byte_object_t bo;
458492
pmix_data_buffer_t pbkt;
459-
pmix_status_t ret = PMIX_SUCCESS, rt = PMIX_SUCCESS;
493+
pmix_status_t ret = PMIX_SUCCESS;
460494
pmix_info_t info;
461495
pmix_pdata_t *pdata = NULL;
462496
size_t n, npdata = 0;
463-
PRTE_HIDE_UNUSED_PARAMS(sender, tg, cbdata);
497+
PRTE_HIDE_UNUSED_PARAMS(status, sender, tg, cbdata);
464498

465499
pmix_output_verbose(1, prte_pmix_server_globals.output,
466-
"%s recvd lookup data return",
500+
"%s recvd data server return",
467501
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
468502

469503
/* unpack the room number of the request tracker */
@@ -485,23 +519,16 @@ void pmix_server_keyval_client(int status, pmix_proc_t *sender,
485519

486520
/* unpack the return status */
487521
cnt = 1;
488-
rc = PMIx_Data_unpack(NULL, buffer, &status, &cnt, PMIX_INT);
522+
rc = PMIx_Data_unpack(NULL, buffer, &ret, &cnt, PMIX_STATUS);
489523
if (PMIX_SUCCESS != rc) {
490524
PMIX_ERROR_LOG(rc);
491-
ret = PMIX_ERR_UNPACK_FAILURE;
525+
ret = rc;
492526
goto release;
493527
}
494528

495-
if (PRTE_ERR_NOT_FOUND == status) {
496-
ret = PMIX_ERR_NOT_FOUND;
497-
goto release;
498-
} else if (PRTE_ERR_PARTIAL_SUCCESS == status) {
499-
rt = PMIX_QUERY_PARTIAL_SUCCESS;
500-
} else {
501-
ret = PMIX_SUCCESS;
502-
}
503-
if (PRTE_PMIX_UNPUBLISH_CMD == command) {
504-
/* nothing else will be included */
529+
if (PMIX_ERR_NOT_FOUND == ret ||
530+
PRTE_PMIX_UNPUBLISH_CMD == command ||
531+
PRTE_PMIX_PUBLISH_CMD == command) {
505532
goto release;
506533
}
507534

@@ -512,9 +539,12 @@ void pmix_server_keyval_client(int status, pmix_proc_t *sender,
512539
* command will not return any data if no matching pending
513540
* requests were found */
514541
if (PMIX_SUCCESS != rc) {
515-
if (PMIX_SUCCESS == ret) {
516-
ret = rt;
542+
if (PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER == rc) {
543+
// not necessarily an error, so don't log it
544+
goto release;
517545
}
546+
PMIX_ERROR_LOG(rc);
547+
ret = rc;
518548
goto release;
519549
}
520550

@@ -523,12 +553,19 @@ void pmix_server_keyval_client(int status, pmix_proc_t *sender,
523553
rc = PMIx_Data_load(&pbkt, &bo);
524554
bo.bytes = NULL;
525555
PMIX_BYTE_OBJECT_DESTRUCT(&bo);
556+
if (PMIX_SUCCESS != rc) {
557+
PMIX_ERROR_LOG(rc);
558+
ret = rc;
559+
goto release;
560+
}
526561

527562
/* unpack the number of data items */
528563
cnt = 1;
529-
if (PMIX_SUCCESS != (ret = PMIx_Data_unpack(NULL, &pbkt, &npdata, &cnt, PMIX_SIZE))) {
530-
PMIX_ERROR_LOG(ret);
564+
rc = PMIx_Data_unpack(NULL, &pbkt, &npdata, &cnt, PMIX_SIZE);
565+
if (PMIX_SUCCESS != rc) {
566+
PMIX_ERROR_LOG(rc);
531567
PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
568+
ret = rc;
532569
goto release;
533570
}
534571

@@ -537,31 +574,32 @@ void pmix_server_keyval_client(int status, pmix_proc_t *sender,
537574
for (n = 0; n < npdata; n++) {
538575
PMIX_INFO_CONSTRUCT(&info);
539576
cnt = 1;
540-
if (PMIX_SUCCESS
541-
!= (ret = PMIx_Data_unpack(NULL, &pbkt, &pdata[n].proc, &cnt, PMIX_PROC))) {
542-
PMIX_ERROR_LOG(ret);
577+
rc = PMIx_Data_unpack(NULL, &pbkt, &pdata[n].proc, &cnt, PMIX_PROC);
578+
if (PMIX_SUCCESS != rc) {
579+
PMIX_ERROR_LOG(rc);
543580
PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
581+
ret = rc;
544582
goto release;
545583
}
546584
cnt = 1;
547-
if (PMIX_SUCCESS != (ret = PMIx_Data_unpack(NULL, &pbkt, &info, &cnt, PMIX_INFO))) {
548-
PMIX_ERROR_LOG(ret);
585+
rc = PMIx_Data_unpack(NULL, &pbkt, &info, &cnt, PMIX_INFO);
586+
if (PMIX_SUCCESS != rc) {
587+
PMIX_ERROR_LOG(rc);
549588
PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
589+
ret = rc;
550590
goto release;
551591
}
552592
PMIX_LOAD_KEY(pdata[n].key, info.key);
553-
PMIX_VALUE_XFER_DIRECT(ret, &pdata[n].value, &info.value);
554-
if (PMIX_SUCCESS != ret) {
555-
PMIX_ERROR_LOG(ret);
593+
PMIX_VALUE_XFER_DIRECT(rc, &pdata[n].value, &info.value);
594+
if (PMIX_SUCCESS != rc) {
595+
PMIX_ERROR_LOG(rc);
556596
PMIX_DATA_BUFFER_DESTRUCT(&pbkt);
597+
ret = rc;
557598
goto release;
558599
}
559600
PMIX_INFO_DESTRUCT(&info);
560601
}
561602
}
562-
if (PMIX_SUCCESS == ret) {
563-
ret = rt;
564-
}
565603

566604
release:
567605
if (0 <= room_num) {

src/runtime/Makefile.am

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# Copyright (c) 2012 Los Alamos National Security, LLC.
1515
# All rights reserved.
1616
# Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
17-
# Copyright (c) 2021 Nanook Consulting. All rights reserved.
17+
# Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
1818
# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
1919
# $COPYRIGHT$
2020
#
@@ -32,7 +32,6 @@ headers += \
3232
runtime/prte_quit.h \
3333
runtime/runtime_internals.h \
3434
runtime/prte_wait.h \
35-
runtime/prte_data_server.h \
3635
runtime/prte_progress_threads.h
3736

3837
libprrte_la_SOURCES += \
@@ -47,5 +46,6 @@ libprrte_la_SOURCES += \
4746
runtime/data_type_support/prte_dt_unpacking_fns.c \
4847
runtime/prte_mca_params.c \
4948
runtime/prte_wait.c \
50-
runtime/prte_data_server.c \
5149
runtime/prte_progress_threads.c
50+
51+
include runtime/data_server/Makefile.am

src/runtime/data_server/Makefile.am

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# -*- makefile -*-
2+
#
3+
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4+
# University Research and Technology
5+
# Corporation. All rights reserved.
6+
# Copyright (c) 2004-2005 The University of Tennessee and The University
7+
# of Tennessee Research Foundation. All rights
8+
# reserved.
9+
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10+
# University of Stuttgart. All rights reserved.
11+
# Copyright (c) 2004-2005 The Regents of the University of California.
12+
# All rights reserved.
13+
# Copyright (c) 2010-2020 Cisco Systems, Inc. All rights reserved
14+
# Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
15+
# Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
16+
# $COPYRIGHT$
17+
#
18+
# Additional copyrights may follow
19+
#
20+
# $HEADER$
21+
#
22+
23+
# This makefile.am does not stand on its own - it is included from src/runtime/Makefile.am
24+
25+
headers += \
26+
runtime/data_server/prte_data_server.h \
27+
runtime/data_server/ds.h
28+
29+
libprrte_la_SOURCES += \
30+
runtime/data_server/ds_main.c \
31+
runtime/data_server/ds_publish.c \
32+
runtime/data_server/ds_lookup.c \
33+
runtime/data_server/ds_unpublish.c \
34+
runtime/data_server/ds_purge.c

0 commit comments

Comments
 (0)