Skip to content

Commit 38a7fbb

Browse files
committed
Fix for issue #13432
@mentOS31 correctly identified few issues with the handling of persistent requests with regard to their error codes (more info in the issue). This PR fixes all wait and test function to have a consistent outcome for all types of requests. It also prevents the error handler invocation from releasing persistent requests. Signed-off-by: George Bosilca <[email protected]>
1 parent 333d8ad commit 38a7fbb

File tree

3 files changed

+43
-47
lines changed

3 files changed

+43
-47
lines changed

ompi/errhandler/errhandler_invoke.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* and Technology (RIST). All rights reserved.
1919
* Copyright (c) 2023 Triad National Security, LLC. All rights
2020
* reserved.
21+
* Copyright (c) 2025 NVIDIA Corporation. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -180,6 +181,7 @@ int ompi_errhandler_request_invoke(int count,
180181
that had an error. */
181182
for (; i < count; ++i) {
182183
if (MPI_REQUEST_NULL != requests[i] &&
184+
!requests[i]->req_persistent &&
183185
MPI_SUCCESS != requests[i]->req_status.MPI_ERROR) {
184186
#if OPAL_ENABLE_FT_MPI
185187
/* Special case for MPI_ANY_SOURCE when marked as

ompi/request/req_test.c

Lines changed: 26 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2010-2012 Oracle and/or its affiliates. All rights reserved.
1515
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
16+
* Copyright (c) 2025 NVIDIA Corporation. All rights reserved.
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -136,7 +137,7 @@ int ompi_request_default_test_any(
136137

137138
if( request->req_persistent ) {
138139
request->req_state = OMPI_REQUEST_INACTIVE;
139-
return OMPI_SUCCESS;
140+
return request->req_status.MPI_ERROR;
140141
}
141142
/* If there is an error on the request, don't free it */
142143
if (MPI_SUCCESS != request->req_status.MPI_ERROR) {
@@ -248,27 +249,26 @@ int ompi_request_default_test_all(
248249
ompi_grequest_invoke_query(request, &request->req_status);
249250
}
250251
OMPI_COPY_STATUS(&statuses[i], request->req_status, true);
251-
if( request->req_persistent ) {
252-
request->req_state = OMPI_REQUEST_INACTIVE;
253-
continue;
254-
}
255-
/* MPI-2:4.5.1 says that we can return MPI_ERR_IN_STATUS
256-
even if MPI_STATUSES_IGNORE was used. Woot! */
257-
/* Only free the request if there was no error on it */
258252
if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
259-
int tmp = ompi_request_free(rptr);
260-
if (tmp != OMPI_SUCCESS) {
261-
return tmp;
262-
}
263-
} else {
264253
rc = MPI_ERR_IN_STATUS;
265254
#if OPAL_ENABLE_FT_MPI
266255
if (MPI_ERR_PROC_FAILED == request->req_status.MPI_ERROR
267-
|| MPI_ERR_REVOKED == request->req_status.MPI_ERROR) {
256+
|| MPI_ERR_REVOKED == request->req_status.MPI_ERROR) {
268257
rc = request->req_status.MPI_ERROR;
269258
}
270259
#endif /* OPAL_ENABLE_FT_MPI */
271260
}
261+
if (request->req_persistent) {
262+
request->req_state = OMPI_REQUEST_INACTIVE;
263+
} else if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
264+
/* MPI-2:4.5.1 says that we can return MPI_ERR_IN_STATUS
265+
even if MPI_STATUSES_IGNORE was used. Woot! */
266+
/* Only free the request if there was no error on it */
267+
int tmp = ompi_request_free(rptr);
268+
if (tmp != OMPI_SUCCESS) {
269+
return tmp;
270+
}
271+
}
272272
}
273273
} else {
274274
/* free request if required */
@@ -283,25 +283,24 @@ int ompi_request_default_test_all(
283283
if (OMPI_REQUEST_GEN == request->req_type) {
284284
ompi_grequest_invoke_query(request, &request->req_status);
285285
}
286-
if( request->req_persistent ) {
287-
request->req_state = OMPI_REQUEST_INACTIVE;
288-
continue;
289-
}
290-
/* Only free the request if there was no error */
291-
if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
292-
int tmp = ompi_request_free(rptr);
293-
if (tmp != OMPI_SUCCESS) {
294-
return tmp;
295-
}
296-
} else {
286+
if (MPI_SUCCESS != request->req_status.MPI_ERROR) {
297287
rc = MPI_ERR_IN_STATUS;
298288
#if OPAL_ENABLE_FT_MPI
299289
if (MPI_ERR_PROC_FAILED == request->req_status.MPI_ERROR
300-
|| MPI_ERR_REVOKED == request->req_status.MPI_ERROR) {
290+
|| MPI_ERR_REVOKED == request->req_status.MPI_ERROR) {
301291
rc = request->req_status.MPI_ERROR;
302292
}
303293
#endif /* OPAL_ENABLE_FT_MPI */
304294
}
295+
if (request->req_persistent) {
296+
request->req_state = OMPI_REQUEST_INACTIVE;
297+
} else if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
298+
/* Only free the request if there was no error */
299+
int tmp = ompi_request_free(rptr);
300+
if (tmp != OMPI_SUCCESS) {
301+
return tmp;
302+
}
303+
}
305304
}
306305
}
307306

@@ -398,7 +397,7 @@ int ompi_request_default_test_some(
398397
#endif /* OPAL_ENABLE_FT_MPI */
399398
}
400399

401-
if( request->req_persistent ) {
400+
if (request->req_persistent) {
402401
request->req_state = OMPI_REQUEST_INACTIVE;
403402
} else {
404403
/* Only free the request if there was no error */

ompi/request/req_wait.c

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
1919
* Copyright (c) 2016 Research Organization for Information Science
2020
* and Technology (RIST). All rights reserved.
21+
* Copyright (c) 2025 NVIDIA Corporation. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -228,7 +229,7 @@ int ompi_request_default_wait_all( size_t count,
228229
size_t i, completed = 0, failed = 0;
229230
ompi_request_t **rptr;
230231
ompi_request_t *request;
231-
int mpi_error = OMPI_SUCCESS;
232+
int mpi_error = OMPI_SUCCESS, rc;
232233
ompi_wait_sync_t sync;
233234

234235
if (OPAL_UNLIKELY(0 == count)) {
@@ -356,13 +357,12 @@ int ompi_request_default_wait_all( size_t count,
356357

357358
if( request->req_persistent ) {
358359
request->req_state = OMPI_REQUEST_INACTIVE;
359-
continue;
360-
}
361-
/* Only free the request if there is no error on it */
362-
if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
360+
} else if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
361+
/* Only free the request if there is no error on it */
362+
363363
/* If there's an error while freeing the request,
364-
assume that the request is still there.
365-
Otherwise, Bad Things will happen later! */
364+
assume that the request is still there.
365+
Otherwise, Bad Things will happen later! */
366366
int tmp = ompi_request_free(rptr);
367367
if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) {
368368
mpi_error = tmp;
@@ -373,7 +373,6 @@ int ompi_request_default_wait_all( size_t count,
373373
}
374374
}
375375
} else {
376-
int rc;
377376
/* free request if required */
378377
for( i = 0; i < count; i++, rptr++ ) {
379378
void *_tmp_ptr = &sync;
@@ -382,8 +381,9 @@ int ompi_request_default_wait_all( size_t count,
382381

383382
if( request->req_state == OMPI_REQUEST_INACTIVE ) {
384383
rc = ompi_status_empty.MPI_ERROR;
385-
goto absorb_error_and_continue;
384+
continue;
386385
}
386+
rc = OMPI_SUCCESS;
387387
/*
388388
* Assert only if no requests were failed.
389389
* Since some may still be pending.
@@ -406,7 +406,8 @@ int ompi_request_default_wait_all( size_t count,
406406
rc = MPI_ERR_PROC_FAILED_PENDING;
407407
}
408408
#endif /* OPAL_ENABLE_FT_MPI */
409-
goto absorb_error_and_continue;
409+
mpi_error = MPI_ERR_IN_STATUS;
410+
continue;
410411
}
411412
}
412413
assert( REQUEST_COMPLETE(request) );
@@ -417,18 +418,12 @@ int ompi_request_default_wait_all( size_t count,
417418
rc = ompi_grequest_invoke_query(request, &request->req_status);
418419
}
419420

420-
rc = request->req_status.MPI_ERROR;
421-
422-
if( request->req_persistent ) {
421+
if (request->req_persistent) {
423422
request->req_state = OMPI_REQUEST_INACTIVE;
424-
} else if (MPI_SUCCESS == rc) {
423+
} else if (MPI_SUCCESS == request->req_status.MPI_ERROR) {
425424
/* Only free the request if there is no error on it */
426-
int tmp = ompi_request_free(rptr);
427-
if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) {
428-
mpi_error = tmp;
429-
}
425+
rc = ompi_request_free(rptr);
430426
}
431-
absorb_error_and_continue:
432427
#if OPAL_ENABLE_FT_MPI
433428
if( (MPI_ERR_PROC_FAILED == rc) || (MPI_ERR_REVOKED == rc) ) {
434429
mpi_error = rc;
@@ -441,7 +436,7 @@ int ompi_request_default_wait_all( size_t count,
441436
* passed to that function."
442437
* So we should do so here as well.
443438
*/
444-
if( OMPI_SUCCESS == mpi_error && rc != OMPI_SUCCESS) {
439+
if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != rc) {
445440
mpi_error = MPI_ERR_IN_STATUS;
446441
}
447442
}

0 commit comments

Comments
 (0)