diff --git a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala index 8f8312023f..9861456731 100644 --- a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala @@ -96,6 +96,10 @@ trait StandardAsyncExecutionActor val SIGTERM = 143 val SIGINT = 130 val SIGKILL = 137 + // Unlike the codes above (128 plus a signal number), 247 does not correspond to a signal. Per Gemini (an LLM, + // so TODO confirm against container runtime documentation): in Linux or containerized environments like Docker, + // exit code 247 often indicates termination due to resource limits, most commonly insufficient memory (RAM). + val SIGCONTAINERKILL = 247 // `CheckingForMemoryRetry` action exits with code 0 if the stderr file contains keys mentioned in `memory-retry` config. val StderrContainsRetryKeysCode = 0 @@ -963,7 +967,9 @@ trait StandardAsyncExecutionActor * @param returnCode The return code. * @return True if the return code is for an abort. */ - def isAbort(returnCode: Int): Boolean = returnCode == SIGINT || returnCode == SIGTERM || returnCode == SIGKILL + def isAbortRc(returnCode: Int): Boolean = returnCode == SIGINT || returnCode == SIGTERM || returnCode == SIGKILL + + def isOomKillRc(returnCode: Int): Boolean = returnCode == SIGKILL || returnCode == SIGCONTAINERKILL /** * Custom behavior to run after an abort signal is processed. @@ -1216,12 +1222,15 @@ trait StandardAsyncExecutionActor // See executeOrRecoverSuccess private var missedAbort = false + // Records whether abort has been requested for this job.
+ private var abortRequested = false private case class CheckMissedAbort(jobId: StandardAsyncJob) context.become(kvClientReceive orElse standardReceiveBehavior(None) orElse slowJobWarningReceive orElse receive) def standardReceiveBehavior(jobIdOption: Option[StandardAsyncJob]): Receive = LoggingReceive { case AbortJobCommand => + abortRequested = true jobIdOption match { case Some(jobId) => Try(tryAbort(jobId)) match { @@ -1241,7 +1250,9 @@ trait StandardAsyncExecutionActor val executeOrRecoverFuture = mode match { case Reconnect(jobId: StandardAsyncJob @unchecked) => reconnectAsync(jobId) - case ReconnectToAbort(jobId: StandardAsyncJob @unchecked) => reconnectToAbortAsync(jobId) + case ReconnectToAbort(jobId: StandardAsyncJob @unchecked) => + abortRequested = true + reconnectToAbortAsync(jobId) case Recover(jobId: StandardAsyncJob @unchecked) => recoverAsync(jobId) case _ => tellMetadata(startMetadataKeyValues) @@ -1430,8 +1441,9 @@ trait StandardAsyncExecutionActor outOfMemoryDetected <- memoryRetryRC } yield (stderrSize, returnCodeAsString, outOfMemoryDetected) - stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, outOfMemoryDetected) => + stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, isStderrOomDetected) => val tryReturnCodeAsInt = Try(returnCodeAsString.trim.toInt) + def isOomDetected(rc: Int): Boolean = isOomKillRc(rc) || isStderrOomDetected if (isDone(status)) { tryReturnCodeAsInt match { @@ -1445,9 +1457,11 @@ trait StandardAsyncExecutionActor retryElseFail(executionHandle) case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) => handleExecutionSuccess(status, oldHandle, returnCodeAsInt) - // It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case; - // if it was caused by OOM killer, want to handle as OOM and not job abort. 
- case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested => + // Return code 137 (SIGKILL) is ambiguous: an abort and an OOM kill can both produce it, and the memory-retry case + // below may also match it. Only report an abort when one was actually requested; keep this case ahead of that one. + case Success(returnCodeAsInt) if abortRequested && isAbortRc(returnCodeAsInt) => + Future.successful(AbortedExecutionHandle) + case Success(returnCodeAsInt) if memoryRetryRequested && isOomDetected(returnCodeAsInt) => val executionHandle = Future.successful( FailedNonRetryableExecutionHandle( RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), @@ -1456,10 +1470,8 @@ trait StandardAsyncExecutionActor ) ) retryElseFail(executionHandle, - MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier) + MemoryRetryResult(isOomDetected(returnCodeAsInt), memoryRetryFactor, previousMemoryMultiplier) ) - case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => - Future.successful(AbortedExecutionHandle) case Success(returnCodeAsInt) => val executionHandle = Future.successful( FailedNonRetryableExecutionHandle(WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption), @@ -1479,7 +1491,9 @@ trait StandardAsyncExecutionActor } else { tryReturnCodeAsInt match { case Success(returnCodeAsInt) - if outOfMemoryDetected && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) => + if isOomDetected(returnCodeAsInt) && memoryRetryRequested && !continueOnReturnCode.continueFor( + returnCodeAsInt + ) => val executionHandle = Future.successful( FailedNonRetryableExecutionHandle( RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), @@ -1488,7 +1502,7 @@ trait StandardAsyncExecutionActor ) ) retryElseFail(executionHandle, - MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier) + MemoryRetryResult(isOomDetected(returnCodeAsInt), memoryRetryFactor, previousMemoryMultiplier) ) case _ => val failureStatus =
handleExecutionFailure(status, tryReturnCodeAsInt.toOption) diff --git a/centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_assorted_ooms.wdl b/centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_assorted_ooms.wdl new file mode 100644 index 0000000000..2d959a5494 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_assorted_ooms.wdl @@ -0,0 +1,67 @@ +version 1.0 + +workflow retry_with_more_memory_assorted_ooms { + call java_oom_kill + call python_oom_kill + call tail_dev_zero_oom_kill +} + +# A Java-based task that sets the maximum heap size to 64 GB on a VM that has nowhere near that much memory. This task +# should be OOM killed and retried with more memory. +task java_oom_kill { + command <<< + echo "MEM_SIZE=$MEM_SIZE" >&2 + echo "MEM_UNIT=$MEM_UNIT" >&2 + + cat > Mem.java << EOF + class Mem { + public static void main(String[] args) throws Exception { + int gb = (int)Math.pow(2, 30); + System.out.println("Allocating memory..."); + byte[][] byteArr = new byte[32][]; + for (int i = 0; i < byteArr.length; i++) { + byteArr[i] = new byte[gb]; + } + System.out.println("Sleeping a minute..."); + Thread.sleep(60_000); + System.out.printf("Heap size: %,.2f%n", (double)Runtime.getRuntime().totalMemory() / gb); + System.out.println(byteArr.hashCode()); + } + } + EOF + java -Xms64g -Xmx64g Mem.java + >>> + runtime { + docker: "eclipse-temurin:21" + memory: "1 GB" + maxRetries: 1 + } +} + + +task python_oom_kill { + command <<< + echo "MEM_SIZE=$MEM_SIZE" >&2 + echo "MEM_UNIT=$MEM_UNIT" >&2 + python3 -c 'print(len([0] * (2**34)))' + >>> + runtime { + docker: "google/cloud-sdk:slim" + memory: "1 GB" + maxRetries: 1 + } +} + + +task tail_dev_zero_oom_kill { + command <<< + echo "MEM_SIZE=$MEM_SIZE" >&2 + echo "MEM_UNIT=$MEM_UNIT" >&2 + tail /dev/zero + >>> + runtime { + docker: "ubuntu:latest" + memory: "1 GB" + maxRetries: 1 + } +} \ No 
newline at end of file diff --git a/centaur/src/main/resources/standardTestCases/retry_with_more_memory_assorted_ooms.test b/centaur/src/main/resources/standardTestCases/retry_with_more_memory_assorted_ooms.test new file mode 100644 index 0000000000..8724a8f00f --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/retry_with_more_memory_assorted_ooms.test @@ -0,0 +1,29 @@ +name: retry_with_more_memory_assorted_ooms +testFormat: workflowfailure +backends: [GCPBATCH] + +files { + workflow: retry_with_more_memory/retry_with_more_memory_assorted_ooms.wdl + options: retry_with_more_memory/retry_with_more_memory_2x.options +} + +metadata { + workflowName: retry_with_more_memory_assorted_ooms + status: Failed + "failures.0.message": "Workflow failed" + + "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.1.runtimeAttributes.memory": "1 GB" + "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.1.executionStatus": "RetryableFailure" + "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.2.runtimeAttributes.memory": "2 GB" + "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.2.executionStatus": "Failed" + + "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.1.runtimeAttributes.memory": "1 GB" + "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.1.executionStatus": "RetryableFailure" + "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.2.runtimeAttributes.memory": "2 GB" + "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.2.executionStatus": "Failed" + + "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.1.runtimeAttributes.memory": "1 GB" + "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.1.executionStatus": "RetryableFailure" + "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.2.runtimeAttributes.memory": "2 GB" + "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.2.executionStatus": "Failed" +}