Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ trait StandardAsyncExecutionActor
// Return codes observed when a job's process is terminated by a signal. By shell convention these are
// 128 + the signal number: SIGTERM (15), SIGINT (2) and SIGKILL (9) respectively.
val SIGTERM = 143
val SIGINT = 130
val SIGKILL = 137
// Explanation sourced from Gemini (LLM-generated; NOTE(review): not confirmed against official documentation):
// An exit code of 247, particularly in the context of process execution in Linux or containerized environments like
// Docker, often indicates a process termination due to resource limitations, most commonly insufficient memory (RAM).
// NOTE(review): despite the SIG prefix, 247 is not 128 + a standard signal number; the name presumably just
// follows the neighbouring constants rather than denoting a real signal.
val SIGCONTAINERKILL = 247
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variants has encountered this one in the wild a few times, but we don't currently have a test case to reproduce it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


// `CheckingForMemoryRetry` action exits with code 0 if the stderr file contains keys mentioned in `memory-retry` config.
val StderrContainsRetryKeysCode = 0
Expand Down Expand Up @@ -963,7 +967,9 @@ trait StandardAsyncExecutionActor
* @param returnCode The return code.
* @return True if the return code is for an abort.
*/
def isAbort(returnCode: Int): Boolean = returnCode == SIGINT || returnCode == SIGTERM || returnCode == SIGKILL
def isAbortRc(returnCode: Int): Boolean = returnCode == SIGINT || returnCode == SIGTERM || returnCode == SIGKILL

/**
  * Tests whether a return code is one of the codes treated as a possible out-of-memory kill.
  *
  * SIGKILL also counts as an abort code in isAbortRc, so a SIGKILL return code on its own does not
  * distinguish an OOM kill from an abort.
  *
  * @param returnCode The return code.
  * @return True if the return code equals SIGKILL or SIGCONTAINERKILL.
  */
def isOomKillRc(returnCode: Int): Boolean =
  returnCode match {
    case SIGKILL | SIGCONTAINERKILL => true
    case _ => false
  }

/**
* Custom behavior to run after an abort signal is processed.
Expand Down Expand Up @@ -1216,12 +1222,15 @@ trait StandardAsyncExecutionActor

// See executeOrRecoverSuccess
private var missedAbort = false
// Records whether abort has been requested for this job.
private var abortRequested = false
private case class CheckMissedAbort(jobId: StandardAsyncJob)

context.become(kvClientReceive orElse standardReceiveBehavior(None) orElse slowJobWarningReceive orElse receive)

def standardReceiveBehavior(jobIdOption: Option[StandardAsyncJob]): Receive = LoggingReceive {
case AbortJobCommand =>
abortRequested = true
jobIdOption match {
case Some(jobId) =>
Try(tryAbort(jobId)) match {
Expand All @@ -1241,7 +1250,9 @@ trait StandardAsyncExecutionActor
val executeOrRecoverFuture =
mode match {
case Reconnect(jobId: StandardAsyncJob @unchecked) => reconnectAsync(jobId)
case ReconnectToAbort(jobId: StandardAsyncJob @unchecked) => reconnectToAbortAsync(jobId)
case ReconnectToAbort(jobId: StandardAsyncJob @unchecked) =>
abortRequested = true
reconnectToAbortAsync(jobId)
case Recover(jobId: StandardAsyncJob @unchecked) => recoverAsync(jobId)
case _ =>
tellMetadata(startMetadataKeyValues)
Expand Down Expand Up @@ -1430,8 +1441,9 @@ trait StandardAsyncExecutionActor
outOfMemoryDetected <- memoryRetryRC
} yield (stderrSize, returnCodeAsString, outOfMemoryDetected)

stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, outOfMemoryDetected) =>
stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, isStderrOomDetected) =>
val tryReturnCodeAsInt = Try(returnCodeAsString.trim.toInt)
def isOomDetected(rc: Int): Boolean = isOomKillRc(rc) || isStderrOomDetected

if (isDone(status)) {
tryReturnCodeAsInt match {
Expand All @@ -1445,9 +1457,11 @@ trait StandardAsyncExecutionActor
retryElseFail(executionHandle)
case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) =>
handleExecutionSuccess(status, oldHandle, returnCodeAsInt)
// It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case;
// if it was caused by OOM killer, want to handle as OOM and not job abort.
case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested =>
// A job can receive a SIGKILL (137) if it was aborted or OOM killed. Abort must have been requested for this
// to actually be an abort.
case Success(returnCodeAsInt) if abortRequested && isAbortRc(returnCodeAsInt) =>
Future.successful(AbortedExecutionHandle)
case Success(returnCodeAsInt) if memoryRetryRequested && isOomDetected(returnCodeAsInt) =>
val executionHandle = Future.successful(
FailedNonRetryableExecutionHandle(
RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log),
Expand All @@ -1456,10 +1470,8 @@ trait StandardAsyncExecutionActor
)
)
retryElseFail(executionHandle,
MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
MemoryRetryResult(isOomDetected(returnCodeAsInt), memoryRetryFactor, previousMemoryMultiplier)
)
case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) =>
Future.successful(AbortedExecutionHandle)
case Success(returnCodeAsInt) =>
val executionHandle = Future.successful(
FailedNonRetryableExecutionHandle(WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption),
Expand All @@ -1479,7 +1491,9 @@ trait StandardAsyncExecutionActor
} else {
tryReturnCodeAsInt match {
case Success(returnCodeAsInt)
if outOfMemoryDetected && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) =>
if isOomDetected(returnCodeAsInt) && memoryRetryRequested && !continueOnReturnCode.continueFor(
returnCodeAsInt
) =>
val executionHandle = Future.successful(
FailedNonRetryableExecutionHandle(
RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log),
Expand All @@ -1488,7 +1502,7 @@ trait StandardAsyncExecutionActor
)
)
retryElseFail(executionHandle,
MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
MemoryRetryResult(isOomDetected(returnCodeAsInt), memoryRetryFactor, previousMemoryMultiplier)
)
case _ =>
val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
version 1.0

# Runs three tasks, each named for a different way of provoking an OOM kill: a Java program, a Python
# one-liner, and `tail /dev/zero`. The calls take no inputs and do not depend on one another.
workflow retry_with_more_memory_assorted_ooms {
call java_oom_kill
call python_oom_kill
call tail_dev_zero_oom_kill
}

# A Java-based task that sets both the initial and the maximum heap size to 64 GB (-Xms64g -Xmx64g) while the
# task requests only 1 GB of memory, i.e. it runs on a VM that has nowhere near that much memory. The generated
# program tries to allocate 32 byte arrays of 1 GiB each, sleeps for a minute, then prints the heap size.
# This task should be OOM killed and retried with more memory.
# The command first echoes the MEM_SIZE and MEM_UNIT environment variables to stderr.
task java_oom_kill {
command <<<
echo "MEM_SIZE=$MEM_SIZE" >&2
echo "MEM_UNIT=$MEM_UNIT" >&2

cat > Mem.java << EOF
class Mem {
public static void main(String[] args) throws Exception {
int gb = (int)Math.pow(2, 30);
System.out.println("Allocating memory...");
byte[][] byteArr = new byte[32][];
for (int i = 0; i < byteArr.length; i++) {
byteArr[i] = new byte[gb];
}
System.out.println("Sleeping a minute...");
Thread.sleep(60_000);
System.out.printf("Heap size: %,.2f%n", (double)Runtime.getRuntime().totalMemory() / gb);
System.out.println(byteArr.hashCode());
}
}
EOF
java -Xms64g -Xmx64g Mem.java
>>>
runtime {
docker: "eclipse-temurin:21"
memory: "1 GB"
maxRetries: 1
}
}


# A Python-based task that tries to build a list of 2**34 elements while the task requests only 1 GB of memory.
# Presumably intended, like java_oom_kill, to be OOM killed and retried with more memory.
# The command first echoes the MEM_SIZE and MEM_UNIT environment variables to stderr.
task python_oom_kill {
command <<<
echo "MEM_SIZE=$MEM_SIZE" >&2
echo "MEM_UNIT=$MEM_UNIT" >&2
python3 -c 'print(len([0] * (2**34)))'
>>>
runtime {
docker: "google/cloud-sdk:slim"
memory: "1 GB"
maxRetries: 1
}
}


# A task that runs `tail /dev/zero` while the task requests only 1 GB of memory.
# NOTE(review): presumably `tail` keeps buffering because /dev/zero never yields a newline, so memory use grows
# until the process is OOM killed — confirm.
# The command first echoes the MEM_SIZE and MEM_UNIT environment variables to stderr.
task tail_dev_zero_oom_kill {
command <<<
echo "MEM_SIZE=$MEM_SIZE" >&2
echo "MEM_UNIT=$MEM_UNIT" >&2
tail /dev/zero
>>>
runtime {
docker: "ubuntu:latest"
memory: "1 GB"
maxRetries: 1
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Test case for the retry_with_more_memory_assorted_ooms workflow, run on the GCPBATCH backend only.
# The workflow as a whole is expected to fail (testFormat: workflowfailure).
name: retry_with_more_memory_assorted_ooms
testFormat: workflowfailure
backends: [GCPBATCH]

files {
workflow: retry_with_more_memory/retry_with_more_memory_assorted_ooms.wdl
# NOTE(review): judging by the file name and the 1 GB -> 2 GB expectations below, these options presumably
# configure a 2x memory retry multiplier — confirm.
options: retry_with_more_memory/retry_with_more_memory_2x.options
}

# Expected metadata. For each of the three tasks, the first entry ran with 1 GB and ended as RetryableFailure,
# and the second entry ran with 2 GB and ended as Failed.
# NOTE(review): the trailing .1 / .2 in each key is presumably the attempt number — confirm.
metadata {
workflowName: retry_with_more_memory_assorted_ooms
status: Failed
"failures.0.message": "Workflow failed"

"retry_with_more_memory_assorted_ooms.java_oom_kill.-1.1.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory_assorted_ooms.java_oom_kill.-1.1.executionStatus": "RetryableFailure"
"retry_with_more_memory_assorted_ooms.java_oom_kill.-1.2.runtimeAttributes.memory": "2 GB"
"retry_with_more_memory_assorted_ooms.java_oom_kill.-1.2.executionStatus": "Failed"

"retry_with_more_memory_assorted_ooms.python_oom_kill.-1.1.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory_assorted_ooms.python_oom_kill.-1.1.executionStatus": "RetryableFailure"
"retry_with_more_memory_assorted_ooms.python_oom_kill.-1.2.runtimeAttributes.memory": "2 GB"
"retry_with_more_memory_assorted_ooms.python_oom_kill.-1.2.executionStatus": "Failed"

"retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.1.runtimeAttributes.memory": "1 GB"
"retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.1.executionStatus": "RetryableFailure"
"retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.2.runtimeAttributes.memory": "2 GB"
"retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.2.executionStatus": "Failed"
}
Loading