
Commit d8c5b0a

wip
1 parent 42c8901 commit d8c5b0a

3 files changed: +60 -15 lines changed

backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala

Lines changed: 13 additions & 8 deletions
@@ -96,6 +96,12 @@ trait StandardAsyncExecutionActor
   val SIGTERM = 143
   val SIGINT = 130
   val SIGKILL = 137
+  // The Variants team have observed 247 exit codes in the wild for what we suspect were OOM-killed jobs, but
+  // unfortunately we do not yet have a test case that reproduces this. From Gemini:
+  //
+  // An exit code of 247, particularly in the context of process execution in Linux or containerized environments like
+  // Docker, often indicates a process termination due to resource limitations, most commonly insufficient memory (RAM).
+  val SIGCONTAINERKILL = 247

   // `CheckingForMemoryRetry` action exits with code 0 if the stderr file contains keys mentioned in `memory-retry` config.
   val StderrContainsRetryKeysCode = 0
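
Note (not part of the commit): a minimal standalone Scala sketch of the exit-code conventions behind these constants. The reading of 247 as a wrapped negative status is speculative, in line with the hedged comment in the diff.

object ExitCodeConventions extends App {
  // POSIX shells report 128 + N when a process is killed by signal N.
  def signalExitCode(signalNumber: Int): Int = 128 + signalNumber

  // A negative status propagated by a wrapper wraps modulo 256 when surfaced as an exit code.
  def wrappedNegativeStatus(status: Int): Int = ((status % 256) + 256) % 256

  assert(signalExitCode(2) == 130)   // SIGINT
  assert(signalExitCode(9) == 137)   // SIGKILL
  assert(signalExitCode(15) == 143)  // SIGTERM
  // Hypothetical origin of the observed 247: -9 (killed by SIGKILL) surfaced as an unsigned byte, 256 - 9 = 247.
  assert(wrappedNegativeStatus(-9) == 247)
}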
@@ -965,9 +971,7 @@ trait StandardAsyncExecutionActor
    */
   def isAbort(returnCode: Int): Boolean = returnCode == SIGINT || returnCode == SIGTERM || returnCode == SIGKILL

-  // 247 return codes have been observed in the wild for OOMKilled jobs during GVS Mars work, but unfortunately we don't
-  // have a test case to reproduce this yet.
-  def isOOMKill(returnCode: Int): Boolean = returnCode == SIGKILL || returnCode == 247
+  def isOomKill(returnCode: Int): Boolean = returnCode == SIGKILL || returnCode == SIGCONTAINERKILL

   /**
    * Custom behavior to run after an abort signal is processed.
@@ -1436,8 +1440,9 @@ trait StandardAsyncExecutionActor
       outOfMemoryDetected <- memoryRetryRC
     } yield (stderrSize, returnCodeAsString, outOfMemoryDetected)

-    stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, outOfMemoryDetected) =>
+    stderrSizeAndReturnCodeAndMemoryRetry flatMap { case (stderrSize, returnCodeAsString, stderrOomDetected) =>
       val tryReturnCodeAsInt = Try(returnCodeAsString.trim.toInt)
+      def oomDetected(rc: Int): Boolean = isOomKill(rc) || stderrOomDetected

       if (isDone(status)) {
         tryReturnCodeAsInt match {
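
As a quick illustration (not part of the commit), the combined check introduced here treats a job as OOM-killed when either the return code looks like a kill or the stderr scan matched a memory-retry key; a minimal standalone Scala sketch:

object OomDetectionSketch extends App {
  val SIGKILL = 137
  val SIGCONTAINERKILL = 247

  // Return-code-based detection, mirroring isOomKill in the diff above.
  def isOomKill(returnCode: Int): Boolean =
    returnCode == SIGKILL || returnCode == SIGCONTAINERKILL

  // Combined detection, mirroring the oomDetected helper: return code OR stderr keyword match.
  def oomDetected(returnCode: Int, stderrOomDetected: Boolean): Boolean =
    isOomKill(returnCode) || stderrOomDetected

  assert(oomDetected(137, stderrOomDetected = false))  // kernel OOM killer
  assert(oomDetected(247, stderrOomDetected = false))  // container-style kill
  assert(oomDetected(1, stderrOomDetected = true))     // memory-retry key found in stderr
  assert(!oomDetected(1, stderrOomDetected = false))   // ordinary failure, no retry with more memory
}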
@@ -1455,7 +1460,7 @@ trait StandardAsyncExecutionActor
           // an OOM kill.
           case Success(returnCodeAsInt) if abortRequested && isAbort(returnCodeAsInt) =>
             Future.successful(AbortedExecutionHandle)
-          case Success(returnCodeAsInt) if (isOOMKill(returnCodeAsInt) || outOfMemoryDetected) && memoryRetryRequested =>
+          case Success(returnCodeAsInt) if oomDetected(returnCodeAsInt) && memoryRetryRequested =>
             val executionHandle = Future.successful(
               FailedNonRetryableExecutionHandle(
                 RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log),
@@ -1464,7 +1469,7 @@ trait StandardAsyncExecutionActor
               )
             )
             retryElseFail(executionHandle,
-                          MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
+                          MemoryRetryResult(oomDetected(returnCodeAsInt), memoryRetryFactor, previousMemoryMultiplier)
             )
           case Success(returnCodeAsInt) =>
             val executionHandle = Future.successful(
@@ -1485,7 +1490,7 @@ trait StandardAsyncExecutionActor
       } else {
         tryReturnCodeAsInt match {
           case Success(returnCodeAsInt)
-              if outOfMemoryDetected && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) =>
+              if oomDetected(returnCodeAsInt) && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) =>
             val executionHandle = Future.successful(
               FailedNonRetryableExecutionHandle(
                 RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log),
@@ -1494,7 +1499,7 @@ trait StandardAsyncExecutionActor
               )
             )
             retryElseFail(executionHandle,
-                          MemoryRetryResult(outOfMemoryDetected, memoryRetryFactor, previousMemoryMultiplier)
+                          MemoryRetryResult(oomDetected(returnCodeAsInt), memoryRetryFactor, previousMemoryMultiplier)
             )
           case _ =>
             val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption)

centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_assorted_ooms.wdl

Lines changed: 32 additions & 2 deletions
@@ -1,12 +1,14 @@
 version 1.0

 workflow retry_with_more_memory_assorted_ooms {
-  call run_app
+  call java_oom_kill
+  call python_oom_kill
+  call tail_dev_zero_oom_kill
 }

 # A Java-based task that sets the maximum heap size to 64 GB on a VM that has nowhere near that much memory. This task
 # should be OOM killed and retried with more memory.
-task run_app {
+task java_oom_kill {
   command <<<
     echo "MEM_SIZE=$MEM_SIZE" >&2
     echo "MEM_UNIT=$MEM_UNIT" >&2
@@ -34,4 +36,32 @@ task run_app
     memory: "1 GB"
     maxRetries: 1
   }
+}
+
+
+task python_oom_kill {
+  command <<<
+    echo "MEM_SIZE=$MEM_SIZE" >&2
+    echo "MEM_UNIT=$MEM_UNIT" >&2
+    python3 -c 'print(len([0] * (2**34)))'
+  >>>
+  runtime {
+    docker: "google/cloud-sdk:slim"
+    memory: "1 GB"
+    maxRetries: 1
+  }
+}
+
+
+task tail_dev_zero_oom_kill {
+  command <<<
+    echo "MEM_SIZE=$MEM_SIZE" >&2
+    echo "MEM_UNIT=$MEM_UNIT" >&2
+    tail /dev/zero
+  >>>
+  runtime {
+    docker: "ubuntu:latest"
+    memory: "1 GB"
+    maxRetries: 1
+  }
 }
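
For a sense of scale (not part of the commit), a back-of-the-envelope Scala sketch of why each new task should overrun its 1 GB allocation; the Python figure assumes 64-bit CPython at 8 bytes per list slot and ignores object headers:

object OomTaskEstimates extends App {
  val GiB: Long = 1024L * 1024L * 1024L

  // python_oom_kill: `[0] * (2**34)` builds a list with 2^34 slots; at 8 bytes per reference
  // that is ~128 GiB for the list alone, far beyond the task's 1 GB allocation.
  val pythonListBytes: Long = (1L << 34) * 8L
  println(s"python_oom_kill list: ~${pythonListBytes / GiB} GiB vs a 1 GB limit")

  // java_oom_kill: per the WDL comment, it requests a 64 GB max heap on a VM with ~1 GB of memory.
  // tail_dev_zero_oom_kill: `tail /dev/zero` buffers an endless stream of zero bytes in memory
  // until the container's limit is exceeded, so no fixed figure applies.
}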

centaur/src/main/resources/standardTestCases/retry_with_more_memory_assorted_ooms.test

Lines changed: 15 additions & 5 deletions
@@ -11,9 +11,19 @@ metadata {
   workflowName: retry_with_more_memory_assorted_ooms
   status: Failed
   "failures.0.message": "Workflow failed"
-  "failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory_assorted_ooms.run_app:NA:2` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory."
-  "retry_with_more_memory_assorted_ooms.run_app.-1.1.runtimeAttributes.memory": "1 GB"
-  "retry_with_more_memory_assorted_ooms.run_app.-1.1.executionStatus": "RetryableFailure"
-  "retry_with_more_memory_assorted_ooms.run_app.-1.2.runtimeAttributes.memory": "2 GB"
-  "retry_with_more_memory_assorted_ooms.run_app.-1.2.executionStatus": "Failed"
+
+  "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.1.runtimeAttributes.memory": "1 GB"
+  "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.1.executionStatus": "RetryableFailure"
+  "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.2.runtimeAttributes.memory": "2 GB"
+  "retry_with_more_memory_assorted_ooms.java_oom_kill.-1.2.executionStatus": "Failed"
+
+  "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.1.runtimeAttributes.memory": "1 GB"
+  "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.1.executionStatus": "RetryableFailure"
+  "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.2.runtimeAttributes.memory": "2 GB"
+  "retry_with_more_memory_assorted_ooms.python_oom_kill.-1.2.executionStatus": "Failed"
+
+  "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.1.runtimeAttributes.memory": "1 GB"
+  "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.1.executionStatus": "RetryableFailure"
+  "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.2.runtimeAttributes.memory": "2 GB"
+  "retry_with_more_memory_assorted_ooms.tail_dev_zero_oom_kill.-1.2.executionStatus": "Failed"
 }
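
The four-line pattern repeated per task above can be summarized with a small Scala sketch (not part of the commit); it assumes the usual Centaur key shape of <workflow>.<call>.<shardIndex>.<attempt>.<field>, with -1 meaning an unsharded call and attempts 1 and 2 corresponding to the 1 GB and doubled 2 GB runs:

object CentaurExpectationSketch extends App {
  val workflow = "retry_with_more_memory_assorted_ooms"
  val calls = Seq("java_oom_kill", "python_oom_kill", "tail_dev_zero_oom_kill")
  // (attempt, memory at that attempt, expected status): attempt 1 at 1 GB is a RetryableFailure,
  // attempt 2 at the doubled 2 GB is the terminal Failed.
  val attempts = Seq((1, "1 GB", "RetryableFailure"), (2, "2 GB", "Failed"))

  for {
    call <- calls
    (attempt, memory, status) <- attempts
  } {
    // Assumed key shape: <workflow>.<call>.<shardIndex>.<attempt>.<field>.
    println(s"\"$workflow.$call.-1.$attempt.runtimeAttributes.memory\": \"$memory\"")
    println(s"\"$workflow.$call.-1.$attempt.executionStatus\": \"$status\"")
  }
}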
