[SPARK-41210][K8S] Port executor failure tracker from Spark on YARN to K8s

### What changes were proposed in this pull request?

Fail the Spark application when the number of executor failures reaches the configured threshold.

### Why are the changes needed?

Sometimes executors cannot launch successfully because of a wrong configuration, but on K8s the driver does not know that and just keeps requesting new executors. This PR ports the window-based executor failure tracking mechanism [1] to K8s (it only takes effect when `spark.kubernetes.allocation.pods.allocator` is set to 'direct'), to reduce the functionality gap between YARN and K8s.

Note that YARN mode also supports a host-based executor allocation failure tracking and application terminating mechanism [2]. This PR does not port that functionality to Kubernetes, since it is a fairly independent and large feature and relies on some YARN features that may not have K8s equivalents.

[1] [SPARK-6735](https://issues.apache.org/jira/browse/SPARK-6735)
[2] [SPARK-17675](https://issues.apache.org/jira/browse/SPARK-17675)

### Does this PR introduce _any_ user-facing change?

Yes, this PR adds two new configurations:

- `spark.executor.maxNumFailures`
- `spark.executor.failuresValidityInterval`

They take effect on YARN, or on Kubernetes when `spark.kubernetes.allocation.pods.allocator` is set to 'direct'.

### How was this patch tested?

New UT added, and manually tested in an internal K8s cluster.

Closes apache#40774 from pan3793/SPARK-41210.

Authored-by: Cheng Pan <[email protected]>
Signed-off-by: Kent Yao <[email protected]>
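For context, a minimal sketch of how a user might opt in to the ported tracker on K8s. The configuration keys come from this PR's description; the threshold and interval values are illustrative assumptions, not defaults from the patch:

```scala
import org.apache.spark.SparkConf

// Illustrative values only; the keys are the ones introduced by this PR.
val conf = new SparkConf()
  .set("spark.kubernetes.allocation.pods.allocator", "direct") // tracker only applies with the 'direct' allocator
  .set("spark.executor.maxNumFailures", "10")                  // fail the app after 10 executor failures
  .set("spark.executor.failuresValidityInterval", "1h")        // count only failures from the last hour
```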
Showing 14 changed files with 224 additions and 109 deletions.
core/src/main/scala/org/apache/spark/deploy/ExecutorFailureTracker.scala (102 additions & 0 deletions)
@@ -0,0 +1,102 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.deploy

import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
import org.apache.spark.internal.config.Streaming.STREAMING_DYN_ALLOCATION_MAX_EXECUTORS
import org.apache.spark.util.{Clock, SystemClock, Utils}

/**
 * ExecutorFailureTracker is responsible for tracking executor failures both for each host
 * separately and for all hosts altogether.
 */
private[spark] class ExecutorFailureTracker(
    sparkConf: SparkConf,
    val clock: Clock = new SystemClock) extends Logging {

  private val executorFailuresValidityInterval =
    sparkConf.get(EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)

  // Queue to store the timestamps of failed executors for each host
  private val failedExecutorsTimeStampsPerHost = mutable.Map[String, mutable.Queue[Long]]()

  private val failedExecutorsTimeStamps = new mutable.Queue[Long]()

  // Drop failures that fall outside the validity window, then return how many remain
  private def updateAndCountFailures(failedExecutorsWithTimeStamps: mutable.Queue[Long]): Int = {
    val endTime = clock.getTimeMillis()
    while (executorFailuresValidityInterval > 0 &&
      failedExecutorsWithTimeStamps.nonEmpty &&
      failedExecutorsWithTimeStamps.head < endTime - executorFailuresValidityInterval) {
      failedExecutorsWithTimeStamps.dequeue()
    }
    failedExecutorsWithTimeStamps.size
  }

  def numFailedExecutors: Int = synchronized {
    updateAndCountFailures(failedExecutorsTimeStamps)
  }

  def registerFailureOnHost(hostname: String): Unit = synchronized {
    val timeMillis = clock.getTimeMillis()
    failedExecutorsTimeStamps.enqueue(timeMillis)
    val failedExecutorsOnHost =
      failedExecutorsTimeStampsPerHost.getOrElse(hostname, {
        val failureOnHost = mutable.Queue[Long]()
        failedExecutorsTimeStampsPerHost.put(hostname, failureOnHost)
        failureOnHost
      })
    failedExecutorsOnHost.enqueue(timeMillis)
  }

  def registerExecutorFailure(): Unit = synchronized {
    val timeMillis = clock.getTimeMillis()
    failedExecutorsTimeStamps.enqueue(timeMillis)
  }

  def numFailuresOnHost(hostname: String): Int = {
    failedExecutorsTimeStampsPerHost.get(hostname).map { failedExecutorsOnHost =>
      updateAndCountFailures(failedExecutorsOnHost)
    }.getOrElse(0)
  }
}

object ExecutorFailureTracker {

  // Default to twice the number of executors (twice the maximum number of executors if dynamic
  // allocation is enabled), with a minimum of 3.
  def maxNumExecutorFailures(sparkConf: SparkConf): Int = {
    val effectiveNumExecutors =
      if (Utils.isStreamingDynamicAllocationEnabled(sparkConf)) {
        sparkConf.get(STREAMING_DYN_ALLOCATION_MAX_EXECUTORS)
      } else if (Utils.isDynamicAllocationEnabled(sparkConf)) {
        sparkConf.get(DYN_ALLOCATION_MAX_EXECUTORS)
      } else {
        sparkConf.get(EXECUTOR_INSTANCES).getOrElse(0)
      }
    // By default, effectiveNumExecutors is Int.MaxValue if dynamic allocation is enabled. We need
    // to avoid integer overflow here.
    val defaultMaxNumExecutorFailures = math.max(3,
      if (effectiveNumExecutors > Int.MaxValue / 2) Int.MaxValue else 2 * effectiveNumExecutors)

    sparkConf.get(MAX_EXECUTOR_FAILURES).getOrElse(defaultMaxNumExecutorFailures)
  }
}
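To make the sliding-window behavior above concrete, here is a hypothetical usage sketch. It assumes Spark's internal test clock `org.apache.spark.util.ManualClock` (so it would have to live in Spark's own sources, as the tracker and clock are `private[spark]`) and assumes the validity interval can be set through `spark.executor.failuresValidityInterval`, the key this PR exposes:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.deploy.ExecutorFailureTracker
import org.apache.spark.util.ManualClock

// Assumed: a 10-second validity window set via the key from this PR's description.
val conf = new SparkConf().set("spark.executor.failuresValidityInterval", "10s")
val clock = new ManualClock()
val tracker = new ExecutorFailureTracker(conf, clock)

tracker.registerFailureOnHost("host-a")  // counted globally and against host-a
tracker.registerExecutorFailure()        // counted globally only
assert(tracker.numFailedExecutors == 2)
assert(tracker.numFailuresOnHost("host-a") == 1)

clock.advance(15 * 1000)                 // step past the 10s validity window
assert(tracker.numFailedExecutors == 0)  // expired failures are pruned and no longer count

// With no executor count and no dynamic allocation configured, the failure
// threshold defaults to max(3, 2 * effectiveNumExecutors) = 3 here.
assert(ExecutorFailureTracker.maxNumExecutorFailures(conf) == 3)
```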