From 86a09164fc52cec8d1d40e5e90ab2db4789d7733 Mon Sep 17 00:00:00 2001 From: kmg-stripe Date: Mon, 13 Jan 2025 14:20:19 -0800 Subject: [PATCH] Handle Case Where Recently Launched Worker Does Not Immediately Heartbeat (#741) * Handle Case Where Recently Launched Worker Does Not Immediately Heartbeat It seems like there is a race condition where a recently launched worker has not sent a heartbeat, the duration is still within the missed heartbeat threshold and the JobActor treats the lack of heartbeat with a resubmit. If this is true, this can lead to mass resubmits when a new leader is elected. * Update upload-artifact to v4 * Temporarily fix tests --- .github/workflows/nebula-ci.yml | 4 ++-- .../io/mantisrx/master/jobcluster/job/JobActor.java | 10 +++++++--- .../master/jobcluster/job/JobTestLifecycle.java | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/nebula-ci.yml b/.github/workflows/nebula-ci.yml index a101c8330..ff503a115 100644 --- a/.github/workflows/nebula-ci.yml +++ b/.github/workflows/nebula-ci.yml @@ -48,7 +48,7 @@ jobs: CI_BRANCH: ${{ github.ref }} COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload Test Results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: always() with: name: Unit Test Results @@ -59,7 +59,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java index b879b8be5..1efb73b20 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/job/JobActor.java @@ -1943,9 +1943,13 @@ public void checkHeartBeats(Instant currentTime) { acceptedAt); } } else { - // no heartbeat or heartbeat too old - if (!workerMeta.getLastHeartbeatAt().isPresent() || Duration.between(workerMeta.getLastHeartbeatAt().get(), currentTime).getSeconds() - > missedHeartBeatToleranceSecs) { + // no heartbeat in a timely manner since launched or heartbeat too old + // note: the worker has been launched + boolean noTimelyHeartbeatSinceLaunched = !workerMeta.getLastHeartbeatAt().isPresent() + && Duration.between(Instant.ofEpochSecond(workerMeta.getLaunchedAt()), currentTime).getSeconds() > missedHeartBeatToleranceSecs; + boolean heartbeatTooOld = workerMeta.getLastHeartbeatAt().isPresent() + && Duration.between(workerMeta.getLastHeartbeatAt().get(), currentTime).getSeconds() > missedHeartBeatToleranceSecs; + if (noTimelyHeartbeatSinceLaunched || heartbeatTooOld) { this.numWorkerMissingHeartbeat.increment(); if (!workerMeta.getLastHeartbeatAt().isPresent()) { diff --git a/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestLifecycle.java b/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestLifecycle.java index 31ec5c45d..1319751b1 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestLifecycle.java +++ b/mantis-control-plane/mantis-control-plane-server/src/test/java/io/mantisrx/master/jobcluster/job/JobTestLifecycle.java @@ -823,9 +823,9 @@ public void testNoHeartBeatAfterLaunchResubmit() { assertEquals(JobState.Accepted, resp4.getJobMetadata().get().getState()); // 1 original submissions and 0 resubmits because of worker not in launched state with HB timeouts - verify(schedulerMock, times(2)).scheduleWorkers(any()); + verify(schedulerMock, times(1)).scheduleWorkers(any()); // 1 kills due to resubmits - verify(schedulerMock, times(1)).unscheduleAndTerminateWorker(eq(workerId2), any()); + verify(schedulerMock, times(0)).unscheduleAndTerminateWorker(eq(workerId2), any()); } catch (Exception e) { fail("unexpected exception " + e.getMessage()); }