Merge pull request #29570 from rockwotj/ctp-leader-flip-flop

rockwotj · web-flow · commit 7dc2c62c57ae · 2026-02-17T06:35:20.000-06:00
diff --git a/src/v/cloud_topics/level_zero/stm/ctp_stm.cc b/src/v/cloud_topics/level_zero/stm/ctp_stm.cc
@@ -406,17 +406,17 @@ ctp_stm::fence_epoch(cluster_epoch e) {
     }
     auto term = _raft->confirmed_term();
     while (true) {
-        if (_state.epoch_in_window(e)) {
+        if (_state.epoch_in_window(term, e)) {
             // Case 1.1. Same epoch, need to acquire read-lock.
             // Case 1.2. This epoch is out of order. We can accept it if it lies
             //           in [previous-epoch, max-seen-epoch) range. We also need
             //           to acquire a read fence as in 1.1.
             auto unit = co_await ss::get_units(_lock, 1, _as);
-            if (_state.epoch_in_window(e)) {
+            if (_state.epoch_in_window(term, e)) {
                 co_return cluster_epoch_fence{
                   .unit = std::move(unit), .term = term};
             }
-        } else if (_state.epoch_above_window(e)) {
+        } else if (_state.epoch_above_window(term, e)) {
             // Case 2. New epoch, need to acquire write-lock.
             auto epoch_update_lock = _epoch_update_lock.try_get_units();
             if (!epoch_update_lock) {
@@ -431,9 +431,11 @@ ctp_stm::fence_epoch(cluster_epoch e) {
               _lock, ss::semaphore::max_counter(), _as);
 
             std::optional<cluster_epoch_fence> epoch_fence_opt;
-            if (_state.epoch_in_window(e) || _state.epoch_above_window(e)) {
+            if (
+              _state.epoch_in_window(term, e)
+              || _state.epoch_above_window(term, e)) {
                 vlog(_log.debug, "Bumping max seen epoch to {}", e);
-                _state.advance_max_seen_epoch(e);
+                _state.advance_max_seen_epoch(term, e);
                 // Demote to reader lock after max_seen_epoch is updated.
                 unit.return_units(unit.count() - 1);
                 epoch_fence_opt.emplace(std::move(unit), term);
diff --git a/src/v/cloud_topics/level_zero/stm/ctp_stm.h b/src/v/cloud_topics/level_zero/stm/ctp_stm.h
@@ -75,8 +75,8 @@ class ctp_stm final : public raft::persisted_stm<> {
 
     const ctp_stm_state& state() const noexcept { return _state; }
 
-    void advance_max_seen_epoch(cluster_epoch epoch) {
-        _state.advance_max_seen_epoch(epoch);
+    void advance_max_seen_epoch(model::term_id term, cluster_epoch epoch) {
+        _state.advance_max_seen_epoch(term, epoch);
     }
 
     ss::future<std::expected<cluster_epoch_fence, stale_cluster_epoch>>
diff --git a/src/v/cloud_topics/level_zero/stm/ctp_stm_state.cc b/src/v/cloud_topics/level_zero/stm/ctp_stm_state.cc
@@ -15,9 +15,16 @@
 
 namespace cloud_topics {
 
-void ctp_stm_state::advance_max_seen_epoch(cluster_epoch epoch) noexcept {
-    if (epoch > _max_seen_epoch) {
-        _previous_seen_epoch = _max_seen_epoch.value_or(epoch);
+void ctp_stm_state::advance_max_seen_epoch(
+  model::term_id term, cluster_epoch epoch) noexcept {
+    if (term >= _seen_window_term && epoch > _max_seen_epoch) {
+        if (term > _seen_window_term) {
+            // If this is a new term, reset the window.
+            _previous_seen_epoch = epoch;
+            _seen_window_term = term;
+        } else {
+            _previous_seen_epoch = _max_seen_epoch.value_or(epoch);
+        }
         _max_seen_epoch = epoch;
     }
 }
@@ -47,7 +54,14 @@ ctp_stm_state::get_previous_seen_epoch() const noexcept {
     return _previous_seen_epoch;
 }
 
-bool ctp_stm_state::epoch_in_window(cluster_epoch epoch) const noexcept {
+bool ctp_stm_state::epoch_in_window(
+  model::term_id term, cluster_epoch epoch) const noexcept {
+    // If the term is newer, treat the seen window as unset.
+    if (term > _seen_window_term) {
+        auto end = _max_applied_epoch.value_or(cluster_epoch::min());
+        auto begin = _previous_applied_epoch.value_or(end);
+        return epoch >= begin && epoch <= end;
+    }
     // NOTE: the window should move forward with _max_seen_epoch.
     // If _max_seen_epoch is greater than _max_applied_epoch then
     // the window should be [_previous_seen_epoch, _max_seen_epoch].
@@ -60,7 +74,13 @@ bool ctp_stm_state::epoch_in_window(cluster_epoch epoch) const noexcept {
     return epoch >= begin && epoch <= end;
 }
 
-bool ctp_stm_state::epoch_above_window(cluster_epoch epoch) const noexcept {
+bool ctp_stm_state::epoch_above_window(
+  model::term_id term, cluster_epoch epoch) const noexcept {
+    // If the term is newer, treat the seen window as unset.
+    if (term > _seen_window_term) {
+        auto end = _max_applied_epoch.value_or(cluster_epoch::min());
+        return epoch > end;
+    }
     auto end = _max_seen_epoch.value_or(
       _max_applied_epoch.value_or(cluster_epoch::min()));
     return epoch > end;
@@ -72,13 +92,6 @@ ctp_stm_state::estimate_inactive_epoch() const noexcept {
 }
 
 void ctp_stm_state::advance_epoch(cluster_epoch epoch, model::offset offset) {
-    // The STM works on both leader and followers, on a leader the
-    // max_seen_epoch epoch is updated by the fencing mechanism.
-    // On the follower the max_seen_epoch epoch has to follow the max epoch.
-    if (epoch > _max_seen_epoch) {
-        _previous_seen_epoch = _max_seen_epoch.value_or(epoch);
-        _max_seen_epoch = epoch;
-    }
     // Register new epoch
     if (epoch > _max_applied_epoch.value_or(cluster_epoch::min())) {
         // A new max epoch requires the sliding window of epoch values in flight
diff --git a/src/v/cloud_topics/level_zero/stm/ctp_stm_state.h b/src/v/cloud_topics/level_zero/stm/ctp_stm_state.h
@@ -48,7 +48,8 @@ class ctp_stm_state
 
     /// This is invoked in the write path before the batch with new
     /// epoch value is even replicated.
-    void advance_max_seen_epoch(cluster_epoch epoch) noexcept;
+    void
+    advance_max_seen_epoch(model::term_id term, cluster_epoch epoch) noexcept;
 
     // Set the new start offset for the partition.
     //
@@ -91,9 +92,11 @@ class ctp_stm_state
     std::optional<cluster_epoch> estimate_min_epoch() const noexcept;
 
     /// Return true if the epoch can be replicated
-    bool epoch_in_window(cluster_epoch epoch) const noexcept;
+    bool
+    epoch_in_window(model::term_id term, cluster_epoch epoch) const noexcept;
     /// Return true if the epoch is above the current window
-    bool epoch_above_window(cluster_epoch epoch) const noexcept;
+    bool
+    epoch_above_window(model::term_id term, cluster_epoch epoch) const noexcept;
 
     /// Estimate inactive epoch
     std::optional<cluster_epoch> estimate_inactive_epoch() const noexcept;
@@ -135,6 +138,10 @@ class ctp_stm_state
     fmt::iterator format_to(fmt::iterator) const;
 
 private:
+    /// The term that the *_seen_epoch values belong to. Because the sliding
+    /// window can diverge, we only track it within a single term and then
+    /// reset the window to avoid nasty edge cases when leadership changes.
+    model::term_id _seen_window_term;
     /// The max epoch after the current in flight requests are applied.
     ///
     /// This is required because of the pipelining of requests in the STM.
diff --git a/src/v/cloud_topics/level_zero/stm/tests/ctp_stm_state_test.cc b/src/v/cloud_topics/level_zero/stm/tests/ctp_stm_state_test.cc
@@ -37,15 +37,16 @@ TEST(ctp_stm_state_test, advance_max_seen_epoch) {
     ct::cluster_epoch epoch1(10);
     ct::cluster_epoch epoch2(20);
     ct::cluster_epoch epoch3(5);
+    model::term_id term(1);
 
-    state.advance_max_seen_epoch(epoch1);
+    state.advance_max_seen_epoch(term, epoch1);
     EXPECT_EQ(state.get_max_seen_epoch().value(), epoch1);
 
-    state.advance_max_seen_epoch(epoch2);
+    state.advance_max_seen_epoch(term, epoch2);
     EXPECT_EQ(state.get_max_seen_epoch().value(), epoch2);
 
     // Should not go backwards
-    state.advance_max_seen_epoch(epoch3);
+    state.advance_max_seen_epoch(term, epoch3);
     EXPECT_EQ(state.get_max_seen_epoch().value(), epoch2);
 }
 
@@ -57,26 +58,29 @@ TEST(ctp_stm_state_test, advance_epoch) {
 
     state.advance_epoch(epoch1, model::offset(1));
     EXPECT_EQ(state.get_max_applied_epoch().value(), epoch1);
-    EXPECT_EQ(state.get_max_seen_epoch().value(), epoch1);
+    // advance_epoch does not update the seen window
+    EXPECT_FALSE(state.get_max_seen_epoch().has_value());
 
     state.advance_epoch(epoch2, model::offset(2));
     EXPECT_EQ(state.get_max_applied_epoch().value(), epoch2);
-    EXPECT_EQ(state.get_max_seen_epoch().value(), epoch2);
+    EXPECT_FALSE(state.get_max_seen_epoch().has_value());
 
     // Should not go backwards
     state.advance_epoch(epoch3, model::offset(3));
     EXPECT_EQ(state.get_max_applied_epoch().value(), epoch2);
-    EXPECT_EQ(state.get_max_seen_epoch().value(), epoch2);
+    EXPECT_FALSE(state.get_max_seen_epoch().has_value());
 }
 
 TEST(ctp_stm_state_test, advance_epoch_on_a_follower) {
-    // On a follower the max_seen_epoch should also be updated
+    // On a follower, advance_epoch only updates the applied window,
+    // not the seen window. The seen window is only managed through
+    // advance_max_seen_epoch on the leader path.
     ct::ctp_stm_state state;
     ct::cluster_epoch advance_epoch(20);
 
     state.advance_epoch(advance_epoch, model::offset(1));
 
-    EXPECT_EQ(state.get_max_seen_epoch().value(), advance_epoch);
+    EXPECT_FALSE(state.get_max_seen_epoch().has_value());
     EXPECT_EQ(state.get_max_applied_epoch().value(), advance_epoch);
 }
 
@@ -171,6 +175,7 @@ kafka::offset operator""_offset(unsigned long long v) {
 
 TEST(ctp_stm_state_test, sliding_window_issue) {
     ct::ctp_stm_state state;
+    model::term_id term(1);
 
     kafka::offset hwm = 0_offset;
 
@@ -193,17 +198,17 @@ TEST(ctp_stm_state_test, sliding_window_issue) {
     // get the write lock before we can start our window
     EXPECT_EQ(estimate_inactive_epoch(), std::nullopt);
     // Start our epochs at 2
-    EXPECT_FALSE(state.epoch_in_window(2_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 2_epoch));
 
     // Write lock grabbed, max epoch can be advanced!
-    state.advance_max_seen_epoch(2_epoch);
+    state.advance_max_seen_epoch(term, 2_epoch);
 
     // Now epoch 0 is in the window
-    EXPECT_TRUE(state.epoch_in_window(2_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 2_epoch));
     // Epoch 1 is not in the window
-    EXPECT_FALSE(state.epoch_in_window(1_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 1_epoch));
     // Nor is 3
-    EXPECT_FALSE(state.epoch_in_window(3_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 3_epoch));
 
     // Now the batch that was replicated with offset 0
     apply_replicated(2_epoch);
@@ -213,7 +218,7 @@ TEST(ctp_stm_state_test, sliding_window_issue) {
     EXPECT_EQ(estimate_inactive_epoch(), 1_epoch);
 
     // Let's now add another batch at epoch 2
-    EXPECT_TRUE(state.epoch_in_window(2_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 2_epoch));
     apply_replicated(2_epoch);
 
     // Reconciler now runs
@@ -222,19 +227,19 @@ TEST(ctp_stm_state_test, sliding_window_issue) {
     // Our epoch window hasn't moved
     EXPECT_EQ(estimate_inactive_epoch(), 1_epoch);
 
-    EXPECT_FALSE(state.epoch_in_window(5_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 5_epoch));
 
     // Epoch is bumped, our window should now be [2, 5]
-    state.advance_max_seen_epoch(5_epoch);
+    state.advance_max_seen_epoch(term, 5_epoch);
 
     // This is our new epoch
-    EXPECT_TRUE(state.epoch_in_window(5_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 5_epoch));
     // Our previous epoch is good still
-    EXPECT_TRUE(state.epoch_in_window(2_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 2_epoch));
     // And so is something in between (unlikely in real life, but just to show)
-    EXPECT_TRUE(state.epoch_in_window(3_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 3_epoch));
     // Something below is still bad
-    EXPECT_FALSE(state.epoch_in_window(1_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 1_epoch));
 
     // Still not safe to GC, we accept stuff at epoch 0
     EXPECT_EQ(estimate_inactive_epoch(), 1_epoch);
@@ -250,12 +255,12 @@ TEST(ctp_stm_state_test, sliding_window_issue) {
     EXPECT_EQ(estimate_inactive_epoch(), 1_epoch);
 
     // Now we start to replicate to the epoch to 10 (write lock grabbed)
-    state.advance_max_seen_epoch(10_epoch);
-    EXPECT_TRUE(state.epoch_in_window(10_epoch));
-    EXPECT_TRUE(state.epoch_in_window(5_epoch));
-    EXPECT_TRUE(state.epoch_in_window(8_epoch));
-    EXPECT_FALSE(state.epoch_in_window(0_epoch));
-    EXPECT_FALSE(state.epoch_in_window(4_epoch));
+    state.advance_max_seen_epoch(term, 10_epoch);
+    EXPECT_TRUE(state.epoch_in_window(term, 10_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 5_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 8_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 0_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 4_epoch));
 
     apply_replicated(10_epoch);
 
@@ -268,11 +273,11 @@ TEST(ctp_stm_state_test, sliding_window_issue) {
     EXPECT_EQ(estimate_inactive_epoch(), 4_epoch);
 
     // Now we bump the window again, but haven't replicated it yet.
-    state.advance_max_seen_epoch(15_epoch);
-    EXPECT_TRUE(state.epoch_in_window(10_epoch));
-    EXPECT_TRUE(state.epoch_in_window(15_epoch));
-    EXPECT_TRUE(state.epoch_in_window(12_epoch));
-    EXPECT_FALSE(state.epoch_in_window(9_epoch));
+    state.advance_max_seen_epoch(term, 15_epoch);
+    EXPECT_TRUE(state.epoch_in_window(term, 10_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 15_epoch));
+    EXPECT_TRUE(state.epoch_in_window(term, 12_epoch));
+    EXPECT_FALSE(state.epoch_in_window(term, 9_epoch));
 
     EXPECT_EQ(estimate_inactive_epoch(), 4_epoch);
 
@@ -356,6 +361,7 @@ TEST(ctp_stm_state_test, l0_simulation) {
     };
     // We simulate the operations for a single partition in l0
     l0_simulation_state universe;
+    model::term_id term(1);
 
     {
         std::vector<uploaded_l0_file_batch> batches{
@@ -399,12 +405,13 @@ TEST(ctp_stm_state_test, l0_simulation) {
         std::vector<std::function<void()>> possible_operations;
         // If there are batches to upload, let's do it.
         if (!universe.uploaded_batches.empty()) {
-            possible_operations.emplace_back([&universe, &oplog] {
+            possible_operations.emplace_back([&universe, &oplog, term] {
                 auto batch = universe.uploaded_batches.front();
                 universe.uploaded_batches.pop_front();
-                if (!universe.stm.epoch_in_window(batch.epoch)) {
-                    universe.stm.advance_max_seen_epoch(batch.epoch);
-                    ASSERT_TRUE(universe.stm.epoch_in_window(batch.epoch));
+                if (!universe.stm.epoch_in_window(term, batch.epoch)) {
+                    universe.stm.advance_max_seen_epoch(term, batch.epoch);
+                    ASSERT_TRUE(
+                      universe.stm.epoch_in_window(term, batch.epoch));
                 }
                 placeholder_batch placeholder{
                   .epoch = batch.epoch, .offset = universe.hwm++};
diff --git a/src/v/cloud_topics/level_zero/stm/tests/ctp_stm_test.cc b/src/v/cloud_topics/level_zero/stm/tests/ctp_stm_test.cc

Original file line number	Diff line number	Diff line change
`@@ -75,8 +75,8 @@ class ctp_stm final : public raft::persisted_stm<> {`
`75`	`75`
`76`	`76`	`const ctp_stm_state& state() const noexcept { return _state; }`
`77`	`77`
`78`		`- void advance_max_seen_epoch(cluster_epoch epoch) {`
`79`		`- _state.advance_max_seen_epoch(epoch);`
	`78`	`+ void advance_max_seen_epoch(model::term_id term, cluster_epoch epoch) {`
	`79`	`+ _state.advance_max_seen_epoch(term, epoch);`
`80`	`80`	`}`
`81`	`81`
`82`	`82`	`ss::future<std::expected<cluster_epoch_fence, stale_cluster_epoch>>`