feat: Preserve deterministic routing for LWT queries in LatencyAwarePolicy. ⏱️

nikagra · dkropachev · commit ab2574feae1e · 2026-02-09T13:16:53.000-04:00
diff --git a/README.md b/README.md
@@ -19,9 +19,17 @@ The Scylla Java Driver is a fork from [DataStax Java Driver](https://github.com/
 * Like all Scylla Drivers, the Scylla Java Driver is **Shard Aware** and contains extensions for a `tokenAwareHostPolicy`. 
   Using this policy, the driver can select a connection to a particular shard based on the shard's token. 
   As a result, latency is significantly reduced because there is no need to pass data between the shards.
-* **Lightweight Transaction (LWT) Optimization**: when using `TokenAwarePolicy` with prepared statements, 
-  LWT queries automatically use replica-only routing, prioritizing local datacenter replicas to minimize 
-  coordinator forwarding overhead and reduce contention during Paxos consensus phases.
+* **Lightweight Transaction (LWT) Optimization**: 
+  - When using `TokenAwarePolicy` with prepared statements, LWT queries automatically use replica-only routing, 
+    prioritizing local datacenter replicas to minimize coordinator forwarding overhead and reduce contention during 
+    Paxos consensus phases.
+  - When using `RackAwareRoundRobinPolicy`, LWT queries skip local rack prioritization and distribute evenly across 
+    all hosts in the local datacenter. This avoids rack-level hotspots during Paxos consensus, which would 
+    otherwise increase contention and reduce throughput. The local datacenter is still prioritized over remote 
+    datacenters to maintain low latency.
+  - When using `LatencyAwarePolicy`, LWT queries bypass latency-based reordering: the child policy's query plan is 
+    returned as-is, preserving deterministic replica selection. This ensures that LWT routing assumptions (such as 
+    consistent coordinator selection for optimal Paxos performance) are maintained throughout the policy chain.
 * [Sync](manual/) and [Async](manual/async/) API
 * [Simple](manual/statements/simple/), [Prepared](manual/statements/prepared/), and [Batch](manual/statements/batch/)
   statements
diff --git a/driver-core/src/main/java/com/datastax/driver/core/policies/LatencyAwarePolicy.java b/driver-core/src/main/java/com/datastax/driver/core/policies/LatencyAwarePolicy.java
@@ -62,6 +62,11 @@
  * they will only be tried if all other nodes failed). Note that this policy only penalizes slow
  * nodes, it does <em>not</em> globally sort the query plan by latency.
  *
+ * <p><strong>LWT statements:</strong> if {@link Statement#isLWT()} returns {@code true}, this
+ * policy does not apply latency-based reordering and returns the child policy's query plan as-is.
+ * This preserves LWT-specific routing assumptions (for example, deterministic replica selection
+ * when using {@link TokenAwarePolicy}).
+ *
  * <p>The latency score for a given node is a based on a form of <a
  * href="http://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average">exponential moving
  * average</a>. In other words, the latency score of a node is the average of its previously
@@ -145,7 +150,7 @@ public void run() {
         if (logger.isDebugEnabled()) {
           /*
            * For users to be able to know if the policy potentially needs tuning, we need to provide
-           * some feedback on on how things evolve. For that, we use the min computation to also check
+           * some feedback on how things evolve. For that, we use the min computation to also check
            * which host will be excluded if a query is submitted now and if any host is, we log it (but
            * we try to avoid flooding too). This is probably interesting information anyway since it
            * gets an idea of which host perform badly.
@@ -253,6 +258,13 @@ public HostDistance distance(Host host) {
    */
   @Override
   public Iterator<Host> newQueryPlan(String loggedKeyspace, Statement statement) {
+    // For LWT queries, preserve the child policy's ordering.
+    // LWT routing can rely on deterministic replica ordering (e.g. by TokenAwarePolicy), and
+    // latency-based reordering can undermine those assumptions.
+    if (statement != null && statement.isLWT()) {
+      return childPolicy.newQueryPlan(loggedKeyspace, statement);
+    }
+
     final Iterator<Host> childIter = childPolicy.newQueryPlan(loggedKeyspace, statement);
     return new AbstractIterator<Host>() {
 
diff --git a/driver-core/src/test/java/com/datastax/driver/core/policies/LatencyAwarePolicyTest.java b/driver-core/src/test/java/com/datastax/driver/core/policies/LatencyAwarePolicyTest.java
@@ -28,10 +28,13 @@
 import com.datastax.driver.core.LatencyTracker;
 import com.datastax.driver.core.ScassandraTestBase;
 import com.datastax.driver.core.Session;
+import com.datastax.driver.core.SimpleStatement;
 import com.datastax.driver.core.Statement;
 import com.datastax.driver.core.exceptions.NoHostAvailableException;
 import com.datastax.driver.core.exceptions.ReadTimeoutException;
 import com.datastax.driver.core.exceptions.UnavailableException;
+import com.google.common.collect.Lists;
+import java.util.Iterator;
 import java.util.concurrent.CountDownLatch;
 import org.testng.annotations.Test;
 
@@ -178,4 +181,50 @@ public void should_consider_latency_when_read_timeout() throws Exception {
       cluster.close();
     }
   }
+
+  @Test(groups = "short")
+  public void should_not_reorder_query_plan_for_lwt_queries() throws Exception {
+    // given: a primed query and a LatencyAwarePolicy wrapping a RoundRobinPolicy child
+    String query = "SELECT foo FROM bar";
+    primingClient.prime(queryBuilder().withQuery(query).build());
+
+    LatencyAwarePolicy latencyAwarePolicy =
+        LatencyAwarePolicy.builder(new RoundRobinPolicy()).withMininumMeasurements(1).build();
+
+    Cluster.Builder builder = super.createClusterBuilder();
+    builder.withLoadBalancingPolicy(latencyAwarePolicy);
+
+    Cluster cluster = builder.build();
+    try {
+      cluster.init();
+
+      // Statement whose isLWT() is forced to true, so the policy must take its LWT bypass path
+      Statement lwtStatement =
+          new SimpleStatement(query) {
+            @Override
+            public boolean isLWT() {
+              return true;
+            }
+          };
+
+      // Execute one request, wait on the tracker barrier, then run the Updater by hand (metrics)
+      LatencyTrackerBarrier barrier = new LatencyTrackerBarrier(1);
+      cluster.register(barrier);
+      Session session = cluster.connect();
+      session.execute(query);
+      barrier.await();
+      latencyAwarePolicy.new Updater().run();
+
+      // when: two query plans are requested for the same LWT statement
+      Iterator<Host> plan1 = latencyAwarePolicy.newQueryPlan("ks", lwtStatement);
+      Iterator<Host> plan2 = latencyAwarePolicy.newQueryPlan("ks", lwtStatement);
+
+      // then: each plan is exactly the single host (NOTE(review): one host can't show reordering)
+      Host host = retrieveSingleHost(cluster);
+      assertThat(Lists.newArrayList(plan1)).containsExactly(host);
+      assertThat(Lists.newArrayList(plan2)).containsExactly(host);
+    } finally {
+      cluster.close();
+    }
+  }
 }