Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ep/bench/test_internode.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def check_data(check_x, recv_gbl_rank_prefix_sum):

# Tune combine performance
best_time, best_results = 1e10, None
for nvl_chunk_size in range(1, 8, 1):
for nvl_chunk_size in range(1, 20, 2):
for rdma_chunk_size in range(12 if num_nodes == 2 else 8, 33, 4):
config = Config(
num_sms,
Expand Down
2 changes: 1 addition & 1 deletion ep/include/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
// imm for reordering buffer sequence tracking.
#ifdef USE_MSCCLPP_FIFO_BACKEND
#ifdef USE_NORMAL_MODE
#define kMaxInflight 8
#define kMaxInflight 16 // Increased for better combine throughput
#else
#define kMaxInflight 32
#endif
Expand Down
93 changes: 92 additions & 1 deletion ep/include/proxy_ctx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@
#include "barrier_local.hpp"
#include "util/gpu_rt.h"
#include <infiniband/verbs.h>
#include <array>
#include <atomic>
#include <map>
#include <unordered_map>
#include <vector>
#ifdef USE_NORMAL_MODE
#include <chrono>
#endif

struct TransferCmd; // forward declaration

template <typename Key>
class TokenCounter {
Expand All @@ -23,6 +29,51 @@ class TokenCounter {
MapType counter_;
};

// Fast array-based token counter for combine operations.
// Hot path: pairs inside [0, kMaxBuffers) x [0, kMaxExperts) hit a flat
// zero-initialized 2-D array; anything outside that range falls back to an
// ordered map so no update is ever silently dropped.
class FastCombineTokenCounter {
 public:
  static constexpr size_t kMaxBuffers = 8;
  static constexpr size_t kMaxExperts = 512;

  // Accumulate k tokens for (buffer_idx, expert_idx).
  void Add(int buffer_idx, int expert_idx, size_t k) {
    if (InRange(buffer_idx, expert_idx)) {
      counters_[buffer_idx][expert_idx] += k;
    } else {
      fallback_[{buffer_idx, expert_idx}] += k;
    }
  }

  // Current count for (buffer_idx, expert_idx); 0 if never added.
  size_t Get(int buffer_idx, int expert_idx) const {
    if (InRange(buffer_idx, expert_idx)) {
      return counters_[buffer_idx][expert_idx];
    }
    auto it = fallback_.find({buffer_idx, expert_idx});
    return (it == fallback_.end()) ? 0 : it->second;
  }

  // Zero the counter for a single (buffer_idx, expert_idx) pair.
  void Reset(int buffer_idx, int expert_idx) {
    if (InRange(buffer_idx, expert_idx)) {
      counters_[buffer_idx][expert_idx] = 0;
    } else {
      // Matches original semantics: inserts a zero entry if absent.
      fallback_[{buffer_idx, expert_idx}] = 0;
    }
  }

  // Zero every counter, fast path and fallback alike.
  void Clear() {
    for (auto& row : counters_) row.fill(0);
    fallback_.clear();
  }

 private:
  // True when the pair maps onto the flat array. Casting after the >= 0
  // check avoids int/size_t sign-compare warnings the original had.
  static bool InRange(int buffer_idx, int expert_idx) {
    return buffer_idx >= 0 && static_cast<size_t>(buffer_idx) < kMaxBuffers &&
           expert_idx >= 0 && static_cast<size_t>(expert_idx) < kMaxExperts;
  }

  // std::array with {} gives guaranteed zero-init and removes the
  // <cstring>/memset dependency of the original.
  std::array<std::array<size_t, kMaxExperts>, kMaxBuffers> counters_{};
  std::map<std::pair<int, int>, size_t> fallback_;  // out-of-range pairs
};

using DispatchTokenKey = std::tuple<int, int, int>;
using CombineTokenKey = std::pair<int, int>;
using NormalTokenKey = std::pair<int, int>;
Expand Down Expand Up @@ -91,7 +142,7 @@ struct ProxyCtx {
uint32_t tag = 0;

TokenCounter<DispatchTokenKey> dispatch_token_counter;
TokenCounter<CombineTokenKey> combine_token_counter;
FastCombineTokenCounter combine_token_counter; // Optimized for fast lookups
TokenCounter<NormalTokenKey> normal_token_counter;

/* low_latency_buffer_idx, expert_idx, dst_rank */
Expand Down Expand Up @@ -130,4 +181,44 @@ struct ProxyCtx {
return (static_cast<uint64_t>(static_cast<uint32_t>(dst_rank)) << 32) ^
static_cast<uint64_t>(static_cast<uint32_t>(index));
}

#ifdef USE_NORMAL_MODE
// Per-destination-rank batching state for delayed transmission: commands
// accumulate here until the batch is flushed (see flush_pending_batch_for_dst).
struct BatchState {
  std::vector<uint64_t> wrs;   // queued work-request ids, one per pending cmd
  std::vector<TransferCmd> cmds;  // the transfer commands awaiting posting
  // Timestamp of the first queued command — presumably for time-based
  // flushing; NOTE(review): appears unused under size-only batching, confirm.
  std::chrono::steady_clock::time_point first_cmd_time;
  bool has_pending = false;    // true once at least one command is queued
};
std::unordered_map<int, BatchState> pending_batches; // per dst_rank
static constexpr size_t kMaxBatchSize = 64; // Sweet spot for EFA UD mode
static constexpr int64_t kMaxBatchDelayUs =
10; // Not used (size-only batching)

// Pre-allocated buffers to avoid allocation in hot path
std::unordered_map<int, std::vector<size_t>> reusable_dst_rank_wr_ids;
std::unordered_map<size_t, std::vector<size_t>> reusable_ring_to_indices;
std::vector<uint64_t> reusable_ring_wrids;
std::vector<ibv_sge> reusable_sges;
std::vector<ibv_send_wr> reusable_wrs;

// Cache for EFA UD addressing - avoid repeated ibv_wr_set_ud_addr
struct UDAddrCache {
ibv_ah* ah = nullptr;
uint32_t qpn = 0;
uint32_t qkey = QKEY;
};
std::unordered_map<size_t, UDAddrCache> ud_addr_cache; // ring_idx -> cache

// Pre-allocated array for sequence numbers (replacing map)
static constexpr size_t kSeqArraySize =
16384; // Support up to 16K unique (dst_rank, index) pairs
std::array<std::atomic<uint8_t>, kSeqArraySize> seq_array{};

// Map a (dst_rank, index) pair to a slot in seq_array.
// NOTE(review): distinct pairs can collide (any index >= 4096 wraps into
// another rank's stripe), so callers must tolerate shared slots.
inline size_t seq_hash(int dst_rank, size_t index) const {
  // 4096 slots per rank, folded into the table size.
  const size_t stripe_base = static_cast<size_t>(dst_rank) * 4096;
  return (stripe_base + index) % kSeqArraySize;
}
#endif
};
8 changes: 8 additions & 0 deletions ep/include/rdma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,14 @@ void post_rdma_async_batched(ProxyCtx& S, void* buf, size_t num_wrs,
std::vector<TransferCmd> const& cmds_to_post,
std::vector<std::unique_ptr<ProxyCtx>>& ctxs,
int my_rank, int thread_idx);
#ifdef USE_NORMAL_MODE
void flush_pending_batch_for_dst(ProxyCtx& S, int dst_rank, void* buf,
std::vector<std::unique_ptr<ProxyCtx>>& ctxs,
int my_rank, int thread_idx);
void flush_all_pending_batches(ProxyCtx& S, void* buf,
std::vector<std::unique_ptr<ProxyCtx>>& ctxs,
int my_rank, int thread_idx);
#endif
void local_process_completions(ProxyCtx& S,
std::unordered_set<uint64_t>& acked_wrs,
int thread_idx, ibv_wc* wc, int ne,
Expand Down
7 changes: 7 additions & 0 deletions ep/src/proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,9 @@ void Proxy::run_dual() {
uint64_t my_tail = 0;
size_t seen = 0;
std::set<PendingUpdate> pending_atomic_updates;
#ifdef USE_NORMAL_MODE
auto last_flush_check = std::chrono::steady_clock::now();
#endif
while (ctx_.progress_run.load(std::memory_order_acquire)) {
poll_cq_dual(ctx_, acked_wrs_, cfg_.thread_idx, ring, ctx_by_tag_,
atomic_buffer_ptr_, cfg_.num_ranks, cfg_.num_experts,
Expand All @@ -416,6 +419,10 @@ void Proxy::run_dual() {

#ifdef USE_NORMAL_MODE
barrier_check();

// Note: Periodic flush removed - using size-only batching
// (kMaxBatchSize=64) for maximum throughput. Time-based flushing overhead
// was degrading performance.
#endif
}
}
Expand Down
Loading