Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ep/bench/test_internode.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def check_data(check_x, recv_gbl_rank_prefix_sum):

# Tune combine performance
best_time, best_results = 1e10, None
for nvl_chunk_size in range(1, 8, 1):
for nvl_chunk_size in range(1, 20, 2):
for rdma_chunk_size in range(12 if num_nodes == 2 else 8, 33, 4):
config = Config(
num_sms,
Expand Down
2 changes: 1 addition & 1 deletion ep/include/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
// imm for reordering buffer sequence tracking.
#ifdef USE_MSCCLPP_FIFO_BACKEND
#ifdef USE_NORMAL_MODE
#define kMaxInflight 8
#define kMaxInflight 16 // Increased for better combine throughput
#else
#define kMaxInflight 32
#endif
Expand Down
93 changes: 92 additions & 1 deletion ep/include/proxy_ctx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@
#include "barrier_local.hpp"
#include "util/gpu_rt.h"
#include <infiniband/verbs.h>
#include <array>
#include <atomic>
#include <map>
#include <unordered_map>
#include <vector>
#ifdef USE_NORMAL_MODE
#include <chrono>
#endif

struct TransferCmd; // forward declaration

template <typename Key>
class TokenCounter {
Expand All @@ -23,6 +29,51 @@ class TokenCounter {
MapType counter_;
};

// Fast array-based token counter for combine operations.
// Hot path: pairs inside [0, kMaxBuffers) x [0, kMaxExperts) hit a flat
// zero-initialized 2-D array; anything outside that range falls back to an
// ordered map so no update is ever silently dropped.
class FastCombineTokenCounter {
 public:
  static constexpr size_t kMaxBuffers = 8;
  static constexpr size_t kMaxExperts = 512;

  // Accumulate k tokens for (buffer_idx, expert_idx).
  void Add(int buffer_idx, int expert_idx, size_t k) {
    if (InRange(buffer_idx, expert_idx)) {
      counters_[buffer_idx][expert_idx] += k;
    } else {
      fallback_[{buffer_idx, expert_idx}] += k;
    }
  }

  // Current count for (buffer_idx, expert_idx); 0 if never added.
  size_t Get(int buffer_idx, int expert_idx) const {
    if (InRange(buffer_idx, expert_idx)) {
      return counters_[buffer_idx][expert_idx];
    }
    auto it = fallback_.find({buffer_idx, expert_idx});
    return (it == fallback_.end()) ? 0 : it->second;
  }

  // Zero the counter for a single (buffer_idx, expert_idx) pair.
  void Reset(int buffer_idx, int expert_idx) {
    if (InRange(buffer_idx, expert_idx)) {
      counters_[buffer_idx][expert_idx] = 0;
    } else {
      // Matches original semantics: inserts a zero entry if absent.
      fallback_[{buffer_idx, expert_idx}] = 0;
    }
  }

  // Zero every counter, fast path and fallback alike.
  void Clear() {
    for (auto& row : counters_) row.fill(0);
    fallback_.clear();
  }

 private:
  // True when the pair maps onto the flat array. Casting after the >= 0
  // check avoids int/size_t sign-compare warnings the original had.
  static bool InRange(int buffer_idx, int expert_idx) {
    return buffer_idx >= 0 && static_cast<size_t>(buffer_idx) < kMaxBuffers &&
           expert_idx >= 0 && static_cast<size_t>(expert_idx) < kMaxExperts;
  }

  // std::array with {} gives guaranteed zero-init and removes the
  // <cstring>/memset dependency of the original.
  std::array<std::array<size_t, kMaxExperts>, kMaxBuffers> counters_{};
  std::map<std::pair<int, int>, size_t> fallback_;  // out-of-range pairs
};

using DispatchTokenKey = std::tuple<int, int, int>;
using CombineTokenKey = std::pair<int, int>;
using NormalTokenKey = std::pair<int, int>;
Expand Down Expand Up @@ -91,7 +142,7 @@ struct ProxyCtx {
uint32_t tag = 0;

TokenCounter<DispatchTokenKey> dispatch_token_counter;
TokenCounter<CombineTokenKey> combine_token_counter;
FastCombineTokenCounter combine_token_counter; // Optimized for fast lookups
TokenCounter<NormalTokenKey> normal_token_counter;

/* low_latency_buffer_idx, expert_idx, dst_rank */
Expand Down Expand Up @@ -130,4 +181,44 @@ struct ProxyCtx {
return (static_cast<uint64_t>(static_cast<uint32_t>(dst_rank)) << 32) ^
static_cast<uint64_t>(static_cast<uint32_t>(index));
}

#ifdef USE_NORMAL_MODE
// Per-destination-rank batching state for delayed transmission: commands
// accumulate here until the batch is flushed (see flush_pending_batch_for_dst).
struct BatchState {
  std::vector<uint64_t> wrs;   // queued work-request ids, one per pending cmd
  std::vector<TransferCmd> cmds;  // the transfer commands awaiting posting
  // Timestamp of the first queued command — presumably for time-based
  // flushing; NOTE(review): appears unused under size-only batching, confirm.
  std::chrono::steady_clock::time_point first_cmd_time;
  bool has_pending = false;    // true once at least one command is queued
};
std::unordered_map<int, BatchState> pending_batches; // per dst_rank
static constexpr size_t kMaxBatchSize = 64; // Sweet spot for EFA UD mode
static constexpr int64_t kMaxBatchDelayUs =
10; // Not used (size-only batching)

// Pre-allocated buffers to avoid allocation in hot path
std::unordered_map<int, std::vector<size_t>> reusable_dst_rank_wr_ids;
std::unordered_map<size_t, std::vector<size_t>> reusable_ring_to_indices;
std::vector<uint64_t> reusable_ring_wrids;
std::vector<ibv_sge> reusable_sges;
std::vector<ibv_send_wr> reusable_wrs;

// Cache for EFA UD addressing - avoid repeated ibv_wr_set_ud_addr
struct UDAddrCache {
ibv_ah* ah = nullptr;
uint32_t qpn = 0;
uint32_t qkey = QKEY;
};
std::unordered_map<size_t, UDAddrCache> ud_addr_cache; // ring_idx -> cache

// Pre-allocated array for sequence numbers (replacing map)
static constexpr size_t kSeqArraySize =
16384; // Support up to 16K unique (dst_rank, index) pairs
std::array<std::atomic<uint8_t>, kSeqArraySize> seq_array{};

// Map a (dst_rank, index) pair to a slot in seq_array.
// NOTE(review): distinct pairs can collide (any index >= 4096 wraps into
// another rank's stripe), so callers must tolerate shared slots.
inline size_t seq_hash(int dst_rank, size_t index) const {
  // 4096 slots per rank, folded into the table size.
  const size_t stripe_base = static_cast<size_t>(dst_rank) * 4096;
  return (stripe_base + index) % kSeqArraySize;
}
#endif
};
8 changes: 8 additions & 0 deletions ep/include/rdma.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,14 @@ void post_rdma_async_batched(ProxyCtx& S, void* buf, size_t num_wrs,
std::vector<TransferCmd> const& cmds_to_post,
std::vector<std::unique_ptr<ProxyCtx>>& ctxs,
int my_rank, int thread_idx);
#ifdef USE_NORMAL_MODE
void flush_pending_batch_for_dst(ProxyCtx& S, int dst_rank, void* buf,
std::vector<std::unique_ptr<ProxyCtx>>& ctxs,
int my_rank, int thread_idx);
void flush_all_pending_batches(ProxyCtx& S, void* buf,
std::vector<std::unique_ptr<ProxyCtx>>& ctxs,
int my_rank, int thread_idx);
#endif
void local_process_completions(ProxyCtx& S,
std::unordered_set<uint64_t>& acked_wrs,
int thread_idx, ibv_wc* wc, int ne,
Expand Down
7 changes: 7 additions & 0 deletions ep/src/proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,9 @@ void Proxy::run_dual() {
uint64_t my_tail = 0;
size_t seen = 0;
std::set<PendingUpdate> pending_atomic_updates;
#ifdef USE_NORMAL_MODE
auto last_flush_check = std::chrono::steady_clock::now();
#endif
while (ctx_.progress_run.load(std::memory_order_acquire)) {
poll_cq_dual(ctx_, acked_wrs_, cfg_.thread_idx, ring, ctx_by_tag_,
atomic_buffer_ptr_, cfg_.num_ranks, cfg_.num_experts,
Expand All @@ -416,6 +419,10 @@ void Proxy::run_dual() {

#ifdef USE_NORMAL_MODE
barrier_check();

// Note: Periodic flush removed - using size-only batching
// (kMaxBatchSize=64) for maximum throughput. Time-based flushing overhead
// was degrading performance.
#endif
}
}
Expand Down
Loading