From 50d50a7a166a68c83a8593cd861f428018c3f47f Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Sun, 29 Aug 2021 14:18:12 +0100 Subject: [PATCH 01/19] initial commit --- .gitmodules | 3 + tensorpipe/CMakeLists.txt | 23 + tensorpipe/common/efa_read_write_ops.h | 201 +++++++++ tensorpipe/common/fabric.cc | 129 ++++++ tensorpipe/common/fabric.h | 146 +++++++ tensorpipe/transport/efa/connection_impl.cc | 445 ++++++++++++++++++++ tensorpipe/transport/efa/connection_impl.h | 179 ++++++++ tensorpipe/transport/efa/constants.h | 52 +++ tensorpipe/transport/efa/context_impl.cc | 109 +++++ tensorpipe/transport/efa/context_impl.h | 60 +++ tensorpipe/transport/efa/error.cc | 37 ++ tensorpipe/transport/efa/error.h | 48 +++ tensorpipe/transport/efa/factory.cc | 27 ++ tensorpipe/transport/efa/factory.h | 23 + tensorpipe/transport/efa/listener_impl.cc | 154 +++++++ tensorpipe/transport/efa/listener_impl.h | 60 +++ tensorpipe/transport/efa/mode.md | 49 +++ tensorpipe/transport/efa/reactor.cc | 265 ++++++++++++ tensorpipe/transport/efa/reactor.h | 160 +++++++ tensorpipe/transport/efa/sockaddr.cc | 142 +++++++ tensorpipe/transport/efa/sockaddr.h | 57 +++ tensorpipe/transport/efa/utility.cc | 178 ++++++++ tensorpipe/transport/efa/utility.h | 26 ++ third_party/libfabric | 1 + 24 files changed, 2574 insertions(+) create mode 100644 tensorpipe/common/efa_read_write_ops.h create mode 100644 tensorpipe/common/fabric.cc create mode 100644 tensorpipe/common/fabric.h create mode 100644 tensorpipe/transport/efa/connection_impl.cc create mode 100644 tensorpipe/transport/efa/connection_impl.h create mode 100644 tensorpipe/transport/efa/constants.h create mode 100644 tensorpipe/transport/efa/context_impl.cc create mode 100644 tensorpipe/transport/efa/context_impl.h create mode 100644 tensorpipe/transport/efa/error.cc create mode 100644 tensorpipe/transport/efa/error.h create mode 100644 tensorpipe/transport/efa/factory.cc create mode 100644 tensorpipe/transport/efa/factory.h create mode 100644 
tensorpipe/transport/efa/listener_impl.cc create mode 100644 tensorpipe/transport/efa/listener_impl.h create mode 100644 tensorpipe/transport/efa/mode.md create mode 100644 tensorpipe/transport/efa/reactor.cc create mode 100644 tensorpipe/transport/efa/reactor.h create mode 100644 tensorpipe/transport/efa/sockaddr.cc create mode 100644 tensorpipe/transport/efa/sockaddr.h create mode 100644 tensorpipe/transport/efa/utility.cc create mode 100644 tensorpipe/transport/efa/utility.h create mode 160000 third_party/libfabric diff --git a/.gitmodules b/.gitmodules index 7207ef9b6..1cfc7c11d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,6 @@ [submodule "third_party/libnop"] path = third_party/libnop url = https://github.com/google/libnop.git +[submodule "third_party/libfabric"] + path = third_party/libfabric + url = https://github.com/ofiwg/libfabric diff --git a/tensorpipe/CMakeLists.txt b/tensorpipe/CMakeLists.txt index 5c3606471..5ed5bded4 100644 --- a/tensorpipe/CMakeLists.txt +++ b/tensorpipe/CMakeLists.txt @@ -164,6 +164,29 @@ if(TP_ENABLE_IBV) set(TENSORPIPE_HAS_IBV_TRANSPORT 1) endif() +### EFA + +tp_conditional_backend( + TP_ENABLE_EFA "Enable EFA transport" "LINUX") +if(TP_ENABLE_EFA) + list(APPEND TP_SRCS + common/fabric.cc + transport/efa/connection_impl.cc + transport/efa/context_impl.cc + transport/efa/error.cc + transport/efa/factory.cc + transport/efa/listener_impl.cc + transport/efa/reactor.cc + transport/efa/sockaddr.cc + transport/efa/utility.cc) + list(APPEND TP_PUBLIC_HDRS + transport/efa/error.h + transport/efa/factory.h + transport/efa/utility.h) + set(TENSORPIPE_HAS_EFA_TRANSPORT 1) + list(APPEND TP_INCLUDE_DIRS $) + # list(APPEND TP_STATIC_OR_SHARED fabric) +endif() ## MAC OS specific library deps diff --git a/tensorpipe/common/efa_read_write_ops.h b/tensorpipe/common/efa_read_write_ops.h new file mode 100644 index 000000000..714d71a1c --- /dev/null +++ b/tensorpipe/common/efa_read_write_ops.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 
Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace tensorpipe { + +// The read operation captures all state associated with reading a +// fixed length chunk of data from the underlying connection. All +// reads are required to include a word-sized header containing the +// number of bytes in the operation. This makes it possible for the +// read side of the connection to either 1) not know how many bytes +// to expected, and dynamically allocate, or 2) know how many bytes +// to expect, and preallocate the destination memory. +class EFAReadOperation { + public: + enum Mode { + READ_LENGTH, + READ_PAYLOAD, + COMPLETE, + }; + + public: + using read_callback_fn = + std::function; + + explicit inline EFAReadOperation(read_callback_fn fn); + + inline EFAReadOperation(void* ptr, size_t length, read_callback_fn fn); + + // Called when a buffer is needed to read data from stream. + inline void allocFromLoop(); + + // Called when data has been read from stream. + // inline void readFromLoop(); + + // Returns if this read operation is complete. + inline bool completeFromLoop() const; + + // Invoke user callback. + inline void callbackFromLoop(const Error& error); + + // private: + Mode mode_{READ_LENGTH}; + char* ptr_{nullptr}; + + // Number of bytes as specified by the user (if applicable). + optional givenLength_; + + // Number of bytes to expect as read from the connection. + size_t readLength_{0}; + + // Number of bytes read from the connection. + // This is reset to 0 when we advance from READ_LENGTH to READ_PAYLOAD. + size_t bytesRead_{0}; + + // Holds temporary allocation if no length was specified. + std::unique_ptr buffer_{nullptr}; + + // User callback. 
+ read_callback_fn fn_; +}; + +EFAReadOperation::EFAReadOperation(read_callback_fn fn) : fn_(std::move(fn)) {} + +EFAReadOperation::EFAReadOperation( + void* ptr, + size_t length, + read_callback_fn fn) + : ptr_(static_cast(ptr)), givenLength_(length), fn_(std::move(fn)) {} + +void EFAReadOperation::allocFromLoop() { + if (givenLength_.has_value()) { + TP_DCHECK(ptr_ != nullptr || givenLength_.value() == 0); + TP_DCHECK_EQ(readLength_, givenLength_.value()); + } else { + TP_DCHECK(ptr_ == nullptr); + buffer_ = std::make_unique(readLength_); + ptr_ = buffer_.get(); + } +} + +// void EFAReadOperation::readFromLoop(size_t nread) { +// bytesRead_ += nread; +// if (mode_ == READ_LENGTH) { +// TP_DCHECK_LE(bytesRead_, sizeof(readLength_)); +// if (bytesRead_ == sizeof(readLength_)) { +// if (givenLength_.has_value()) { +// TP_DCHECK(ptr_ != nullptr || givenLength_.value() == 0); +// TP_DCHECK_EQ(readLength_, givenLength_.value()); +// } else { +// TP_DCHECK(ptr_ == nullptr); +// buffer_ = std::make_unique(readLength_); +// ptr_ = buffer_.get(); +// } +// if (readLength_ == 0) { +// mode_ = COMPLETE; +// } else { +// mode_ = READ_PAYLOAD; +// } +// bytesRead_ = 0; +// } +// } else if (mode_ == READ_PAYLOAD) { +// TP_DCHECK_LE(bytesRead_, readLength_); +// if (bytesRead_ == readLength_) { +// mode_ = COMPLETE; +// } +// } else { +// TP_THROW_ASSERT() << "invalid mode " << mode_; +// } +// } + +bool EFAReadOperation::completeFromLoop() const { + return mode_ == COMPLETE; +} + +void EFAReadOperation::callbackFromLoop(const Error& error) { + fn_(error, ptr_, readLength_); +} + +// The write operation captures all state associated with writing a +// fixed length chunk of data from the underlying connection. The +// write includes a word-sized header containing the length of the +// write. This header is a member field on this class and therefore +// the instance must be kept alive and the reference to the instance +// must remain valid until the write callback has been called. 
+class EFAWriteOperation { + public: + + enum Mode { + WRITE_LENGTH, + WRITE_PAYLOAD, // Not used + WAIT_TO_COMPLETE, + COMPLETE, + }; + + using write_callback_fn = std::function; + + inline EFAWriteOperation( + const void* ptr, + size_t length, + write_callback_fn fn); + + struct Buf { + char* base; + size_t len; + }; + + inline std::tuple getBufs(); + + // Invoke user callback. + inline void callbackFromLoop(const Error& error); + + // private: + Mode mode_{WRITE_LENGTH}; + const char* ptr_; + const size_t length_; + + // Buffers (structs with pointers and lengths) to write to stream. + std::array bufs_; + + // User callback. + write_callback_fn fn_; +}; + +EFAWriteOperation::EFAWriteOperation( + const void* ptr, + size_t length, + write_callback_fn fn) + : ptr_(static_cast(ptr)), length_(length), fn_(std::move(fn)) { + bufs_[0].base = const_cast(reinterpret_cast(&length_)); + bufs_[0].len = sizeof(length_); + bufs_[1].base = const_cast(ptr_); + bufs_[1].len = length_; +} + +std::tuple EFAWriteOperation::getBufs() { + size_t numBuffers = length_ == 0 ? 1 : 2; + return std::make_tuple(bufs_.data(), numBuffers); +} + +void EFAWriteOperation::callbackFromLoop(const Error& error) { + fn_(error); +} + +} // namespace tensorpipe diff --git a/tensorpipe/common/fabric.cc b/tensorpipe/common/fabric.cc new file mode 100644 index 000000000..a6e8f9985 --- /dev/null +++ b/tensorpipe/common/fabric.cc @@ -0,0 +1,129 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace tensorpipe{
+
+// Opens every libfabric object this context owns, in dependency order
+// (fabric -> domain -> address vector -> completion queue -> endpoint), binds
+// the CQ and AV to the endpoint, enables it and records the endpoint name.
+//
+// Each raw handle starts as nullptr and is only handed to its owning pointer
+// after the corresponding call has been checked, so a failed open never leaves
+// an indeterminate pointer for FabricDeleter to close.
+FabricContext::FabricContext(){
+  UniqueFabricPtr<struct fi_info> fabinfo = getFabricInfo();
+  struct fi_info *info = fabinfo.get();
+
+  // fi_fabric: create fabric
+  struct fid_fabric *fabric_ = nullptr;
+  int ret = fi_fabric(info->fabric_attr, &fabric_, nullptr);
+  TP_CHECK_EFA_RET(ret, "Couldn't open a fabric provider");
+  fabric.reset(fabric_);
+
+  // fi_domain: create domain
+  struct fid_domain *domain_ = nullptr;
+  ret = fi_domain(fabric.get(), info, &domain_, nullptr);
+  TP_CHECK_EFA_RET(ret, "Couldn't open a fabric access domain");
+  domain.reset(domain_);
+
+  // fi_av_open: create address vector
+  struct fi_av_attr av_attr = {};
+  av_attr.type = FI_AV_TABLE;
+  struct fid_av *av_ = nullptr;
+  ret = fi_av_open(domain.get(), &av_attr, &av_, nullptr);
+  TP_CHECK_EFA_RET(ret, "Couldn't open AV");
+  av.reset(av_);
+
+  // fi_cq_open: open completion queue
+  struct fi_cq_attr cq_attr = {};
+  cq_attr.format = FI_CQ_FORMAT_TAGGED;
+  cq_attr.size = info->rx_attr->size;
+  struct fid_cq *cq_ = nullptr;
+  ret = fi_cq_open(domain.get(), &cq_attr, &cq_, nullptr);
+  TP_CHECK_EFA_RET(ret, "Couldn't open CQ");
+  cq.reset(cq_);
+
+  // fi_endpoint: create transport level communication endpoint(s)
+  struct fid_ep *ep_ = nullptr;
+  ret = fi_endpoint(domain.get(), info, &ep_, nullptr);
+  TP_CHECK_EFA_RET(ret, "Couldn't allocate endpoint");
+  ep.reset(ep_);
+
+  // fi_ep_bind: bind CQ and AV to the endpoint
+  ret = fi_ep_bind(ep.get(), (fid_t)cq.get(), FI_RECV | FI_TRANSMIT);
+  TP_CHECK_EFA_RET(ret, "Couldn't bind EP-CQ");
+  ret = fi_ep_bind(ep.get(), (fid_t)av.get(), 0);
+  TP_CHECK_EFA_RET(ret, "Couldn't bind EP-AV");
+
+  // fi_enable: enable endpoint for communication
+  ret = fi_enable(ep.get());
+  TP_CHECK_EFA_RET(ret, "Couldn't enable endpoint");
+
+  // fi_getname: get endpoint name (addr.len is in/out: capacity, then size)
+  ret = fi_getname((fid_t)ep.get(), addr.name, &addr.len);
+  TP_CHECK_EFA_RET(ret, "Call to fi_getname() failed");
+  // set readable address name
+  fi_av_straddr(av.get(), addr.name, readable_addr.name, &readable_addr.len);
+}
+
+// Queries libfabric for an "efa" provider offering a reliable-datagram,
+// tagged-message endpoint and returns the resulting fi_info list.
+// Fails through TP_CHECK_EFA_RET only when fi_getinfo reports an error.
+UniqueFabricPtr<struct fi_info> FabricContext::getFabricInfo(){
+  UniqueFabricPtr<struct fi_info> hints(fi_allocinfo());
+  if (!hints) {
+    TP_THROW_ASSERT() << "fi_allocinfo() failed to allocate hints";
+  }
+  hints->mode = FI_CONTEXT;
+  hints->ep_attr->type = FI_EP_RDM; // Reliable Datagram
+  hints->caps = FI_TAGGED | FI_MSG | FI_REMOTE_COMM | FI_DIRECTED_RECV | FI_LOCAL_COMM | FI_SOURCE;
+  hints->tx_attr->msg_order = FI_ORDER_SAS;
+  hints->rx_attr->msg_order = FI_ORDER_SAS;
+  hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
+  hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
+  hints->domain_attr->caps =
+      FI_LOCAL_COMM | FI_REMOTE_COMM; // Enable local loopback
+  hints->domain_attr->av_type = FI_AV_TABLE;
+  // Heap-allocated on purpose: fi_freeinfo() releases prov_name.
+  hints->fabric_attr->prov_name = strdup("efa");
+
+  struct fi_info* info_ = nullptr;
+  int ret =
+      fi_getinfo(FABRIC_VERSION, nullptr, nullptr, 0, hints.get(), &info_);
+  // Previously this raised unconditionally, so the function could never
+  // return; the macro produces the same message but only when ret != 0.
+  TP_CHECK_EFA_RET(ret, "Could not find any optimal provider");
+
+  UniqueFabricPtr<struct fi_info> info(info_);
+  return info;
+}
+
+// Creates the endpoint wrapper together with its FabricContext.
+FabricEndpoint::FabricEndpoint(){
+  fabric_ctx = std::make_unique<FabricContext>();
+}
+
+// Reads up to `count` completions into the caller's `cq_entries` array and
+// the matching source addresses into `src_addrs`. Returns fi_cq_readfrom's
+// result: the number of entries read, or a negative libfabric error code.
+int FabricEndpoint::PollCQ(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count){
+  // Pass the array itself; passing &cq_entries made libfabric write the
+  // completions over the pointer parameter on the stack.
+  int ret = fi_cq_readfrom(fabric_ctx->cq.get(), cq_entries, count, src_addrs);
+  return ret;
+}
+
+// Posts a tagged send. Returns 0 on success or -FI_EAGAIN when the provider
+// cannot accept the request yet; any other failure is reported through
+// TP_CHECK_EFA_RET.
+int FabricEndpoint::PushSendEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, void* context){
+  int ret = fi_tsend(fabric_ctx->ep.get(), buffer, size, nullptr, dest_addr, tag, context);
+  if (ret < 0 && ret != -FI_EAGAIN) {
+    TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message");
+  }
+  return ret;
+}
+
+// Posts a tagged receive from `src_addr`; bits set in `ignore` are masked out
+// of the tag comparison. Same return convention as PushSendEvent.
+int FabricEndpoint::PushRecvEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t src_addr, uint64_t ignore, void* context){
+  int ret = fi_trecv(fabric_ctx->ep.get(), buffer, size, nullptr, src_addr, tag, ignore, context);
+  if (ret < 0 && ret != -FI_EAGAIN) {
+    TP_CHECK_EFA_RET(ret, "Unable to do fi_trecv message");
+  }
+  return ret;
+}
+
+} // namespace tensorpipe
\ No newline at end of file
diff --git a/tensorpipe/common/fabric.h b/tensorpipe/common/fabric.h
new file mode 100644
index 000000000..6d46319e7
--- /dev/null
+++ b/tensorpipe/common/fabric.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+static const int FABRIC_VERSION = FI_VERSION(1, 10);
+static const int kMaxConcurrentWorkRequest = 4224;
+
+namespace tensorpipe {
+
+#define TP_CHECK_EFA_RET(ret, msg) \
+  do { \
+    if (ret != 0) { \
+      TP_THROW_ASSERT() << msg << ". Return Code: " << ret \
+                        << ". 
ERROR: " << fi_strerror(-ret); \
+    } \
+  } while (false)
+
+// Deleter shared by every UniqueFabricPtr: fi_info lists are released with
+// fi_freeinfo(), every fid-based handle is closed with fi_close().
+struct FabricDeleter {
+  void operator()(fi_info* info) {
+    if (info)
+      fi_freeinfo(info);
+  }
+  void operator()(fid* fid) {
+    if (fid)
+      fi_close(fid);
+  }
+  void operator()(fid_domain* fid) {
+    if (fid)
+      fi_close((fid_t)fid);
+  }
+  void operator()(fid_fabric* fid) {
+    if (fid)
+      fi_close((fid_t)fid);
+  }
+  void operator()(fid_cq* fid) {
+    if (fid)
+      fi_close((fid_t)fid);
+  }
+  void operator()(fid_av* fid) {
+    if (fid)
+      fi_close((fid_t)fid);
+  }
+  void operator()(fid_ep* fid) {
+    if (fid)
+      fi_close((fid_t)fid);
+  }
+  void operator()(fid_eq* fid) {
+    if (fid)
+      fi_close((fid_t)fid);
+  }
+};
+
+// Owning pointer for a libfabric object, released through FabricDeleter.
+template <typename T>
+using UniqueFabricPtr = std::unique_ptr<T, FabricDeleter>;
+
+// Fixed-capacity holder for a fabric endpoint name.
+// `len` is public and is also written by libfabric (fi_getname and
+// fi_av_straddr take it as an in/out size), so every reader below clamps it
+// to the capacity of `name` instead of trusting it.
+struct FabricAddr {
+  // endpoint name
+  char name[64] = {};
+  // length of endpoint name
+  size_t len = sizeof(name);
+
+  // Renders the name as a bracketed, comma-separated list of byte values.
+  std::string DebugStr() const {
+    const size_t used = len < sizeof(name) ? len : sizeof(name);
+    std::stringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < used; i++) {
+      ss << std::to_string(name[i]) << ",";
+    }
+    ss << "]";
+    return ss.str();
+  }
+
+  // Returns the raw name bytes (not NUL-terminated text) as a string.
+  std::string str() const {
+    const size_t used = len < sizeof(name) ? len : sizeof(name);
+    return std::string(name, used);
+  }
+
+  // Copies `ep_name_len` bytes from `ep_name`, truncating to the capacity of
+  // `name`; the remainder of `name` is zero-filled. A null source yields an
+  // empty name. The old code always read sizeof(name) bytes from the source,
+  // overrunning any shorter buffer, and stored an unclamped length.
+  void CopyFrom(void* ep_name, const size_t ep_name_len) {
+    memset(name, 0, sizeof(name));
+    if (ep_name == nullptr) {
+      len = 0;
+      return;
+    }
+    len = ep_name_len < sizeof(name) ? ep_name_len : sizeof(name);
+    memcpy(name, ep_name, len);
+  }
+
+  // Writes the whole sizeof(name)-byte buffer to `ep_name` and the stored
+  // length to `ep_name_len`; the destination must hold sizeof(name) bytes.
+  void CopyTo(char* ep_name, size_t* ep_name_len) {
+    *(ep_name_len) = len;
+    memcpy(ep_name, name, sizeof(name));
+  }
+};
+
+// Owns the libfabric objects behind one endpoint. Members are declared in
+// creation order, so they are destroyed in the reverse order (endpoint first,
+// fabric last).
+class FabricContext {
+ public:
+  // fabric top-level object
+  UniqueFabricPtr<struct fid_fabric> fabric;
+  // domains which maps to a specific local network interface adapter
+  UniqueFabricPtr<struct fid_domain> domain;
+  // completion queue
+  UniqueFabricPtr<struct fid_cq> cq;
+  // address vector
+  UniqueFabricPtr<struct fid_av> av;
+  // the endpoint
+  UniqueFabricPtr<struct fid_ep> ep;
+  // endpoint name
+  struct FabricAddr addr;
+  // readable endpoint name
+  struct FabricAddr readable_addr;
+
+ public:
+  explicit FabricContext();
+
+ private:
+  UniqueFabricPtr<struct fi_info> getFabricInfo();
+};
+
+class FabricEndpoint {
+ public:
+  FabricEndpoint();
+
+  fi_addr_t AddPeerAddr(FabricAddr* addr);
+
+  int PushSendEvent(void* buffer, size_t 
size, uint64_t tag, fi_addr_t dst_addr, void* context = nullptr); + int PushRecvEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t src_addr, uint64_t ignore, void* context = nullptr); + + int PollCQ(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count); + + // Fabric Context contains everything + std::unique_ptr fabric_ctx; +}; +} // namespace tensorpipe \ No newline at end of file diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc new file mode 100644 index 000000000..1f809afec --- /dev/null +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -0,0 +1,445 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorpipe { +namespace transport { +namespace efa { + +namespace { + +// When the connection gets closed, to avoid leaks, it needs to "reclaim" all +// the work requests that it had posted, by waiting for their completion. They +// may however complete with error, which makes it harder to identify and +// distinguish them from failing incoming requests because, in principle, we +// cannot access the opcode field of a failed work completion. Therefore, we +// assign a special ID to those types of requests, to match them later on. +constexpr uint64_t kWriteRequestId = 1; +constexpr uint64_t kAckRequestId = 2; + +// The data that each queue pair endpoint needs to send to the other endpoint in +// order to set up the queue pair itself. This data is transferred over a TCP +// connection. 
+// struct Exchange { +// efaSetupInformation setupInfo; +// uint64_t memoryRegionPtr; +// uint32_t memoryRegionKey; +// }; + +} // namespace + +ConnectionImpl::ConnectionImpl( + ConstructorToken token, + std::shared_ptr context, + std::string id, + Socket socket) + : ConnectionImplBoilerplate( + token, + std::move(context), + std::move(id)), + socket_(std::move(socket)) {} + +ConnectionImpl::ConnectionImpl( + ConstructorToken token, + std::shared_ptr context, + std::string id, + std::string addr) + : ConnectionImplBoilerplate( + token, + std::move(context), + std::move(id)), + sockaddr_(Sockaddr::createInetSockAddr(addr)) {} + +void ConnectionImpl::initImplFromLoop() { + context_->enroll(*this); + + Error error; + // The connection either got a socket or an address, but not both. + TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); + if (!socket_.hasValue()) { + std::tie(error, socket_) = + Socket::createForFamily(sockaddr_->addr()->sa_family); + if (error) { + setError(std::move(error)); + return; + } + error = socket_.reuseAddr(true); + if (error) { + setError(std::move(error)); + return; + } + error = socket_.connect(sockaddr_.value()); + if (error) { + setError(std::move(error)); + return; + } + } + // Ensure underlying control socket is non-blocking such that it + // works well with event driven I/O. + error = socket_.block(false); + if (error) { + setError(std::move(error)); + return; + } + // Register methods to be called when our peer writes to our inbox and reads + // from our outbox. + // context_->getReactor().registerQp(qp_->qp_num, shared_from_this()); + + // We're sending address first, so wait for writability. 
+ state_ = SEND_ADDR; + context_->registerDescriptor(socket_.fd(), EPOLLOUT, shared_from_this()); +} + +void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { + readOperations_.emplace_back(std::move(fn)); + + processReadOperationsFromLoop(); +} + +// void ConnectionImpl::readImplFromLoop( +// AbstractNopHolder& object, +// read_nop_callback_fn fn) { +// readOperations_.emplace_back( +// &object, +// [fn{std::move(fn)}]( +// const Error& error, const void* /* unused */, size_t /* unused */) +// { +// fn(error); +// }); + +// // If the inbox already contains some data, we may be able to process this +// // operation right away. +// processReadOperationsFromLoop(); +// } + +void ConnectionImpl::readImplFromLoop( + void* ptr, + size_t length, + read_callback_fn fn) { + readOperations_.emplace_back(ptr, length, std::move(fn)); + + // If the inbox already contains some data, we may be able to process this + // operation right away. + processReadOperationsFromLoop(); +} + +void ConnectionImpl::writeImplFromLoop( + const void* ptr, + size_t length, + write_callback_fn fn) { + writeOperations_.emplace_back(ptr, length, std::move(fn)); + + // If the outbox has some free space, we may be able to process this operation + // right away. + processWriteOperationsFromLoop(); +} + +void ConnectionImpl::handleEventsFromLoop(int events) { + TP_DCHECK(context_->inLoop()); + TP_VLOG(9) << "Connection " << id_ << " is handling an event on its socket (" + << EpollLoop::formatEpollEvents(events) << ")"; + + // Handle only one of the events in the mask. Events on the control + // file descriptor are rare enough for the cost of having epoll call + // into this function multiple times to not matter. The benefit is + // that every handler can close and unregister the control file + // descriptor from the event loop, without worrying about the next + // handler trying to do so as well. 
+ // In some cases the socket could be in a state where it's both in an error + // state and readable/writable. If we checked for EPOLLIN or EPOLLOUT first + // and then returned after handling them, we would keep doing so forever and + // never reach the error handling. So we should keep the error check first. + if (events & EPOLLERR) { + int error; + socklen_t errorlen = sizeof(error); + int rv = getsockopt( + socket_.fd(), + SOL_SOCKET, + SO_ERROR, + reinterpret_cast(&error), + &errorlen); + if (rv == -1) { + setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); + } else { + setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); + } + return; + } + if (events & EPOLLIN) { + handleEventInFromLoop(); + return; + } + if (events & EPOLLOUT) { + handleEventOutFromLoop(); + return; + } + // Check for hangup last, as there could be cases where we get EPOLLHUP but + // there's still data to be read from the socket, so we want to deal with that + // before dealing with the hangup. + if (events & EPOLLHUP) { + setError(TP_CREATE_ERROR(EOFError)); + return; + } +} + +void ConnectionImpl::handleEventInFromLoop() { + TP_DCHECK(context_->inLoop()); + if (state_ == RECV_ADDR) { + struct FabricAddr addr; + + auto err = socket_.read(&addr.name, 64); + // Crossing our fingers that the exchange information is small enough that + // it can be read in a single chunk. + if (err != 64) { + setError(TP_CREATE_ERROR(ShortReadError, 64, err)); + return; + } + + peer_addr = endpoint->AddPeerAddr(&addr); + + // The connection is usable now. + state_ = ESTABLISHED; + processWriteOperationsFromLoop(); + // Trigger read operations in case a pair of local read() and remote + // write() happened before connection is established. Otherwise read() + // callback would lose if it's the only read() request. 
+ processReadOperationsFromLoop(); + return; + } + + if (state_ == ESTABLISHED) { + // We don't expect to read anything on this socket once the + // connection has been established. If we do, assume it's a + // zero-byte read indicating EOF. + setError(TP_CREATE_ERROR(EOFError)); + return; + } + + TP_THROW_ASSERT() << "EPOLLIN event not handled in state " << state_; +} + +void ConnectionImpl::handleEventOutFromLoop() { + TP_DCHECK(context_->inLoop()); + if (state_ == SEND_ADDR) { + FabricAddr addr = endpoint->fabric_ctx->addr; + + auto err = socket_.write(reinterpret_cast(&addr.name), 64); + // Crossing our fingers that the exchange information is small enough that + // it can be written in a single chunk. + if (err != 64) { + setError(TP_CREATE_ERROR(ShortWriteError, 64, err)); + return; + } + + // Sent our address. Wait for address from peer. + state_ = RECV_ADDR; + context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); + return; + } + + TP_THROW_ASSERT() << "EPOLLOUT event not handled in state " << state_; +} + +void ConnectionImpl::processReadOperationsFromLoop() { + TP_DCHECK(context_->inLoop()); + + // Process all read read operations that we can immediately serve, only + // when connection is established. 
+ if (state_ != ESTABLISHED) { + return; + } + + // pop out finished event at front + while (!readOperations_.empty()) { + EFAReadOperation& readOperation = readOperations_.front(); + if (readOperation.completeFromLoop()) { + readOperation.callbackFromLoop(Error::kSuccess); + readOperations_.pop_front(); + } else { + break; + } + } + + // Serve read operations + // while (!readOperations_.empty()) { + // EFAReadOperation& readOperation = readOperations_.front(); + // context_->getReactor().postRecv( + // &readOperation.readLength_, + // sizeof(size_t), + // kLength, + // peer_addr, + // 0xffffffff, // ignore lower bits for msg index + // &readOperation); + // } + + for (int i = 0; i < readOperations_.size(); i++) { + EFAReadOperation& readOperation = readOperations_[i]; + if (readOperation.mode_ == EFAReadOperation::Mode::READ_LENGTH) { + // context_->getReactor().; + context_->getReactor().postRecv( + &readOperation.readLength_, + sizeof(size_t), + kLength, + peer_addr, + 0xffffffff, // ignore lower bits for msg index + &readOperation); + readOperation.mode_ = EFAReadOperation::Mode::READ_PAYLOAD; + } else { + // if the operation is not READ_LENGTH, all operations back are all not + // READ_LENGTH, we can skip more checks + break; + } + } +} + +void ConnectionImpl::processWriteOperationsFromLoop() { + TP_DCHECK(context_->inLoop()); + + if (state_ != ESTABLISHED) { + return; + } + + while (!writeOperations_.empty()) { + EFAWriteOperation& writeOperation = writeOperations_.front(); + if (writeOperation.mode_ == EFAWriteOperation::Mode::COMPLETE) { + writeOperations_.pop_front(); + } else { + break; + } + } + + for (int i = 0; i < writeOperations_.size(); i++) { + EFAWriteOperation& writeOperation = writeOperations_[i]; + if (writeOperation.mode_ == EFAWriteOperation::Mode::WRITE_LENGTH) { + EFAWriteOperation::Buf* buf_array; + size_t size; + std::tie(buf_array, size) = writeOperation.getBufs(); + // auto size_buf = std::get<0>(writeOperation.getBufs()); + // auto 
payload_buf = std::get<1>(writeOperation.getBufs()); + context_->getReactor().postSend( + buf_array[0].base, + buf_array[0].len, + kLength | sendIdx, + peer_addr, + &writeOperation); + if (size > 1) { + context_->getReactor().postSend( + buf_array[1].base, + buf_array[1].len, + kPayload | sendIdx, + peer_addr, + &writeOperation); + } + sendIdx++; + writeOperation.mode_ = EFAWriteOperation::Mode::WAIT_TO_COMPLETE; + } else { + // if the operation is not WAIT_TO_SEND, all operations back are all not + // WAIT_TO_SEND, we can skip more checks + break; + } + } +} + +// void ConnectionImpl::onError(efaLib::wc_status status, uint64_t wrId) { +// TP_DCHECK(context_->inLoop()); +// // setError(TP_CREATE_ERROR( +// // efaError, +// context_->getReactor().getefaLib().wc_status_str(status))); +// // if (wrId == kWriteRequestId) { +// // onWriteCompleted(); +// // } else if (wrId == kAckRequestId) { +// // onAckCompleted(); +// // } +// } + +void ConnectionImpl::handleErrorImpl() { + for (auto& readOperation : readOperations_) { + readOperation.callbackFromLoop(error_); + } + readOperations_.clear(); + + for (auto& writeOperation : writeOperations_) { + writeOperation.callbackFromLoop(error_); + } + writeOperations_.clear(); + + tryCleanup(); + + if (socket_.hasValue()) { + if (state_ > INITIALIZING) { + context_->unregisterDescriptor(socket_.fd()); + } + socket_.reset(); + } + + context_->unenroll(*this); +} + +void ConnectionImpl::tryCleanup() { + TP_DCHECK(context_->inLoop()); + // Setting the queue pair to an error state will cause all its work requests + // (both those that had started being served, and those that hadn't; including + // those from a shared receive queue) to be flushed. We need to wait for the + // completion events of all those requests to be retrieved from the completion + // queue before we can destroy the queue pair. 
We can do so by deferring the + // destruction to the loop, since the reactor will only proceed to invoke + // deferred functions once it doesn't have any completion events to handle. + // However the RDMA writes and the sends may be queued up inside the reactor + // and thus may not have even been scheduled yet, so we explicitly wait for + // them to complete. + // if (error_) { + // if (numWritesInFlight_ == 0 && numAcksInFlight_ == 0) { + // TP_VLOG(8) << "Connection " << id_ << " is ready to clean up"; + // context_->deferToLoop([impl{shared_from_this()}]() { impl->cleanup(); + // }); + // } else { + // TP_VLOG(9) << "Connection " << id_ + // << " cannot proceed to cleanup because it has " + // << numWritesInFlight_ << " pending RDMA write requests and " + // << numAcksInFlight_ << " pending send requests on QP " + // << qp_->qp_num; + // } + // } +} + +void ConnectionImpl::cleanup() { + TP_DCHECK(context_->inLoop()); + TP_VLOG(8) << "Connection " << id_ << " is cleaning up"; + + // context_->getReactor().unregisterQp(qp_->qp_num); + + // qp_.reset(); + // inboxMr_.reset(); + // inboxBuf_.reset(); + // outboxMr_.reset(); + // outboxBuf_.reset(); +} + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h new file mode 100644 index 000000000..285e0a4a9 --- /dev/null +++ b/tensorpipe/transport/efa/connection_impl.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
// Connection object of the EFA transport.
//
// It plugs into ConnectionImplBoilerplate (which calls the *ImplFromLoop entry
// points declared below) and is also an EpollLoop::EventHandler, so it receives
// readiness events for its socket through handleEventsFromLoop().
//
// NOTE(review): in the copy of the patch this was reviewed from, explicit
// template arguments appear to be missing on several declarations below
// (std::shared_ptr, optional, std::deque). Confirm against the real file.
class ConnectionImpl final : public ConnectionImplBoilerplate<
                                 ContextImpl,
                                 ListenerImpl,
                                 ConnectionImpl>,
                             public EpollLoop::EventHandler {
// Disabled ring-buffer configuration.
// constexpr static size_t kBufferSize = 2 * 1024 * 1024;

// constexpr static int kNumOutboxRingbufferRoles = 3;
// using OutboxefaAcker = RingBufferRole;
// using OutboxefaWriter = RingBufferRole;
// using OutboxProducer = RingBufferRole;

// constexpr static int kNumInboxRingbufferRoles = 2;
// using InboxConsumer = RingBufferRole;
// using InboxefaRecver = RingBufferRole;

  // Handshake/lifecycle state of the connection. Values start at 1 and are
  // declared in the order INITIALIZING, SEND_ADDR, RECV_ADDR, ESTABLISHED.
  enum State {
    INITIALIZING = 1,
    SEND_ADDR,
    RECV_ADDR,
    ESTABLISHED,
  };

 public:
  // Create a connection that is already connected (e.g. from a listener).
  ConnectionImpl(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      Socket socket);

  // Create a connection that connects to the specified address.
  ConnectionImpl(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      std::string addr);

  // Implementation of EventHandler.
  void handleEventsFromLoop(int events) override;

  // Implementation of efaEventHandler (currently disabled).
// void onRemoteProducedData(uint32_t length) override;
// void onRemoteConsumedData(uint32_t length) override;
// void onWriteCompleted() override;
// void onAckCompleted() override;
  // void onError(efaLib::wc_status status, uint64_t wrId) override;

 protected:
  // Implement the entry points called by ConnectionImplBoilerplate.
  void initImplFromLoop() override;
  void readImplFromLoop(read_callback_fn fn) override;
  void readImplFromLoop(void* ptr, size_t length, read_callback_fn fn) override;
  void writeImplFromLoop(const void* ptr, size_t length, write_callback_fn fn)
      override;
  void handleErrorImpl() override;

 private:
  // Handle events of type EPOLLIN on the UNIX domain socket.
  //
  // The only data that is expected on that socket is the address and other
  // setup information for the other side's queue pair and inbox.
  void handleEventInFromLoop();

  // Handle events of type EPOLLOUT on the UNIX domain socket.
  //
  // Once the socket is writable we send the address and other setup information
  // for this side's queue pair and inbox.
  void handleEventOutFromLoop();

  State state_{INITIALIZING};
  Socket socket_;
  optional sockaddr_;
  std::shared_ptr endpoint;
  // NOTE(review): peer_addr, sendIdx and recvIdx have no in-class initializer;
  // confirm that both constructors set them before first use.
  fi_addr_t peer_addr;

// efaQueuePair qp_;
  uint32_t sendIdx, recvIdx;


  // Inbox.
  // Initialize header during construction because it isn't assignable.
// RingBufferHeader inboxHeader_{kBufferSize};
  // Use mmapped memory so it's page-aligned (and, one day, to use huge pages).
// MmappedPtr inboxBuf_;
// RingBuffer inboxRb_;
// efaMemoryRegion inboxMr_;

  // Outbox.
  // Initialize header during construction because it isn't assignable.
// RingBufferHeader outboxHeader_{kBufferSize};
  // Use mmapped memory so it's page-aligned (and, one day, to use huge pages).
// MmappedPtr outboxBuf_;
// RingBuffer outboxRb_;
// efaMemoryRegion outboxMr_;

  // Peer inbox key, pointer and head.
// uint32_t peerInboxKey_{0};
// uint64_t peerInboxPtr_{0};
// uint64_t peerInboxHead_{0};

  // The connection performs two types of send requests: writing to the remote
  // inbox, or acknowledging a write into its own inbox. These send operations
  // could be delayed and stalled by the reactor as only a limited number of
  // work requests can be outstanding at the same time globally. Thus we keep
  // count of how many we have pending to make sure they have all completed or
  // flushed when we close, and that none is stuck in the pipeline.
// uint32_t numWritesInFlight_{0};
// uint32_t numAcksInFlight_{0};

  // Pending read operations.
  std::deque readOperations_;

  // Pending write operations.
  std::deque writeOperations_;

  // Process pending read operations if in an operational state.
  //
  // This may be triggered by the other side of the connection (by pushing this
  // side's inbox token to the reactor) when it has written some new data to its
  // outbox (which is this side's inbox). It is also called by this connection
  // when it moves into an established state or when a new read operation is
  // queued, in case data was already available before this connection was ready
  // to consume it.
  void processReadOperationsFromLoop();

  // Process pending write operations if in an operational state.
  //
  // This may be triggered by the other side of the connection (by pushing this
  // side's outbox token to the reactor) when it has read some data from its
  // inbox (which is this side's outbox). This is important when some of this
  // side's writes couldn't complete because the outbox was full, and thus they
  // needed to wait for some of its data to be read. This method is also called
  // by this connection when it moves into an established state, in case some
  // writes were queued before the connection was ready to process them, or when
  // a new write operation is queued.
  void processWriteOperationsFromLoop();

  // Teardown helpers, both defined in connection_impl.cc.
  void tryCleanup();
  void cleanup();
};
namespace {

// Port number and GID index. Not user-configurable yet; both are pinned to the
// smallest value they may take so that they are always valid.
constexpr uint8_t kPortNum{1};
constexpr uint8_t kGlobalIdentifierIndex{0};

// FIXME The three limits below are hardcoded; they could instead be derived
// from the device's reported maxima (max_cqe, max_qp_wr, max_srq_wr).

// Number of receive requests kept posted at any time. The reactor tops the
// queue back up to this level as requests complete, so it only has to cover
// what can finish between two reactor iterations. Undershooting costs retries
// (performance), not correctness.
constexpr uint32_t kNumPendingRecvReqs{1024};

// Upper bound on simultaneously pending write requests across all
// connections. They share one fixed-capacity completion queue, and overrunning
// it is unrecoverable, hence the cap.
constexpr uint32_t kNumPendingWriteReqs{1024};

// Capacity requested for the completion queue: room for every pending receive
// plus every pending write.
constexpr int kCompletionQueueSize{
    static_cast<int>(kNumPendingRecvReqs + kNumPendingWriteReqs)};

// Maximum number of completions fetched from the completion queue per reactor
// iteration.
constexpr int kNumPolledWorkCompletions{64};

} // namespace
+ return kDomainDescriptorPrefix + "*"; +} + +} // namespace + +std::shared_ptr ContextImpl::create() { + Error error; + // efaLib efaLib; + // std::tie(error, efaLib) = efaLib::create(); + // if (error) { + // TP_VLOG(7) + // << "efa transport is not viable because libefaerbs couldn't be loaded: " + // << error.what(); + // return nullptr; + // } + + // efaDeviceList deviceList; + // std::tie(error, deviceList) = efaDeviceList::create(efaLib); + // if (error && error.isOfType() && + // error.castToType()->errorCode() == ENOSYS) { + // TP_VLOG(7) << "efa transport is not viable because it couldn't get list of " + // << "InfiniBand devices because the kernel module isn't loaded"; + // return nullptr; + // } + // TP_THROW_ASSERT_IF(error) + // << "Couldn't get list of InfiniBand devices: " << error.what(); + + // if (deviceList.size() == 0) { + // TP_VLOG(7) << "efa transport is not viable because it couldn't find any " + // << "InfiniBand NICs"; + // return nullptr; + // } + + // return std::make_shared( + // std::move(efaLib), std::move(deviceList)); + return std::make_shared(); +} + +// ContextImpl::ContextImpl(efaLib efaLib, efaDeviceList deviceList) +// : ContextImplBoilerplate( +// generateDomainDescriptor()), +// reactor_(std::move(efaLib), std::move(deviceList)) {} + +void ContextImpl::handleErrorImpl() { + loop_.close(); + reactor_.close(); +} + +void ContextImpl::joinImpl() { + loop_.join(); + reactor_.join(); +} + +bool ContextImpl::inLoop() const { + return reactor_.inLoop(); +}; + +void ContextImpl::deferToLoop(std::function fn) { + reactor_.deferToLoop(std::move(fn)); +}; + +void ContextImpl::registerDescriptor( + int fd, + int events, + std::shared_ptr h) { + loop_.registerDescriptor(fd, events, std::move(h)); +} + +void ContextImpl::unregisterDescriptor(int fd) { + loop_.unregisterDescriptor(fd); +} + +Reactor& ContextImpl::getReactor() { + return reactor_; +} + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git 
// Context of the EFA transport: owns the Reactor and the EpollLoop that the
// connections and listeners of this transport use.
//
// NOTE(review): explicit template arguments appear to be missing in the copy
// of the patch this was reviewed from (base class, std::shared_ptr,
// std::function). Confirm against the real file.
class ContextImpl final
    : public ContextImplBoilerplate {
 public:
  // Factory; see context_impl.cc.
  static std::shared_ptr create();

// ContextImpl(efaLib efaLib, efaDeviceList deviceList);
  // NOTE(review): the only constructor body in context_impl.cc is commented
  // out, so this declaration appears to lack a definition — confirm.
  ContextImpl();

  // Implement the DeferredExecutor interface.
  bool inLoop() const override;
  void deferToLoop(std::function fn) override;

  // Forwarded to the epoll loop: watch `fd` for `events` and deliver them to
  // `h`.
  void registerDescriptor(
      int fd,
      int events,
      std::shared_ptr h);

  // Forwarded to the epoll loop: stop watching `fd`.
  void unregisterDescriptor(int fd);

  Reactor& getReactor();

 protected:
  // Implement the entry points called by ContextImplBoilerplate.
  void handleErrorImpl() override;
  void joinImpl() override;

 private:
  // Declaration order matters: loop_ is initialized from reactor_, and members
  // are initialized in declaration order, so reactor_ must stay first.
  Reactor reactor_;
  EpollLoop loop_{this->reactor_};
};
// ---- error.cc: out-of-line what() definitions ----

// Returns the message captured at construction, unchanged.
std::string efaError::what() const {
  return error_;
}

// Formats the stored getaddrinfo() error code as "getaddrinfo: <text>", where
// the text comes from gai_strerror().
std::string GetaddrinfoError::what() const {
  std::ostringstream ss;
  ss << "getaddrinfo: " << gai_strerror(error_);
  return ss.str();
}

// Fixed message; this error carries no payload.
std::string NoAddrFoundError::what() const {
  return "no address found";
}

// ---- error.h: error types of the EFA transport ----

// Error carrying a free-form message.
class efaError final : public BaseError {
 public:
  explicit efaError(std::string error) : error_(error) {}

  std::string what() const override;

 private:
  std::string error_;
};

// Error wrapping an integer code that what() passes to gai_strerror().
class GetaddrinfoError final : public BaseError {
 public:
  explicit GetaddrinfoError(int error) : error_(error) {}

  std::string what() const override;

 private:
  int error_;
};

// Error whose what() is the fixed string "no address found".
class NoAddrFoundError final : public BaseError {
 public:
  NoAddrFoundError() {}

  std::string what() const override;
};
// Public factory of the EFA transport (declared in factory.h): builds the
// boilerplate context wrapper and returns it as a shared pointer.
//
// NOTE(review): the template arguments of std::shared_ptr and
// ContextBoilerplate appear to be missing in the copy of the patch this was
// reviewed from. Confirm against the real file.
std::shared_ptr create() {
  return std::make_shared<
      ContextBoilerplate>();
}
+ */ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorpipe { +namespace transport { +namespace efa { + +ListenerImpl::ListenerImpl( + ConstructorToken token, + std::shared_ptr context, + std::string id, + std::string addr) + : ListenerImplBoilerplate( + token, + std::move(context), + std::move(id)), + sockaddr_(Sockaddr::createInetSockAddr(addr)) {} + +void ListenerImpl::initImplFromLoop() { + context_->enroll(*this); + + Error error; + TP_DCHECK(!socket_.hasValue()); + std::tie(error, socket_) = + Socket::createForFamily(sockaddr_.addr()->sa_family); + if (error) { + setError(std::move(error)); + return; + } + error = socket_.reuseAddr(true); + if (error) { + setError(std::move(error)); + return; + } + error = socket_.bind(sockaddr_); + if (error) { + setError(std::move(error)); + return; + } + error = socket_.block(false); + if (error) { + setError(std::move(error)); + return; + } + error = socket_.listen(128); + if (error) { + setError(std::move(error)); + return; + } +} + +void ListenerImpl::handleErrorImpl() { + if (!fns_.empty()) { + context_->unregisterDescriptor(socket_.fd()); + } + socket_.reset(); + for (auto& fn : fns_) { + fn(error_, std::shared_ptr()); + } + fns_.clear(); + + context_->unenroll(*this); +} + +void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { + fns_.push_back(std::move(fn)); + + // Only register if we go from 0 to 1 pending callbacks. In other cases we + // already had a pending callback and thus we were already registered. + if (fns_.size() == 1) { + // Register with loop for readability events. 
+ context_->registerDescriptor(socket_.fd(), EPOLLIN, shared_from_this()); + } +} + +std::string ListenerImpl::addrImplFromLoop() const { + struct sockaddr_storage ss; + struct sockaddr* addr = reinterpret_cast(&ss); + socklen_t addrlen = sizeof(ss); + int rv = getsockname(socket_.fd(), addr, &addrlen); + TP_THROW_SYSTEM_IF(rv < 0, errno); + return Sockaddr(addr, addrlen).str(); +} + +void ListenerImpl::handleEventsFromLoop(int events) { + TP_DCHECK(context_->inLoop()); + TP_VLOG(9) << "Listener " << id_ << " is handling an event on its socket (" + << EpollLoop::formatEpollEvents(events) << ")"; + + if (events & EPOLLERR) { + int error; + socklen_t errorlen = sizeof(error); + int rv = getsockopt( + socket_.fd(), + SOL_SOCKET, + SO_ERROR, + reinterpret_cast(&error), + &errorlen); + if (rv == -1) { + setError(TP_CREATE_ERROR(SystemError, "getsockopt", rv)); + } else { + setError(TP_CREATE_ERROR(SystemError, "async error on socket", error)); + } + return; + } + if (events & EPOLLHUP) { + setError(TP_CREATE_ERROR(EOFError)); + return; + } + TP_ARG_CHECK_EQ(events, EPOLLIN); + + Error error; + Socket socket; + std::tie(error, socket) = socket_.accept(); + if (error) { + setError(std::move(error)); + return; + } + + TP_DCHECK(!fns_.empty()) + << "when the callback is disarmed the listener's descriptor is supposed " + << "to be unregistered"; + auto fn = std::move(fns_.front()); + fns_.pop_front(); + if (fns_.empty()) { + context_->unregisterDescriptor(socket_.fd()); + } + fn(Error::kSuccess, createAndInitConnection(std::move(socket))); +} + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/listener_impl.h b/tensorpipe/transport/efa/listener_impl.h new file mode 100644 index 000000000..a85e6e44a --- /dev/null +++ b/tensorpipe/transport/efa/listener_impl.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. 
// Listener of the EFA transport.
//
// It plugs into ListenerImplBoilerplate (which calls the *ImplFromLoop entry
// points below) and is an EpollLoop::EventHandler so that it is notified when
// its listening socket becomes readable.
//
// NOTE(review): explicit template arguments appear to be missing in the copy
// of the patch this was reviewed from (base class, std::shared_ptr,
// std::deque). Confirm against the real file.
class ListenerImpl final
    : public ListenerImplBoilerplate,
      public EpollLoop::EventHandler {
 public:
  // Create a listener that listens on the specified address.
  ListenerImpl(
      ConstructorToken token,
      std::shared_ptr context,
      std::string id,
      std::string addr);

  // Implementation of EventHandler.
  void handleEventsFromLoop(int events) override;

 protected:
  // Implement the entry points called by ListenerImplBoilerplate.
  void initImplFromLoop() override;
  void acceptImplFromLoop(accept_callback_fn fn) override;
  std::string addrImplFromLoop() const override;
  void handleErrorImpl() override;

 private:
  // Listening socket; created in initImplFromLoop().
  Socket socket_;
  // Address to bind to, parsed from the constructor's `addr` argument.
  Sockaddr sockaddr_;
  // Pending accept callbacks, oldest first. The socket is registered with the
  // epoll loop exactly while this queue is non-empty.
  std::deque fns_;
};
# return code + if (rc == 0) + break; # send succeed + else if (rc == -FI_EAGAIN) { + # This is a retryable error + # Can attempt to progress the completion queue to make send event happen + /* + * Process completions so that you have enough + * resources for sending connect message + */ + ret = nccl_ofi_progress(nccl_ofi_component[dev]); + if (OFI_UNLIKELY(ret != 0)) + goto error; + } + else { + NCCL_OFI_WARN("Unable to send connect message for dev %d. RC: %zd, ERROR: %s", + dev, rc, fi_strerror(-rc)); + ret = ncclSystemError; + goto error; + } +} while (true); +``` +This part is a bit different from efa: pushing a send event to the event queue may fail, in which case it needs to be retried. + +2. Progress the completion queue +```Cpp +do { + ret = nccl_ofi_progress(nccl_ofi_component[dev]); + if (OFI_UNLIKELY(ret != 0)) + goto error; +} while (true); +``` + +The receive process is the same as the send process. + +Some design questions I'd like suggestions on: +1. Since the memory doesn't need to be pinned/registered, do I still need the RingBuffer-related classes for EFA? +2. How should I arrange the event loop? Previously I used a busy-polling thread to progress the completion queue, and retried pushing the send event to the queue directly in the main thread. Is this good practice in tensorpipe? + \ No newline at end of file diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc new file mode 100644 index 000000000..c70e2ea60 --- /dev/null +++ b/tensorpipe/transport/efa/reactor.cc @@ -0,0 +1,265 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include + +#include +#include +#include + +namespace tensorpipe { +namespace transport { +namespace efa { + +Reactor::Reactor() { + postRecvRequests(kNumPendingRecvReqs); + startThread("TP_efa_reactor"); +} + +void Reactor::postSend( + void* buffer, + size_t size, + uint64_t tag, + fi_addr_t peer_addr, + void* context) { + // First try send all messages in pending queue + while (!pendingSends_.empty()) { + EFASendEvent sevent = pendingSends_.front(); + int ret = + this->endpoint->PushSendEvent(sevent.buffer, sevent.size, sevent.tag, sevent.peer_addr, sevent.context); + if (ret == 0) { + // Send successfully, pop out events + pendingSends_.pop_front(); + } else if (ret == -FI_EAGAIN) { + // Event queue is full now, push the event into pending queue and return + pendingSends_.push_back({buffer, size, tag, peer_addr, context}); + return; + } else if (ret < 0) { + // Unknown failure, raise exception + TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); + } + } + + // No pending events, send out directly + int ret = + this->endpoint->PushSendEvent(buffer, size, tag, peer_addr, context); + if (ret == 0) { + // Send successfully + return; + } else if (ret == -FI_EAGAIN) { + // Event queue is full now, push the event into pending queue and return + pendingSends_.push_back({buffer, size, tag, peer_addr, context}); + return; + } else if (ret < 0) { + TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); + } +} + +int Reactor::postPendingSends() { + while (!pendingSends_.empty()) { + EFASendEvent sevent = pendingSends_.front(); + int ret = this->endpoint->PushSendEvent( + sevent.buffer, + sevent.size, + sevent.tag, + sevent.peer_addr, + sevent.context); // ignore low 32 bits on tag matching + if (ret == 0) { + // Send successfully, pop out events + pendingSends_.pop_front(); + } else if (ret == -FI_EAGAIN) { + return pendingSends_.size(); + } else if (ret < 0) { + // Unknown failure, raise exception + TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); + } 
+ return 0; + } +} + +int Reactor::postPendingRecvs() { + while (!pendingRecvs_.empty()) { + EFARecvEvent revent = pendingRecvs_.front(); + int ret = this->endpoint->PushRecvEvent( + revent.buffer, + revent.size, + revent.tag, + revent.peer_addr, + revent.ignore, + revent.context); // ignore low 32 bits on tag matching + if (ret == 0) { + // Send successfully, pop out events + pendingRecvs_.pop_front(); + } else if (ret == -FI_EAGAIN) { + return pendingRecvs_.size(); + } else if (ret < 0) { + // Unknown failure, raise exception + TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); + } + } + return 0; +} + +void Reactor::postRecv( + void* buffer, + size_t size, + uint64_t tag, + fi_addr_t dest_addr, + uint64_t ignore, + void* context) { + // First try send all messages in pending queue + int pendingRecvNum = postPendingRecvs(); + if (pendingRecvNum == 0){ + // No pending events, send out directly + int ret = + this->endpoint->PushRecvEvent(buffer, size, tag, dest_addr, ignore, context); + if (ret == 0) { + // Send successfully + return; + } else if (ret == -FI_EAGAIN) { + // Event queue is full now, push the event into pending queue and return + pendingRecvs_.push_back({buffer, size, tag, dest_addr, ignore, context}); + return; + } else if (ret < 0) { + TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); + } + } else { + pendingRecvs_.push_back({buffer, size, tag, dest_addr, ignore, context}); + return; + } + +} + +// void Reactor::postRecvRequests(int num) { +// while (num > 0) { +// efaLib::recv_wr* badRecvWr = nullptr; +// std::array wrs; +// std::memset(wrs.data(), 0, sizeof(wrs)); +// for (int i = 0; i < std::min(num, kNumPolledWorkCompletions) - 1; i++) { +// wrs[i].next = &wrs[i + 1]; +// } +// int rv = getefaLib().post_srq_recv(srq_.get(), wrs.data(), &badRecvWr); +// TP_THROW_SYSTEM_IF(rv != 0, errno); +// TP_THROW_ASSERT_IF(badRecvWr != nullptr); +// num -= std::min(num, kNumPolledWorkCompletions); +// } +// } + +void Reactor::setId(std::string id) 
{ + id_ = std::move(id); +} + +void Reactor::close() { + if (!closed_.exchange(true)) { + stopBusyPolling(); + } +} + +void Reactor::join() { + close(); + + if (!joined_.exchange(true)) { + joinThread(); + } +} + +Reactor::~Reactor() { + join(); +} + +// void Reactor::postRecvRequests(int num){ +// uint64_t size_buffer; +// int ret = endpoint->PushRecvEvent(&size_buffer, sizeof(uint64_t), kLength, +// FI_ADDR_UNSPEC); + +// } + +bool Reactor::pollOnce() { + std::array cq_entries; + std::array src_addrs; + auto rv = + endpoint->PollCQ(cq_entries.data(), src_addrs.data(), cq_entries.size()); + + if (rv == 0) { + return false; + } + TP_THROW_SYSTEM_IF(rv < 0, errno); + + int numRecvs = 0; + int numWrites = 0; + int numAcks = 0; + for (int cqIdx = 0; cqIdx < rv; cqIdx++) { + struct fi_cq_tagged_entry& cq = cq_entries[cqIdx]; + fi_addr_t& src_addr = src_addrs[cqIdx]; + uint32_t msg_idx = static_cast(cq.tag); + if (cq.flags && FI_SEND) { + // Send event + if (cq.flags && kLength) { + // Send size finished, check whether it's zero sized message + auto* operation_ptr = static_cast(cq.op_context); + if (operation_ptr->length_ == 0){ + operation_ptr->mode_ = EFAWriteOperation::Mode::COMPLETE; + operation_ptr->callbackFromLoop(Error::kSuccess); + } + } else if (cq.flags && kPayload) { + auto* operation_ptr = static_cast(cq.op_context); + operation_ptr->mode_ = EFAWriteOperation::Mode::COMPLETE; + operation_ptr->callbackFromLoop(Error::kSuccess); + } + } else if (cq.flags && FI_RECV) { + // Receive event + // auto iter = efaEventHandler_.find(src_addr); + if (cq.tag && kLength) { + // Received length information + auto* operation_ptr = static_cast(cq.op_context); + operation_ptr->mode_ = EFAReadOperation::Mode::READ_PAYLOAD; + operation_ptr->allocFromLoop(); + // postRecv() + // void* buffer = operation_ptr->perpareBuffer(); + postRecv( + operation_ptr->ptr_, + operation_ptr->readLength_, + kPayload | msg_idx, + src_addr, + 0, // Exact match of tag + operation_ptr); + // 
iter->second->onRecvLength(msg_idx); + } else if (cq.tag && kPayload) { + // Received payload + auto* operation_ptr = static_cast(cq.op_context); + operation_ptr->mode_ = EFAReadOperation::Mode::COMPLETE; + operation_ptr->callbackFromLoop(Error::kSuccess); + } + } + // auto iter = queuePairEventHandler_.find(wc.qp_num); + // TP_THROW_ASSERT_IF(iter == queuePairEventHandler_.end()) + // << "Got work completion for unknown queue pair " << wc.qp_num; + + // if (wc.status != efaLib::WC_SUCCESS) { + // iter->second->onError(wc.status, wc.wr_id); + // continue; + // } + } + + return true; +} + +bool Reactor::readyToClose() { + return true; + // return queuePairEventHandler_.size() == 0; +} + +// void Reactor::registerQp( +// uint32_t qpn, +// std::shared_ptr eventHandler) { +// queuePairEventHandler_.emplace(qpn, std::move(eventHandler)); +// } + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/reactor.h b/tensorpipe/transport/efa/reactor.h new file mode 100644 index 000000000..c6c2130da --- /dev/null +++ b/tensorpipe/transport/efa/reactor.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
// Bits OR-ed into the 64-bit tag of a message to tell its two parts apart.
// They sit above the low 32 bits, which reactor.cc reads as the message index.
enum efaTag: uint64_t{
  kLength = 1ULL << 32,
  kPayload = 1ULL << 33,
};

// Reactor loop.
//
// A BusyPollingLoop whose pollOnce() (see reactor.cc) polls the endpoint's
// completion queue and advances the read/write operation attached to each
// completion. Sends and receives that cannot be posted immediately are kept in
// pendingSends_ / pendingRecvs_ and retried later.
//
// NOTE(review): the comment inherited here described a shared-memory ring
// buffer with a mutex and condition variable; nothing in this class matches
// that description, so it has been replaced.
//
// NOTE(review): explicit template arguments appear to be missing in the copy
// of the patch this was reviewed from (std::shared_ptr, std::atomic,
// std::array, std::deque). Confirm against the real file.
class Reactor final : public BusyPollingLoop {
 public:
  // Reactor(efaLib efaLib, efaDeviceList deviceList);
  Reactor();

  // const efaLib& getefaLib() {
  //   return efaLib_;
  // }

  // efaProtectionDomain& getefaPd() {
  //   return pd_;
  // }

  // efaCompletionQueue& getefaCq() {
  //   return cq_;
  // }

  // efaSharedReceiveQueue& getefaSrq() {
  //   return srq_;
  // }

  // const efaAddress& getefaAddress() {
  //   return addr_;
  // }

  // void registerQp(uint32_t qpn, std::shared_ptr eventHandler);

  // void unregisterQp(uint32_t qpn);

  // Posts a tagged send of `size` bytes from `buffer` to `peer_addr`.
  // `context` is handed back through the completion entry (op_context).
  void postSend(void* buffer, size_t size, uint64_t tag, fi_addr_t peer_addr, void* context);

  // Posts a tagged receive into `buffer`. `ignore` is forwarded to the
  // endpoint together with `tag`; `context` is handed back through the
  // completion entry (op_context).
  void postRecv(void* buffer, size_t size, uint64_t tag, fi_addr_t peer_addr, uint64_t ignore, void* context);
  // void postAck(efaQueuePair& qp, efaLib::send_wr& wr);

  void setId(std::string id);

  void close();

  void join();

  ~Reactor();

 protected:
  // BusyPollingLoop hooks.
  bool pollOnce() override;

  bool readyToClose() override;

  // Arguments of a send that could not be posted yet.
  struct EFASendEvent{
    void* buffer;
    size_t size;
    uint64_t tag;
    fi_addr_t peer_addr;
    void* context;
  };

  // Arguments of a receive that could not be posted yet.
  struct EFARecvEvent{
    void* buffer;
    size_t size;
    uint64_t tag;
    fi_addr_t peer_addr;
    uint64_t ignore;
    void* context;
  };
 private:
  // InfiniBand stuff
  // const efaLib efaLib_;
  // efaContext ctx_;
  // efaProtectionDomain pd_;
  // efaCompletionQueue cq_;
  // efaSharedReceiveQueue srq_;
  // efaAddress addr_;

  // Retry the queued receives/sends in FIFO order; see reactor.cc for the
  // return convention.
  int postPendingRecvs();
  int postPendingSends();

  // NOTE(review): no initializer here and none visible in reactor.cc; confirm
  // where the endpoint is created.
  std::shared_ptr endpoint;

  // NOTE(review): called from the constructor, but both candidate bodies in
  // reactor.cc are commented out.
  void postRecvRequests(int num);

  // Make close() and join() take effect only once.
  std::atomic closed_{false};
  std::atomic joined_{false};

  std::array size_buffer;

  // An identifier for the context, composed of the identifier for the context,
  // combined with the transport's name. It will only be used for logging and
  // debugging purposes.
  std::string id_{"N/A"};

  // The registered event handlers for each queue pair.
  // std::unordered_map>
  // efaEventHandler_;

  // uint32_t numAvailableWrites_{kNumPendingWriteReqs};
  // uint32_t numAvailableAcks_{kNumPendingAckReqs};

  // Requests waiting to be (re)posted, oldest first.
  std::deque pendingSends_;
  std::deque pendingRecvs_;
  // std::deque> pendingQpAcks_;
};
+ if (!portStr.empty()) { + port = std::stoi(portStr); + if (port < 0 || port > std::numeric_limits::max()) { + TP_THROW_EINVAL() << str; + } + } + + // Try to convert an IPv4 address. + { + struct sockaddr_in addr; + std::memset(&addr, 0, sizeof(addr)); + auto rv = inet_pton(AF_INET, addrStr.c_str(), &addr.sin_addr); + TP_THROW_SYSTEM_IF(rv < 0, errno); + if (rv == 1) { + addr.sin_family = AF_INET; + addr.sin_port = ntohs(port); + return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); + } + } + + // Try to convert an IPv6 address. + { + struct sockaddr_in6 addr; + std::memset(&addr, 0, sizeof(addr)); + + auto interfacePos = addrStr.find('%'); + if (interfacePos != std::string::npos) { + addr.sin6_scope_id = + if_nametoindex(addrStr.substr(interfacePos + 1).c_str()); + addrStr = addrStr.substr(0, interfacePos); + } + + auto rv = inet_pton(AF_INET6, addrStr.c_str(), &addr.sin6_addr); + TP_THROW_SYSTEM_IF(rv < 0, errno); + if (rv == 1) { + addr.sin6_family = AF_INET6; + addr.sin6_port = ntohs(port); + return Sockaddr(reinterpret_cast(&addr), sizeof(addr)); + } + } + + // Invalid address. + TP_THROW_EINVAL() << str; + + // Return bogus to silence "return from non-void function" warning. + // Note: we don't reach this point per the throw above. 
+ return Sockaddr(nullptr, 0); +} + +std::string Sockaddr::str() const { + std::ostringstream oss; + + if (addr_.ss_family == AF_INET) { + std::array buf; + auto in = reinterpret_cast(&addr_); + auto rv = inet_ntop(AF_INET, &in->sin_addr, buf.data(), buf.size()); + TP_THROW_SYSTEM_IF(rv == nullptr, errno); + oss << buf.data() << ":" << htons(in->sin_port); + } else if (addr_.ss_family == AF_INET6) { + std::array buf; + auto in6 = reinterpret_cast(&addr_); + auto rv = inet_ntop(AF_INET6, &in6->sin6_addr, buf.data(), buf.size()); + TP_THROW_SYSTEM_IF(rv == nullptr, errno); + oss << "[" << buf.data(); + if (in6->sin6_scope_id > 0) { + std::array scopeBuf; + rv = if_indextoname(in6->sin6_scope_id, scopeBuf.data()); + TP_THROW_SYSTEM_IF(rv == nullptr, errno); + oss << "%" << scopeBuf.data(); + } + oss << "]:" << htons(in6->sin6_port); + + } else { + TP_THROW_EINVAL() << "invalid address family: " << addr_.ss_family; + } + + return oss.str(); +} + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/sockaddr.h b/tensorpipe/transport/efa/sockaddr.h new file mode 100644 index 000000000..cb6bbfd07 --- /dev/null +++ b/tensorpipe/transport/efa/sockaddr.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace efa { + +class Sockaddr final : public tensorpipe::Sockaddr { + public: + static Sockaddr createInetSockAddr(const std::string& str); + + Sockaddr(const struct sockaddr* addr, socklen_t addrlen) { + TP_ARG_CHECK(addr != nullptr); + TP_ARG_CHECK_LE(addrlen, sizeof(addr_)); + // Ensure the sockaddr_storage is zeroed, because we don't always + // write to all fields in the `sockaddr_[in|in6]` structures. 
+ std::memset(&addr_, 0, sizeof(addr_)); + std::memcpy(&addr_, addr, addrlen); + addrlen_ = addrlen; + } + + inline const struct sockaddr* addr() const override { + return reinterpret_cast(&addr_); + } + + inline struct sockaddr* addr() { + return reinterpret_cast(&addr_); + } + + inline socklen_t addrlen() const override { + return addrlen_; + } + + std::string str() const; + + private: + struct sockaddr_storage addr_; + socklen_t addrlen_; +}; + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/utility.cc b/tensorpipe/transport/efa/utility.cc new file mode 100644 index 000000000..8df5572e5 --- /dev/null +++ b/tensorpipe/transport/efa/utility.cc @@ -0,0 +1,178 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace tensorpipe { +namespace transport { +namespace efa { + +namespace { + +struct InterfaceAddressesDeleter { + void operator()(struct ifaddrs* ptr) { + ::freeifaddrs(ptr); + } +}; + +using InterfaceAddresses = + std::unique_ptr; + +std::tuple createInterfaceAddresses() { + struct ifaddrs* ifaddrs; + auto rv = ::getifaddrs(&ifaddrs); + if (rv < 0) { + return std::make_tuple( + TP_CREATE_ERROR(SystemError, "getifaddrs", errno), + InterfaceAddresses()); + } + return std::make_tuple(Error::kSuccess, InterfaceAddresses(ifaddrs)); +} + +std::tuple getHostname() { + std::array hostname; + auto rv = ::gethostname(hostname.data(), hostname.size()); + if (rv < 0) { + return std::make_tuple( + TP_CREATE_ERROR(SystemError, "gethostname", errno), std::string()); + } + return std::make_tuple(Error::kSuccess, 
std::string(hostname.data())); +} + +struct AddressInfoDeleter { + void operator()(struct addrinfo* ptr) { + ::freeaddrinfo(ptr); + } +}; + +using AddressInfo = std::unique_ptr; + +std::tuple createAddressInfo(std::string host) { + struct addrinfo hints; + std::memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + struct addrinfo* result; + auto rv = ::getaddrinfo(host.c_str(), nullptr, &hints, &result); + if (rv != 0) { + return std::make_tuple( + TP_CREATE_ERROR(GetaddrinfoError, rv), AddressInfo()); + } + return std::make_tuple(Error::kSuccess, AddressInfo(result)); +} + +} // namespace + +std::tuple lookupAddrForIface(std::string iface) { + Error error; + InterfaceAddresses addresses; + std::tie(error, addresses) = createInterfaceAddresses(); + if (error) { + return std::make_tuple(std::move(error), std::string()); + } + + struct ifaddrs* ifa; + for (ifa = addresses.get(); ifa != nullptr; ifa = ifa->ifa_next) { + // Skip entry if ifa_addr is NULL (see getifaddrs(3)) + if (ifa->ifa_addr == nullptr) { + continue; + } + + if (iface != ifa->ifa_name) { + continue; + } + + switch (ifa->ifa_addr->sa_family) { + case AF_INET: + return std::make_tuple( + Error::kSuccess, + Sockaddr(ifa->ifa_addr, sizeof(struct sockaddr_in)).str()); + case AF_INET6: + return std::make_tuple( + Error::kSuccess, + Sockaddr(ifa->ifa_addr, sizeof(struct sockaddr_in6)).str()); + } + } + + return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); +} + +std::tuple lookupAddrForHostname() { + Error error; + std::string hostname; + std::tie(error, hostname) = getHostname(); + if (error) { + return std::make_tuple(std::move(error), std::string()); + } + + AddressInfo info; + std::tie(error, info) = createAddressInfo(std::move(hostname)); + if (error) { + return std::make_tuple(std::move(error), std::string()); + } + + Error firstError; + for (struct addrinfo* rp = info.get(); rp != nullptr; rp = 
rp->ai_next) { + TP_DCHECK(rp->ai_family == AF_INET || rp->ai_family == AF_INET6); + TP_DCHECK_EQ(rp->ai_socktype, SOCK_STREAM); + TP_DCHECK_EQ(rp->ai_protocol, IPPROTO_TCP); + + Sockaddr addr = Sockaddr(rp->ai_addr, rp->ai_addrlen); + + Socket socket; + std::tie(error, socket) = Socket::createForFamily(rp->ai_family); + + if (!error) { + error = socket.bind(addr); + } + + if (error) { + // Record the first binding error we encounter and return that in the end + // if no working address is found, in order to help with debugging. + if (!firstError) { + firstError = error; + } + continue; + } + + return std::make_tuple(Error::kSuccess, addr.str()); + } + + if (firstError) { + return std::make_tuple(std::move(firstError), std::string()); + } else { + return std::make_tuple(TP_CREATE_ERROR(NoAddrFoundError), std::string()); + } +} + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/utility.h b/tensorpipe/transport/efa/utility.h new file mode 100644 index 000000000..e4ec1a4de --- /dev/null +++ b/tensorpipe/transport/efa/utility.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include + +namespace tensorpipe { +namespace transport { +namespace efa { + +std::tuple lookupAddrForIface(std::string iface); + +std::tuple lookupAddrForHostname(); + +} // namespace efa +} // namespace transport +} // namespace tensorpipe diff --git a/third_party/libfabric b/third_party/libfabric new file mode 160000 index 000000000..4c47e0b0c --- /dev/null +++ b/third_party/libfabric @@ -0,0 +1 @@ +Subproject commit 4c47e0b0cf92bec1fc9003ac046612cd05490992 From 6cf90ef01c7db97c810d8c1a66c150726ad89ca5 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Sun, 29 Aug 2021 16:39:05 +0100 Subject: [PATCH 02/19] fix --- tensorpipe/common/{fabric.cc => efa.cc} | 30 +++++++++-- tensorpipe/common/{fabric.h => efa.h} | 11 +++- tensorpipe/common/efa_lib.h | 60 +++++++++++++++++++++ tensorpipe/transport/efa/connection_impl.cc | 2 +- tensorpipe/transport/efa/connection_impl.h | 2 +- tensorpipe/transport/efa/context_impl.cc | 26 +++++---- tensorpipe/transport/efa/reactor.cc | 3 +- tensorpipe/transport/efa/reactor.h | 4 +- 8 files changed, 117 insertions(+), 21 deletions(-) rename tensorpipe/common/{fabric.cc => efa.cc} (79%) rename tensorpipe/common/{fabric.h => efa.h} (96%) create mode 100644 tensorpipe/common/efa_lib.h diff --git a/tensorpipe/common/fabric.cc b/tensorpipe/common/efa.cc similarity index 79% rename from tensorpipe/common/fabric.cc rename to tensorpipe/common/efa.cc index a6e8f9985..82f12c514 100644 --- a/tensorpipe/common/fabric.cc +++ b/tensorpipe/common/efa.cc @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include #include #include @@ -92,11 +92,31 @@ UniqueFabricPtr FabricContext::getFabricInfo(){ struct fi_info* info_; int ret = fi_getinfo(FABRIC_VERSION, nullptr, nullptr, 0, hints.get(), &info_); - info.reset(info_); - TP_THROW_ASSERT() << "Could not find any optimal provider. Return Code: " - << ret << ". 
ERROR: " << fi_strerror(-ret); + info.reset(info_); + TP_CHECK_EFA_RET(ret, "fi_getinfo failed"); + // TP_THROW_ASSERT() << "Could not find any optimal provider. Return Code: " + // << ret << ". ERROR: " << fi_strerror(-ret); return info; - // TP_CHECK_EFA_RET(ret, "fi_getinfo failed"); +} + +bool FabricEndpoint::isEfaAvailable(){ + UniqueFabricPtr hints(fi_allocinfo()); + hints->mode = FI_CONTEXT; + hints->ep_attr->type = FI_EP_RDM; // Reliable Datagram + hints->caps = FI_TAGGED | FI_MSG | FI_REMOTE_COMM | FI_DIRECTED_RECV | FI_LOCAL_COMM | FI_SOURCE; + hints->tx_attr->msg_order = FI_ORDER_SAS; + hints->rx_attr->msg_order = FI_ORDER_SAS; + hints->domain_attr->control_progress = FI_PROGRESS_AUTO; + hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + hints->domain_attr->caps = + FI_LOCAL_COMM | FI_REMOTE_COMM; // Enable local loopback + hints->domain_attr->av_type = FI_AV_TABLE; + hints->fabric_attr->prov_name = strdup("efa"); + UniqueFabricPtr info; + struct fi_info* info_; + int ret = + fi_getinfo(FABRIC_VERSION, nullptr, nullptr, 0, hints.get(), &info_); + return info_ == nullptr; } FabricEndpoint::FabricEndpoint(){ diff --git a/tensorpipe/common/fabric.h b/tensorpipe/common/efa.h similarity index 96% rename from tensorpipe/common/fabric.h rename to tensorpipe/common/efa.h index 6d46319e7..78e306022 100644 --- a/tensorpipe/common/fabric.h +++ b/tensorpipe/common/efa.h @@ -6,7 +6,9 @@ * LICENSE file in the root directory of this source tree. 
*/ -#pragma once +#ifndef COMMON_EFA_H_ +#define COMMON_EFA_H_ + #include #include @@ -140,7 +142,12 @@ class FabricEndpoint { int PollCQ(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count); + static bool isEfaAvailable(); + // Fabric Context contains everything std::unique_ptr fabric_ctx; }; -} // namespace tensorpipe \ No newline at end of file +} // namespace tensorpipe + + +#endif // COMMON_EFA_H_ diff --git a/tensorpipe/common/efa_lib.h b/tensorpipe/common/efa_lib.h new file mode 100644 index 000000000..1e299e024 --- /dev/null +++ b/tensorpipe/common/efa_lib.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include + +namespace tensorpipe { + +// Wrapper for libibverbs. + +class EfaLib { + public: + private: + explicit EfaLib(DynamicLibraryHandle dlhandle) + : dlhandle_(std::move(dlhandle)) {} + + DynamicLibraryHandle dlhandle_; + public: + EfaLib() = default; + + static std::tuple create() { + Error error; + DynamicLibraryHandle dlhandle; + // To keep things "neat" and contained, we open in "local" mode (as opposed + // to global) so that the ibverbs symbols can only be resolved through this + // handle and are not exposed (a.k.a., "leaded") to other shared objects. + std::tie(error, dlhandle) = + DynamicLibraryHandle::create("libfabric.so", RTLD_LOCAL | RTLD_LAZY); + if (error) { + return std::make_tuple(std::move(error), EfaLib()); + } + // Log at level 9 as we can't know whether this will be used in a transport + // or channel, thus err on the side of this being as low-level as possible + // because we don't expect this to be of interest that often. 
+ TP_VLOG(9) << [&]() -> std::string { + std::string filename; + std::tie(error, filename) = dlhandle.getFilename(); + if (error) { + return "Couldn't determine location of shared library libfabric.so: " + + error.what(); + } + return "Found shared library libfabric.so at " + filename; + }(); + EfaLib lib(std::move(dlhandle)); + return std::make_tuple(Error::kSuccess, std::move(lib)); + } + +}; + + +} // namespace tensorpipe diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index 1f809afec..77fcea6c5 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h index 285e0a4a9..aabdb4d4b 100644 --- a/tensorpipe/transport/efa/connection_impl.h +++ b/tensorpipe/transport/efa/connection_impl.h @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/tensorpipe/transport/efa/context_impl.cc b/tensorpipe/transport/efa/context_impl.cc index 3e35a2245..803f36816 100644 --- a/tensorpipe/transport/efa/context_impl.cc +++ b/tensorpipe/transport/efa/context_impl.cc @@ -7,7 +7,8 @@ */ #include - +#include +#include #include #include @@ -35,14 +36,21 @@ std::string generateDomainDescriptor() { std::shared_ptr ContextImpl::create() { Error error; - // efaLib efaLib; - // std::tie(error, efaLib) = efaLib::create(); - // if (error) { - // TP_VLOG(7) - // << "efa transport is not viable because libefaerbs couldn't be loaded: " - // << error.what(); - // return nullptr; - // } + EfaLib efaLib; + std::tie(error, efaLib) = EfaLib::create(); + if (error) { + TP_VLOG(7) + << "efa transport is not viable because libfabric couldn't be loaded: " + << error.what(); + return nullptr; + } + + bool isEfaAvailable = FabricEndpoint::isEfaAvailable(); + if 
(!FabricEndpoint::isEfaAvailable()){ + TP_VLOG(7) + << "libfabric cannot find efa provider."; + return nullptr; + } // efaDeviceList deviceList; // std::tie(error, deviceList) = efaDeviceList::create(efaLib); diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index c70e2ea60..cea6e6540 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -17,7 +17,8 @@ namespace transport { namespace efa { Reactor::Reactor() { - postRecvRequests(kNumPendingRecvReqs); + // postRecvRequests(kNumPendingRecvReqs); + endpoint = std::make_shared(); startThread("TP_efa_reactor"); } diff --git a/tensorpipe/transport/efa/reactor.h b/tensorpipe/transport/efa/reactor.h index c6c2130da..90842ad28 100644 --- a/tensorpipe/transport/efa/reactor.h +++ b/tensorpipe/transport/efa/reactor.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -132,7 +132,7 @@ class Reactor final : public BusyPollingLoop { std::shared_ptr endpoint; - void postRecvRequests(int num); + // void postRecvRequests(int num); std::atomic closed_{false}; std::atomic joined_{false}; From 4abd29dfc934e744519a1c55c44f488eb0f5ee91 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Mon, 30 Aug 2021 15:40:35 +0100 Subject: [PATCH 03/19] fix --- tensorpipe/CMakeLists.txt | 2 +- tensorpipe/transport/efa/reactor.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorpipe/CMakeLists.txt b/tensorpipe/CMakeLists.txt index 5ed5bded4..f3cf81405 100644 --- a/tensorpipe/CMakeLists.txt +++ b/tensorpipe/CMakeLists.txt @@ -170,7 +170,7 @@ tp_conditional_backend( TP_ENABLE_EFA "Enable EFA transport" "LINUX") if(TP_ENABLE_EFA) list(APPEND TP_SRCS - common/fabric.cc + common/efa.cc transport/efa/connection_impl.cc transport/efa/context_impl.cc transport/efa/error.cc diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index cea6e6540..2a23c2440 100644 --- 
a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -79,8 +79,9 @@ int Reactor::postPendingSends() { // Unknown failure, raise exception TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); } - return 0; } + + return 0; } int Reactor::postPendingRecvs() { From 263cbf5ad3efc4d64d355b4549aaf4e398a99711 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Tue, 31 Aug 2021 10:36:50 +0100 Subject: [PATCH 04/19] fix --- tensorpipe/benchmark/CMakeLists.txt | 2 +- tensorpipe/benchmark/transport_registry.cc | 11 +++++++++++ tensorpipe/common/efa.cc | 13 ++++++++++++- tensorpipe/common/efa.h | 2 +- tensorpipe/config.h.in | 1 + tensorpipe/tensorpipe.h | 6 ++++++ 6 files changed, 32 insertions(+), 3 deletions(-) diff --git a/tensorpipe/benchmark/CMakeLists.txt b/tensorpipe/benchmark/CMakeLists.txt index d12c030aa..efd4c05e0 100644 --- a/tensorpipe/benchmark/CMakeLists.txt +++ b/tensorpipe/benchmark/CMakeLists.txt @@ -6,7 +6,7 @@ # TODO: Make those separate CMake projects. 
-add_executable(benchmark_transport benchmark_transport.cc options.cc transport_registry.cc channel_registry.cc) +add_executable(benchmark_transport benchmark_transport.cc options.cc transport_registry.cc) target_link_libraries(benchmark_transport PRIVATE tensorpipe) add_executable(benchmark_pipe benchmark_pipe.cc options.cc transport_registry.cc channel_registry.cc) diff --git a/tensorpipe/benchmark/transport_registry.cc b/tensorpipe/benchmark/transport_registry.cc index b18778a19..40c999f08 100644 --- a/tensorpipe/benchmark/transport_registry.cc +++ b/tensorpipe/benchmark/transport_registry.cc @@ -42,6 +42,17 @@ std::shared_ptr makeUvContext() { TP_REGISTER_CREATOR(TensorpipeTransportRegistry, uv, makeUvContext); +// EFA + +#if TENSORPIPE_HAS_EFA_TRANSPORT +std::shared_ptr makeEfaContext() { + return tensorpipe::transport::efa::create(); +} + +TP_REGISTER_CREATOR(TensorpipeTransportRegistry, efa, makeEfaContext); +#endif // TENSORPIPE_HAS_EFA_TRANSPORT + + void validateTransportContext( std::shared_ptr context) { if (!context) { diff --git a/tensorpipe/common/efa.cc b/tensorpipe/common/efa.cc index 82f12c514..d92ac6552 100644 --- a/tensorpipe/common/efa.cc +++ b/tensorpipe/common/efa.cc @@ -74,6 +74,7 @@ FabricContext::FabricContext(){ fi_av_straddr(av.get(), addr.name, readable_addr.name, &readable_addr.len); } + UniqueFabricPtr FabricContext::getFabricInfo(){ UniqueFabricPtr hints(fi_allocinfo()); hints->mode = FI_CONTEXT; @@ -116,7 +117,8 @@ bool FabricEndpoint::isEfaAvailable(){ struct fi_info* info_; int ret = fi_getinfo(FABRIC_VERSION, nullptr, nullptr, 0, hints.get(), &info_); - return info_ == nullptr; + info.reset(info_); + return (ret == 0); } FabricEndpoint::FabricEndpoint(){ @@ -145,5 +147,14 @@ int FabricEndpoint::PushRecvEvent(void* buffer, size_t size, uint64_t tag, fi_ad } +fi_addr_t FabricEndpoint::AddPeerAddr(FabricAddr* addr){ + TP_VLOG(9) << "Add addr"; + fi_addr_t peer_addr; + int ret = + fi_av_insert(fabric_ctx->av.get(), addr->name, 1, 
&peer_addr, 0, nullptr); + TP_DCHECK_EQ(ret, 1); + TP_CHECK_EFA_RET(ret, "Unable to add address to endpoint"); + return peer_addr; +}; } // namespace tensorpipe \ No newline at end of file diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index 78e306022..0cf8902b6 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -32,7 +32,7 @@ namespace tensorpipe { #define TP_CHECK_EFA_RET(ret, msg) \ do { \ - if (ret != 0) { \ + if (ret < 0) { \ TP_THROW_ASSERT() << msg << ". Return Code: " << ret \ << ". ERROR: " << fi_strerror(-ret); \ } \ diff --git a/tensorpipe/config.h.in b/tensorpipe/config.h.in index ff5fa16a0..e26726f51 100644 --- a/tensorpipe/config.h.in +++ b/tensorpipe/config.h.in @@ -10,5 +10,6 @@ #cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT #cmakedefine01 TENSORPIPE_HAS_IBV_TRANSPORT +#cmakedefine01 TENSORPIPE_HAS_EFA_TRANSPORT #cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL diff --git a/tensorpipe/tensorpipe.h b/tensorpipe/tensorpipe.h index 15b54f97c..d720806aa 100644 --- a/tensorpipe/tensorpipe.h +++ b/tensorpipe/tensorpipe.h @@ -41,6 +41,12 @@ #include #endif // TENSORPIPE_HAS_IBV_TRANSPORT +#if TENSORPIPE_HAS_EFA_TRANSPORT +#include +#include +#include +#endif // TENSORPIPE_HAS_EFA_TRANSPORT + // Channels #include From 52587b13d685dacc4938c288049f607df70e144d Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Thu, 2 Sep 2021 09:23:23 +0100 Subject: [PATCH 05/19] fix --- tensorpipe/common/efa.cc | 18 +- tensorpipe/common/efa.h | 15 +- tensorpipe/common/efa_read_write_ops.h | 131 +++++++++----- tensorpipe/transport/efa/connection_impl.cc | 181 +++++-------------- tensorpipe/transport/efa/connection_impl.h | 59 +----- tensorpipe/transport/efa/context_impl.cc | 8 +- tensorpipe/transport/efa/reactor.cc | 189 +++++++------------- tensorpipe/transport/efa/reactor.h | 30 ++-- 8 files changed, 246 insertions(+), 385 deletions(-) diff --git a/tensorpipe/common/efa.cc b/tensorpipe/common/efa.cc index d92ac6552..d3db4317e 100644 --- 
a/tensorpipe/common/efa.cc +++ b/tensorpipe/common/efa.cc @@ -125,12 +125,12 @@ FabricEndpoint::FabricEndpoint(){ fabric_ctx = std::make_unique(); } -int FabricEndpoint::PollCQ(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count){ - int ret = fi_cq_readfrom(fabric_ctx->cq.get(), &cq_entries, count, src_addrs); +int FabricEndpoint::poll_cq(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count){ + int ret = fi_cq_readfrom(fabric_ctx->cq.get(), cq_entries, count, src_addrs); return ret; } -int FabricEndpoint::PushSendEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, void* context){ +int FabricEndpoint::post_send(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, void* context){ int ret = fi_tsend(fabric_ctx->ep.get(), buffer, size, nullptr, dest_addr, tag, context); if (ret < 0 && ret != -FI_EAGAIN) { TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); @@ -138,7 +138,7 @@ int FabricEndpoint::PushSendEvent(void* buffer, size_t size, uint64_t tag, fi_ad return ret; } -int FabricEndpoint::PushRecvEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, uint64_t ignore, void* context){ +int FabricEndpoint::post_recv(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, uint64_t ignore, void* context){ int ret = fi_trecv(fabric_ctx->ep.get(), buffer, size, nullptr, dest_addr, tag, ignore, context); if (ret < 0 && ret != -FI_EAGAIN) { TP_CHECK_EFA_RET(ret, "Unable to do fi_trecv message"); @@ -147,9 +147,9 @@ int FabricEndpoint::PushRecvEvent(void* buffer, size_t size, uint64_t tag, fi_ad } -fi_addr_t FabricEndpoint::AddPeerAddr(FabricAddr* addr){ - TP_VLOG(9) << "Add addr"; +fi_addr_t FabricEndpoint::addPeerAddr(FabricAddr* addr){ fi_addr_t peer_addr; + // TP_LOG_WARNING() << "Add: " << addr->DebugStr(); int ret = fi_av_insert(fabric_ctx->av.get(), addr->name, 1, &peer_addr, 0, nullptr); TP_DCHECK_EQ(ret, 1); @@ -157,4 +157,10 @@ fi_addr_t FabricEndpoint::AddPeerAddr(FabricAddr* 
addr){ return peer_addr; }; + void FabricEndpoint::removePeerAddr(fi_addr_t peer_addr){ + int ret = + fi_av_remove(fabric_ctx->av.get(), &peer_addr, 1, 0); + TP_DCHECK_EQ(ret, 0); + }; + } // namespace tensorpipe \ No newline at end of file diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index 0cf8902b6..ff5e97ea0 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ -#ifndef COMMON_EFA_H_ -#define COMMON_EFA_H_ +#ifndef TENSORPIPE_COMMON_EFA_H_ +#define TENSORPIPE_COMMON_EFA_H_ #include @@ -135,12 +135,13 @@ class FabricEndpoint { public: FabricEndpoint(); - fi_addr_t AddPeerAddr(FabricAddr* addr); + fi_addr_t addPeerAddr(FabricAddr* addr); + void removePeerAddr(fi_addr_t peer_addr); - int PushSendEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t dst_addr, void* context = nullptr); - int PushRecvEvent(void* buffer, size_t size, uint64_t tag, fi_addr_t src_addr, uint64_t ignore, void* context = nullptr); + int post_send(void* buffer, size_t size, uint64_t tag, fi_addr_t dst_addr, void* context = nullptr); + int post_recv(void* buffer, size_t size, uint64_t tag, fi_addr_t src_addr, uint64_t ignore, void* context = nullptr); - int PollCQ(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count); + int poll_cq(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count); static bool isEfaAvailable(); @@ -150,4 +151,4 @@ class FabricEndpoint { } // namespace tensorpipe -#endif // COMMON_EFA_H_ +#endif // TENSORPIPE_COMMON_EFA_H_ diff --git a/tensorpipe/common/efa_read_write_ops.h b/tensorpipe/common/efa_read_write_ops.h index 714d71a1c..9662aa9e0 100644 --- a/tensorpipe/common/efa_read_write_ops.h +++ b/tensorpipe/common/efa_read_write_ops.h @@ -28,10 +28,10 @@ namespace tensorpipe { // to expected, and dynamically allocate, or 2) know how many bytes // to expect, and preallocate the destination memory. 
class EFAReadOperation { - public: + public: enum Mode { - READ_LENGTH, - READ_PAYLOAD, + WAIT_TO_POST, + WAIT_TO_COMPLETE, COMPLETE, }; @@ -50,13 +50,23 @@ class EFAReadOperation { // inline void readFromLoop(); // Returns if this read operation is complete. - inline bool completeFromLoop() const; + inline bool completed() const; + inline bool posted() const; + + inline void setCompleted(); + + inline void setWaitToCompleted(); + + inline size_t getReadLength(); + + inline size_t* getLengthPtr(); + inline char* getBufferPtr(); // Invoke user callback. inline void callbackFromLoop(const Error& error); - // private: - Mode mode_{READ_LENGTH}; + private: + Mode mode_{WAIT_TO_POST}; char* ptr_{nullptr}; // Number of bytes as specified by the user (if applicable). @@ -95,40 +105,33 @@ void EFAReadOperation::allocFromLoop() { } } -// void EFAReadOperation::readFromLoop(size_t nread) { -// bytesRead_ += nread; -// if (mode_ == READ_LENGTH) { -// TP_DCHECK_LE(bytesRead_, sizeof(readLength_)); -// if (bytesRead_ == sizeof(readLength_)) { -// if (givenLength_.has_value()) { -// TP_DCHECK(ptr_ != nullptr || givenLength_.value() == 0); -// TP_DCHECK_EQ(readLength_, givenLength_.value()); -// } else { -// TP_DCHECK(ptr_ == nullptr); -// buffer_ = std::make_unique(readLength_); -// ptr_ = buffer_.get(); -// } -// if (readLength_ == 0) { -// mode_ = COMPLETE; -// } else { -// mode_ = READ_PAYLOAD; -// } -// bytesRead_ = 0; -// } -// } else if (mode_ == READ_PAYLOAD) { -// TP_DCHECK_LE(bytesRead_, readLength_); -// if (bytesRead_ == readLength_) { -// mode_ = COMPLETE; -// } -// } else { -// TP_THROW_ASSERT() << "invalid mode " << mode_; -// } -// } - -bool EFAReadOperation::completeFromLoop() const { +inline size_t* EFAReadOperation::getLengthPtr() { + return &readLength_; +}; +inline char* EFAReadOperation::getBufferPtr() { + return ptr_; +}; + +inline size_t EFAReadOperation::getReadLength() { + return readLength_; +}; + +bool EFAReadOperation::completed() const { return mode_ 
== COMPLETE; } +bool EFAReadOperation::posted() const { + return !(mode_ == WAIT_TO_POST); +} + +void EFAReadOperation::setCompleted() { + mode_ = COMPLETE; +} + +void EFAReadOperation::setWaitToCompleted() { + mode_ = WAIT_TO_COMPLETE; +} + void EFAReadOperation::callbackFromLoop(const Error& error) { fn_(error, ptr_, readLength_); } @@ -141,10 +144,8 @@ void EFAReadOperation::callbackFromLoop(const Error& error) { // must remain valid until the write callback has been called. class EFAWriteOperation { public: - enum Mode { - WRITE_LENGTH, - WRITE_PAYLOAD, // Not used + WAIT_TO_POST, WAIT_TO_COMPLETE, COMPLETE, }; @@ -165,11 +166,27 @@ class EFAWriteOperation { // Invoke user callback. inline void callbackFromLoop(const Error& error); - - // private: - Mode mode_{WRITE_LENGTH}; + // set mode to WAIT_TO_COMPLETE + inline void setWaitComplete(); + + inline bool posted(); + + // Returns if this write operation is complete. + inline bool completed() const; + // set mode to complete + inline void setCompleted(); + // get peer address + inline fi_addr_t getPeerAddr(); + // set peer address + inline void setPeerAddr(fi_addr_t peer_addr); + // get length + inline size_t getLength() const; + + private: + Mode mode_{WAIT_TO_POST}; const char* ptr_; const size_t length_; + fi_addr_t peer_addr_; // Buffers (structs with pointers and lengths) to write to stream. 
std::array bufs_; @@ -198,4 +215,32 @@ void EFAWriteOperation::callbackFromLoop(const Error& error) { fn_(error); } +bool EFAWriteOperation::posted() { + return !(mode_ == WAIT_TO_POST); +} + +size_t EFAWriteOperation::getLength() const { + return length_; +} + +void EFAWriteOperation::setWaitComplete() { + mode_ = WAIT_TO_COMPLETE; +} + +void EFAWriteOperation::setCompleted() { + mode_ = COMPLETE; +} + +bool EFAWriteOperation::completed() const { + return mode_ == COMPLETE; +} + +void EFAWriteOperation::setPeerAddr(fi_addr_t peer_addr) { + peer_addr_ = peer_addr; +} + +fi_addr_t EFAWriteOperation::getPeerAddr() { + return peer_addr_; +} + } // namespace tensorpipe diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index 77fcea6c5..d12a9be30 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -15,10 +15,10 @@ #include #include +#include #include #include #include -#include #include #include #include @@ -31,28 +31,6 @@ namespace tensorpipe { namespace transport { namespace efa { -namespace { - -// When the connection gets closed, to avoid leaks, it needs to "reclaim" all -// the work requests that it had posted, by waiting for their completion. They -// may however complete with error, which makes it harder to identify and -// distinguish them from failing incoming requests because, in principle, we -// cannot access the opcode field of a failed work completion. Therefore, we -// assign a special ID to those types of requests, to match them later on. -constexpr uint64_t kWriteRequestId = 1; -constexpr uint64_t kAckRequestId = 2; - -// The data that each queue pair endpoint needs to send to the other endpoint in -// order to set up the queue pair itself. This data is transferred over a TCP -// connection. 
-// struct Exchange { -// efaSetupInformation setupInfo; -// uint64_t memoryRegionPtr; -// uint32_t memoryRegionKey; -// }; - -} // namespace - ConnectionImpl::ConnectionImpl( ConstructorToken token, std::shared_ptr context, @@ -77,9 +55,10 @@ ConnectionImpl::ConnectionImpl( void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); - Error error; // The connection either got a socket or an address, but not both. + endpoint = context_->getReactor().endpoint; + TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); if (!socket_.hasValue()) { std::tie(error, socket_) = @@ -106,9 +85,6 @@ void ConnectionImpl::initImplFromLoop() { setError(std::move(error)); return; } - // Register methods to be called when our peer writes to our inbox and reads - // from our outbox. - // context_->getReactor().registerQp(qp_->qp_num, shared_from_this()); // We're sending address first, so wait for writability. state_ = SEND_ADDR; @@ -121,22 +97,6 @@ void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { processReadOperationsFromLoop(); } -// void ConnectionImpl::readImplFromLoop( -// AbstractNopHolder& object, -// read_nop_callback_fn fn) { -// readOperations_.emplace_back( -// &object, -// [fn{std::move(fn)}]( -// const Error& error, const void* /* unused */, size_t /* unused */) -// { -// fn(error); -// }); - -// // If the inbox already contains some data, we may be able to process this -// // operation right away. -// processReadOperationsFromLoop(); -// } - void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, @@ -211,18 +171,20 @@ void ConnectionImpl::handleEventInFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == RECV_ADDR) { struct FabricAddr addr; - - auto err = socket_.read(&addr.name, 64); + // auto x = &addr.name; + auto err = socket_.read(addr.name, sizeof(addr.name)); // Crossing our fingers that the exchange information is small enough that // it can be read in a single chunk. 
- if (err != 64) { - setError(TP_CREATE_ERROR(ShortReadError, 64, err)); + if (err != sizeof(addr.name)) { + setError(TP_CREATE_ERROR(ShortReadError, sizeof(addr.name), err)); return; } - peer_addr = endpoint->AddPeerAddr(&addr); + peer_addr = endpoint->addPeerAddr(&addr); // The connection is usable now. + context_->getReactor().registerHandler(peer_addr, shared_from_this()); + state_ = ESTABLISHED; processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote @@ -247,12 +209,12 @@ void ConnectionImpl::handleEventOutFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == SEND_ADDR) { FabricAddr addr = endpoint->fabric_ctx->addr; - - auto err = socket_.write(reinterpret_cast(&addr.name), 64); + auto err = + socket_.write(reinterpret_cast(addr.name), sizeof(addr.name)); // Crossing our fingers that the exchange information is small enough that // it can be written in a single chunk. - if (err != 64) { - setError(TP_CREATE_ERROR(ShortWriteError, 64, err)); + if (err != sizeof(addr.name)) { + setError(TP_CREATE_ERROR(ShortWriteError, sizeof(addr.name), err)); return; } @@ -274,41 +236,18 @@ void ConnectionImpl::processReadOperationsFromLoop() { return; } - // pop out finished event at front - while (!readOperations_.empty()) { - EFAReadOperation& readOperation = readOperations_.front(); - if (readOperation.completeFromLoop()) { - readOperation.callbackFromLoop(Error::kSuccess); - readOperations_.pop_front(); - } else { - break; - } - } - - // Serve read operations - // while (!readOperations_.empty()) { - // EFAReadOperation& readOperation = readOperations_.front(); - // context_->getReactor().postRecv( - // &readOperation.readLength_, - // sizeof(size_t), - // kLength, - // peer_addr, - // 0xffffffff, // ignore lower bits for msg index - // &readOperation); - // } - for (int i = 0; i < readOperations_.size(); i++) { EFAReadOperation& readOperation = readOperations_[i]; - if (readOperation.mode_ == 
EFAReadOperation::Mode::READ_LENGTH) { + if (!readOperation.posted()) { // context_->getReactor().; context_->getReactor().postRecv( - &readOperation.readLength_, + readOperation.getLengthPtr(), sizeof(size_t), kLength, peer_addr, 0xffffffff, // ignore lower bits for msg index &readOperation); - readOperation.mode_ = EFAReadOperation::Mode::READ_PAYLOAD; + readOperation.setWaitToCompleted(); } else { // if the operation is not READ_LENGTH, all operations back are all not // READ_LENGTH, we can skip more checks @@ -317,28 +256,44 @@ void ConnectionImpl::processReadOperationsFromLoop() { } } -void ConnectionImpl::processWriteOperationsFromLoop() { - TP_DCHECK(context_->inLoop()); - - if (state_ != ESTABLISHED) { - return; - } - +void ConnectionImpl::onWriteCompleted() { while (!writeOperations_.empty()) { EFAWriteOperation& writeOperation = writeOperations_.front(); - if (writeOperation.mode_ == EFAWriteOperation::Mode::COMPLETE) { + if (writeOperation.completed()) { + writeOperation.callbackFromLoop(Error::kSuccess); writeOperations_.pop_front(); } else { break; } } +} + +void ConnectionImpl::onReadCompleted() { + while (!readOperations_.empty()) { + EFAReadOperation& readOperation = readOperations_.front(); + if (readOperation.completed()) { + readOperation.callbackFromLoop(Error::kSuccess); + readOperations_.pop_front(); + } else { + break; + } + } +} + +void ConnectionImpl::processWriteOperationsFromLoop() { + TP_DCHECK(context_->inLoop()); + + if (state_ != ESTABLISHED) { + return; + } for (int i = 0; i < writeOperations_.size(); i++) { EFAWriteOperation& writeOperation = writeOperations_[i]; - if (writeOperation.mode_ == EFAWriteOperation::Mode::WRITE_LENGTH) { + if (!writeOperation.posted()) { EFAWriteOperation::Buf* buf_array; size_t size; std::tie(buf_array, size) = writeOperation.getBufs(); + writeOperation.setPeerAddr(peer_addr); // auto size_buf = std::get<0>(writeOperation.getBufs()); // auto payload_buf = std::get<1>(writeOperation.getBufs()); 
context_->getReactor().postSend( @@ -356,7 +311,6 @@ void ConnectionImpl::processWriteOperationsFromLoop() { &writeOperation); } sendIdx++; - writeOperation.mode_ = EFAWriteOperation::Mode::WAIT_TO_COMPLETE; } else { // if the operation is not WAIT_TO_SEND, all operations back are all not // WAIT_TO_SEND, we can skip more checks @@ -365,18 +319,6 @@ void ConnectionImpl::processWriteOperationsFromLoop() { } } -// void ConnectionImpl::onError(efaLib::wc_status status, uint64_t wrId) { -// TP_DCHECK(context_->inLoop()); -// // setError(TP_CREATE_ERROR( -// // efaError, -// context_->getReactor().getefaLib().wc_status_str(status))); -// // if (wrId == kWriteRequestId) { -// // onWriteCompleted(); -// // } else if (wrId == kAckRequestId) { -// // onAckCompleted(); -// // } -// } - void ConnectionImpl::handleErrorImpl() { for (auto& readOperation : readOperations_) { readOperation.callbackFromLoop(error_); @@ -388,7 +330,7 @@ void ConnectionImpl::handleErrorImpl() { } writeOperations_.clear(); - tryCleanup(); + cleanup(); if (socket_.hasValue()) { if (state_ > INITIALIZING) { @@ -400,44 +342,11 @@ void ConnectionImpl::handleErrorImpl() { context_->unenroll(*this); } -void ConnectionImpl::tryCleanup() { - TP_DCHECK(context_->inLoop()); - // Setting the queue pair to an error state will cause all its work requests - // (both those that had started being served, and those that hadn't; including - // those from a shared receive queue) to be flushed. We need to wait for the - // completion events of all those requests to be retrieved from the completion - // queue before we can destroy the queue pair. We can do so by deferring the - // destruction to the loop, since the reactor will only proceed to invoke - // deferred functions once it doesn't have any completion events to handle. - // However the RDMA writes and the sends may be queued up inside the reactor - // and thus may not have even been scheduled yet, so we explicitly wait for - // them to complete. 
- // if (error_) { - // if (numWritesInFlight_ == 0 && numAcksInFlight_ == 0) { - // TP_VLOG(8) << "Connection " << id_ << " is ready to clean up"; - // context_->deferToLoop([impl{shared_from_this()}]() { impl->cleanup(); - // }); - // } else { - // TP_VLOG(9) << "Connection " << id_ - // << " cannot proceed to cleanup because it has " - // << numWritesInFlight_ << " pending RDMA write requests and " - // << numAcksInFlight_ << " pending send requests on QP " - // << qp_->qp_num; - // } - // } -} - void ConnectionImpl::cleanup() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " << id_ << " is cleaning up"; - - // context_->getReactor().unregisterQp(qp_->qp_num); - - // qp_.reset(); - // inboxMr_.reset(); - // inboxBuf_.reset(); - // outboxMr_.reset(); - // outboxBuf_.reset(); + context_->getReactor().unregisterHandler(peer_addr); + endpoint->removePeerAddr(peer_addr); } } // namespace efa diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h index aabdb4d4b..a93f1fc07 100644 --- a/tensorpipe/transport/efa/connection_impl.h +++ b/tensorpipe/transport/efa/connection_impl.h @@ -14,11 +14,11 @@ #include #include -#include #include +#include +#include #include #include -#include #include #include #include @@ -35,17 +35,8 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl>, + public efaEventHandler, public EpollLoop::EventHandler { -// constexpr static size_t kBufferSize = 2 * 1024 * 1024; - -// constexpr static int kNumOutboxRingbufferRoles = 3; -// using OutboxefaAcker = RingBufferRole; -// using OutboxefaWriter = RingBufferRole; -// using OutboxProducer = RingBufferRole; - -// constexpr static int kNumInboxRingbufferRoles = 2; -// using InboxConsumer = RingBufferRole; -// using InboxefaRecver = RingBufferRole; enum State { INITIALIZING = 1, @@ -73,10 +64,11 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< void handleEventsFromLoop(int 
events) override; // Implementation of efaEventHandler. -// void onRemoteProducedData(uint32_t length) override; -// void onRemoteConsumedData(uint32_t length) override; -// void onWriteCompleted() override; -// void onAckCompleted() override; + // void onRemoteProducedData(uint32_t length) override; + // void onRemoteConsumedData(uint32_t length) override; + void onWriteCompleted() override; + void onReadCompleted() override; + // void onAckCompleted() override; // void onError(efaLib::wc_status status, uint64_t wrId) override; protected: @@ -108,39 +100,7 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< std::shared_ptr endpoint; fi_addr_t peer_addr; -// efaQueuePair qp_; - uint32_t sendIdx, recvIdx; - - - // Inbox. - // Initialize header during construction because it isn't assignable. -// RingBufferHeader inboxHeader_{kBufferSize}; - // Use mmapped memory so it's page-aligned (and, one day, to use huge pages). -// MmappedPtr inboxBuf_; -// RingBuffer inboxRb_; -// efaMemoryRegion inboxMr_; - - // Outbox. - // Initialize header during construction because it isn't assignable. -// RingBufferHeader outboxHeader_{kBufferSize}; - // Use mmapped memory so it's page-aligned (and, one day, to use huge pages). -// MmappedPtr outboxBuf_; -// RingBuffer outboxRb_; -// efaMemoryRegion outboxMr_; - - // Peer inbox key, pointer and head. -// uint32_t peerInboxKey_{0}; -// uint64_t peerInboxPtr_{0}; -// uint64_t peerInboxHead_{0}; - - // The connection performs two types of send requests: writing to the remote - // inbox, or acknowledging a write into its own inbox. These send operations - // could be delayed and stalled by the reactor as only a limited number of - // work requests can be outstanding at the same time globally. Thus we keep - // count of how many we have pending to make sure they have all completed or - // flushed when we close, and that none is stuck in the pipeline. 
-// uint32_t numWritesInFlight_{0}; -// uint32_t numAcksInFlight_{0}; + uint32_t sendIdx = 0; // Pending read operations. std::deque readOperations_; @@ -170,7 +130,6 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< // a new write operation is queued. void processWriteOperationsFromLoop(); - void tryCleanup(); void cleanup(); }; diff --git a/tensorpipe/transport/efa/context_impl.cc b/tensorpipe/transport/efa/context_impl.cc index 803f36816..be139c497 100644 --- a/tensorpipe/transport/efa/context_impl.cc +++ b/tensorpipe/transport/efa/context_impl.cc @@ -45,7 +45,7 @@ std::shared_ptr ContextImpl::create() { return nullptr; } - bool isEfaAvailable = FabricEndpoint::isEfaAvailable(); + // bool isEfaAvailable = FabricEndpoint::isEfaAvailable(); if (!FabricEndpoint::isEfaAvailable()){ TP_VLOG(7) << "libfabric cannot find efa provider."; @@ -74,6 +74,12 @@ std::shared_ptr ContextImpl::create() { return std::make_shared(); } + +ContextImpl::ContextImpl() + : ContextImplBoilerplate( + generateDomainDescriptor()) { + } + // ContextImpl::ContextImpl(efaLib efaLib, efaDeviceList deviceList) // : ContextImplBoilerplate( // generateDomainDescriptor()), diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index 2a23c2440..943c07be7 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -8,8 +8,8 @@ #include -#include #include +#include #include namespace tensorpipe { @@ -28,43 +28,27 @@ void Reactor::postSend( uint64_t tag, fi_addr_t peer_addr, void* context) { - // First try send all messages in pending queue - while (!pendingSends_.empty()) { - EFASendEvent sevent = pendingSends_.front(); - int ret = - this->endpoint->PushSendEvent(sevent.buffer, sevent.size, sevent.tag, sevent.peer_addr, sevent.context); - if (ret == 0) { - // Send successfully, pop out events - pendingSends_.pop_front(); - } else if (ret == -FI_EAGAIN) { - // Event queue is full now, push the event into pending queue 
and return - pendingSends_.push_back({buffer, size, tag, peer_addr, context}); - return; - } else if (ret < 0) { - // Unknown failure, raise exception - TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); - } - } + pendingSends_.push_back({buffer, size, tag, peer_addr, context}); + postPendingRecvs(); +} - // No pending events, send out directly - int ret = - this->endpoint->PushSendEvent(buffer, size, tag, peer_addr, context); - if (ret == 0) { - // Send successfully - return; - } else if (ret == -FI_EAGAIN) { - // Event queue is full now, push the event into pending queue and return - pendingSends_.push_back({buffer, size, tag, peer_addr, context}); - return; - } else if (ret < 0) { - TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); - } +void Reactor::postRecv( + void* buffer, + size_t size, + uint64_t tag, + fi_addr_t dest_addr, + uint64_t ignore, + void* context) { + // First try send all messages in pending queue + pendingRecvs_.push_back({buffer, size, tag, dest_addr, ignore, context}); + postPendingRecvs(); } int Reactor::postPendingSends() { + // TP_LOG_WARNING() << "Post send event"; while (!pendingSends_.empty()) { EFASendEvent sevent = pendingSends_.front(); - int ret = this->endpoint->PushSendEvent( + int ret = this->endpoint->post_send( sevent.buffer, sevent.size, sevent.tag, @@ -80,14 +64,14 @@ int Reactor::postPendingSends() { TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); } } - + return 0; } int Reactor::postPendingRecvs() { while (!pendingRecvs_.empty()) { EFARecvEvent revent = pendingRecvs_.front(); - int ret = this->endpoint->PushRecvEvent( + int ret = this->endpoint->post_recv( revent.buffer, revent.size, revent.tag, @@ -101,57 +85,12 @@ int Reactor::postPendingRecvs() { return pendingRecvs_.size(); } else if (ret < 0) { // Unknown failure, raise exception - TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); + TP_CHECK_EFA_RET(ret, "Unable to do fi_trecv message"); } } return 0; } -void Reactor::postRecv( - void* buffer, 
- size_t size, - uint64_t tag, - fi_addr_t dest_addr, - uint64_t ignore, - void* context) { - // First try send all messages in pending queue - int pendingRecvNum = postPendingRecvs(); - if (pendingRecvNum == 0){ - // No pending events, send out directly - int ret = - this->endpoint->PushRecvEvent(buffer, size, tag, dest_addr, ignore, context); - if (ret == 0) { - // Send successfully - return; - } else if (ret == -FI_EAGAIN) { - // Event queue is full now, push the event into pending queue and return - pendingRecvs_.push_back({buffer, size, tag, dest_addr, ignore, context}); - return; - } else if (ret < 0) { - TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); - } - } else { - pendingRecvs_.push_back({buffer, size, tag, dest_addr, ignore, context}); - return; - } - -} - -// void Reactor::postRecvRequests(int num) { -// while (num > 0) { -// efaLib::recv_wr* badRecvWr = nullptr; -// std::array wrs; -// std::memset(wrs.data(), 0, sizeof(wrs)); -// for (int i = 0; i < std::min(num, kNumPolledWorkCompletions) - 1; i++) { -// wrs[i].next = &wrs[i + 1]; -// } -// int rv = getefaLib().post_srq_recv(srq_.get(), wrs.data(), &badRecvWr); -// TP_THROW_SYSTEM_IF(rv != 0, errno); -// TP_THROW_ASSERT_IF(badRecvWr != nullptr); -// num -= std::min(num, kNumPolledWorkCompletions); -// } -// } - void Reactor::setId(std::string id) { id_ = std::move(id); } @@ -176,7 +115,7 @@ Reactor::~Reactor() { // void Reactor::postRecvRequests(int num){ // uint64_t size_buffer; -// int ret = endpoint->PushRecvEvent(&size_buffer, sizeof(uint64_t), kLength, +// int ret = endpoint->post_recv(&size_buffer, sizeof(uint64_t), kLength, // FI_ADDR_UNSPEC); // } @@ -184,13 +123,18 @@ Reactor::~Reactor() { bool Reactor::pollOnce() { std::array cq_entries; std::array src_addrs; + + postPendingSends(); + postPendingRecvs(); + auto rv = - endpoint->PollCQ(cq_entries.data(), src_addrs.data(), cq_entries.size()); + endpoint->poll_cq(cq_entries.data(), src_addrs.data(), cq_entries.size()); - if (rv == 0) { 
+ if (rv == 0 || rv == -FI_EAGAIN) { return false; + } else { + TP_CHECK_EFA_RET(rv, "Completion queue poll error."); } - TP_THROW_SYSTEM_IF(rv < 0, errno); int numRecvs = 0; int numWrites = 0; @@ -199,68 +143,63 @@ bool Reactor::pollOnce() { struct fi_cq_tagged_entry& cq = cq_entries[cqIdx]; fi_addr_t& src_addr = src_addrs[cqIdx]; uint32_t msg_idx = static_cast(cq.tag); - if (cq.flags && FI_SEND) { + if (cq.flags & FI_SEND) { // Send event - if (cq.flags && kLength) { + if (cq.tag & kLength) { // Send size finished, check whether it's zero sized message auto* operation_ptr = static_cast(cq.op_context); - if (operation_ptr->length_ == 0){ - operation_ptr->mode_ = EFAWriteOperation::Mode::COMPLETE; - operation_ptr->callbackFromLoop(Error::kSuccess); + if (operation_ptr->getLength() == 0) { + operation_ptr->setCompleted(); + efaEventHandler_[operation_ptr->getPeerAddr()]->onWriteCompleted(); } - } else if (cq.flags && kPayload) { + } else if (cq.tag & kPayload) { auto* operation_ptr = static_cast(cq.op_context); - operation_ptr->mode_ = EFAWriteOperation::Mode::COMPLETE; - operation_ptr->callbackFromLoop(Error::kSuccess); + operation_ptr->setCompleted(); + efaEventHandler_[operation_ptr->getPeerAddr()]->onWriteCompleted(); } - } else if (cq.flags && FI_RECV) { + } else if (cq.flags & FI_RECV) { // Receive event - // auto iter = efaEventHandler_.find(src_addr); - if (cq.tag && kLength) { + if (cq.tag & kLength) { // Received length information auto* operation_ptr = static_cast(cq.op_context); - operation_ptr->mode_ = EFAReadOperation::Mode::READ_PAYLOAD; - operation_ptr->allocFromLoop(); - // postRecv() - // void* buffer = operation_ptr->perpareBuffer(); - postRecv( - operation_ptr->ptr_, - operation_ptr->readLength_, - kPayload | msg_idx, - src_addr, - 0, // Exact match of tag - operation_ptr); - // iter->second->onRecvLength(msg_idx); - } else if (cq.tag && kPayload) { + if (operation_ptr->getReadLength() == 0) { + operation_ptr->setCompleted(); + 
efaEventHandler_[src_addr]->onReadCompleted(); + } else { + // operation_ptr->mode_ = EFAReadOperation::Mode::READ_PAYLOAD; + operation_ptr->allocFromLoop(); + postRecv( + operation_ptr->getBufferPtr(), + operation_ptr->getReadLength(), + kPayload | msg_idx, + src_addr, + 0, // Exact match of tag + operation_ptr); + operation_ptr->setWaitToCompleted(); + } + } else if (cq.tag & kPayload) { // Received payload auto* operation_ptr = static_cast(cq.op_context); - operation_ptr->mode_ = EFAReadOperation::Mode::COMPLETE; - operation_ptr->callbackFromLoop(Error::kSuccess); + operation_ptr->setCompleted(); + efaEventHandler_[src_addr]->onReadCompleted(); } } - // auto iter = queuePairEventHandler_.find(wc.qp_num); - // TP_THROW_ASSERT_IF(iter == queuePairEventHandler_.end()) - // << "Got work completion for unknown queue pair " << wc.qp_num; - - // if (wc.status != efaLib::WC_SUCCESS) { - // iter->second->onError(wc.status, wc.wr_id); - // continue; - // } } return true; } bool Reactor::readyToClose() { - return true; - // return queuePairEventHandler_.size() == 0; + return efaEventHandler_.size() == 0; } -// void Reactor::registerQp( -// uint32_t qpn, -// std::shared_ptr eventHandler) { -// queuePairEventHandler_.emplace(qpn, std::move(eventHandler)); -// } +void Reactor::registerHandler(fi_addr_t peer_addr, std::shared_ptr eventHandler) { + efaEventHandler_.emplace(peer_addr, std::move(eventHandler)); +} + +void Reactor::unregisterHandler(fi_addr_t peer_addr){ + efaEventHandler_.erase(peer_addr); +} } // namespace efa } // namespace transport diff --git a/tensorpipe/transport/efa/reactor.h b/tensorpipe/transport/efa/reactor.h index 90842ad28..ac96ccbc0 100644 --- a/tensorpipe/transport/efa/reactor.h +++ b/tensorpipe/transport/efa/reactor.h @@ -28,20 +28,14 @@ namespace tensorpipe { namespace transport { namespace efa { -// class efaEventHandler { -// public: -// virtual void onRecvLength(uint32_t msg_idx) = 0; - -// virtual void onSendData(uint32_t msg_idx) = 0; - -// 
virtual void onSendCompleted(uint32_t msg_idx) = 0; - -// virtual void onRecvCompleted(uint32_t msg_idx) = 0; +class efaEventHandler { + public: + virtual void onWriteCompleted() = 0; -// virtual void onError(int errno) = 0; + virtual void onReadCompleted() = 0; -// virtual ~efaEventHandler() = default; -// }; + virtual ~efaEventHandler() = default; +}; enum efaTag: uint64_t{ kLength = 1ULL << 32, @@ -81,9 +75,9 @@ class Reactor final : public BusyPollingLoop { // return addr_; // } - // void registerQp(uint32_t qpn, std::shared_ptr eventHandler); + void registerHandler(fi_addr_t peer_addr, std::shared_ptr eventHandler); - // void unregisterQp(uint32_t qpn); + void unregisterHandler(fi_addr_t peer_addr); void postSend(void* buffer, size_t size, uint64_t tag, fi_addr_t peer_addr, void* context); @@ -98,6 +92,9 @@ class Reactor final : public BusyPollingLoop { ~Reactor(); + + std::shared_ptr endpoint; + protected: bool pollOnce() override; @@ -130,7 +127,6 @@ class Reactor final : public BusyPollingLoop { int postPendingRecvs(); int postPendingSends(); - std::shared_ptr endpoint; // void postRecvRequests(int num); @@ -145,8 +141,8 @@ class Reactor final : public BusyPollingLoop { std::string id_{"N/A"}; // The registered event handlers for each queue pair. 
- // std::unordered_map> - // efaEventHandler_; + std::unordered_map> + efaEventHandler_; // uint32_t numAvailableWrites_{kNumPendingWriteReqs}; // uint32_t numAvailableAcks_{kNumPendingAckReqs}; From 9765f9fd5beab1044036cf435d16d58014395dc9 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Thu, 2 Sep 2021 10:12:44 +0100 Subject: [PATCH 06/19] add description --- tensorpipe/transport/efa/mode.md | 76 ++++++++++++-------------------- 1 file changed, 27 insertions(+), 49 deletions(-) diff --git a/tensorpipe/transport/efa/mode.md b/tensorpipe/transport/efa/mode.md index ead2f46ae..3857f2bf0 100644 --- a/tensorpipe/transport/efa/mode.md +++ b/tensorpipe/transport/efa/mode.md @@ -1,49 +1,27 @@ -The general model of EFA is similar to efa, which has event queues and completion queues for send/recv. -User have to poll the completion queue to trigger the send/recv event happen. Otherwise the event will just stay in the event queue. - -The code below mainly from aws-ofi-nccl, which used libfabric to implement nccl's interface - -Send process: - -1. Push sent event to the completion queue -```Cpp -while (true){ - rc = fi_send(...) # return code - if (rc == 0) - break; # send succeed - else if (rc == -FI_EAGAIN) { - # This is a retryable error - # Can attempt to progress the completion queue to make send event happen - /* - * Process completions so that you have enough - * resources for sending connect message - */ - ret = nccl_ofi_progress(nccl_ofi_component[dev]); - if (OFI_UNLIKELY(ret != 0)) - goto error; - } - else { - NCCL_OFI_WARN("Unable to send connect message for dev %d. RC: %zd, ERROR: %s", - dev, rc, fi_strerror(-rc)); - ret = ncclSystemError; - goto error; - } -} while (true); -``` -This part is bit different from efa, that pushing send event to the event queue may fail, which might need retry. - -2. 
Progress the completion queue ```Cpp -do { - ret = nccl_ofi_progress(nccl_ofi_component[dev]); - if (OFI_UNLIKELY(ret != 0)) - goto error; -} while (true); -``` - -The receive process is the same as the send process. - -Some design question I'd like to ask for suggestions: -1. Since the memory doesn't need to be pinned/registered, do I still need the RingBuffer related class for EFA? -2. How should I arrange the event loop? Previously I used a busy polling thread for the progress of completion queue, and retrying push the send event to the queue directly in the main thread. Is this a good practice in tensorpipe? - \ No newline at end of file +# EFA + +The EFA communication model can be considered a simplified ibverbs. Send/recv operations are asynchronous, driven by an event queue and a completion queue. The operations themselves don't need memory registration like ibverbs, and they also don't behave like stream operations on a socket. The complexity of memory handling is taken care of by the underlying provider (libfabric + efa). The overall implementation can be considered as Reactor from ibverbs + StreamOperation from uv. + +EFA supports the send-after-send ordering guarantee for data operations, which means the message order is preserved. However, the completion order is not guaranteed when reading events from the completion queue. +For example, the sender posts three send operations S1, S2, S3; the receiver posts three recv operations R1, R2, R3. These operations are matched exactly due to the send-after-send guarantee. But when reading from the completion queue at the receiver side, the completion order might be R2, R1, R3 or any other order. The same holds for the sender side. + +This brings complexity to the busy polling thread when dealing with completion events, since the callbacks of write operations should be executed in order. To address this issue, the pointer to the `EFAWriteOperation` is passed as the operation context when posting the send event. 
And in the completion stage, it will set the mode of `EFAWriteOperation` to completed. A separate function is executed later that iterates the `writeOperations_` deque from the front and executes the callback of each operation that is done. + +For the receiver part, it's more complex. For example, suppose there are two incoming writeOperations. They become 4 send operations at the sender side: SEND_SIZE1, SEND_PAYLOAD1, SEND_SIZE2, SEND_PAYLOAD2. At the receiver side, the expected behavior is: +1. Post a receive event for a single 64-bit size, such as RECV_SIZE1 +2. Poll from cq; when RECV_SIZE1 is done, post RECV_PAYLOAD1 with the size in RECV_SIZE1 +3. Do the same thing for the second operation + +However, when the four send operations are issued concurrently, the first completion event at the receiver side might be RECV_SIZE2. If we follow the process above, the recv order will be messed up. To address this problem, the implementation uses tag matching: each operation has an index decided at the sender side. Two indicators, kLength=1ULL<<32 and kPayload=1ULL<<33, are used to indicate the type of the message. The message tag is a 64-bit integer whose high 32 bits are the indicator (kLength or kPayload) and whose low 32 bits are the operation id. + +Send side: + +1. `post_send(buffer=&length, size=sizeof(int64_t), tag=kLength | msg_id, ...)` +2. `post_send(buffer=buffer, size=length, tag = kPayload | msg_id, ...)` +3. `msg_id++` msg_id is uint32_t + +Receiver side: +At the receiver side, we first recv the message with high 32 bits equaling kLength and decode the index from the low 32 bits. And then post a recv event with tag `kPayload | msg_id` +1. `post_recv(buffer=&length, size=sizeof(int64_t), tag=kLength, ignore=0xffffffff, ...)` ignore=0xffffffff means ignore lower 32 bits when matching tag +2. decode message id from the incoming message tag (take lower 32bits) +3. 
`post_recv(buffer=buffer, size=length, tag=kPayload | msg_id, ignore=0, ...)` ignore=0 means the tag should be exactly the same to match \ No newline at end of file From 26c765c56a0292f0ad57d06fe58f2adda2544530 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Thu, 2 Sep 2021 11:55:53 +0100 Subject: [PATCH 07/19] add link --- tensorpipe/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorpipe/CMakeLists.txt b/tensorpipe/CMakeLists.txt index f3cf81405..34c42bb99 100644 --- a/tensorpipe/CMakeLists.txt +++ b/tensorpipe/CMakeLists.txt @@ -184,8 +184,8 @@ if(TP_ENABLE_EFA) transport/efa/factory.h transport/efa/utility.h) set(TENSORPIPE_HAS_EFA_TRANSPORT 1) + list(APPEND TP_LINK_LIBRARIES fabric) list(APPEND TP_INCLUDE_DIRS $) - # list(APPEND TP_STATIC_OR_SHARED fabric) endif() ## MAC OS specific library deps From f9b14e57e7e4c4c818ca9b402438a6a307729cd9 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 3 Sep 2021 07:45:25 +0100 Subject: [PATCH 08/19] fix --- tensorpipe/common/epoll_loop.cc | 2 +- tensorpipe/transport/efa/connection_impl.cc | 8 ++++---- tensorpipe/transport/efa/reactor.cc | 7 ------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/tensorpipe/common/epoll_loop.cc b/tensorpipe/common/epoll_loop.cc index b1c5e1df6..5fc07f912 100644 --- a/tensorpipe/common/epoll_loop.cc +++ b/tensorpipe/common/epoll_loop.cc @@ -129,7 +129,7 @@ bool EpollLoop::hasRegisteredHandlers() { } void EpollLoop::loop() { - setThreadName("TP_IBV_loop"); + setThreadName("TP_epoll_loop"); // Stop when another thread has asked the loop the close and when all // handlers have been unregistered except for the wakeup eventfd one. 
diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index d12a9be30..ee858cfb1 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -249,8 +249,8 @@ void ConnectionImpl::processReadOperationsFromLoop() { &readOperation); readOperation.setWaitToCompleted(); } else { - // if the operation is not READ_LENGTH, all operations back are all not - // READ_LENGTH, we can skip more checks + // if the operation is posted, all operations back should be posted + // we can skip more checks break; } } @@ -312,8 +312,8 @@ void ConnectionImpl::processWriteOperationsFromLoop() { } sendIdx++; } else { - // if the operation is not WAIT_TO_SEND, all operations back are all not - // WAIT_TO_SEND, we can skip more checks + // if the operation is posted, all operations back should be posted + // we can skip more checks break; } } diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index 943c07be7..739530046 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -113,13 +113,6 @@ Reactor::~Reactor() { join(); } -// void Reactor::postRecvRequests(int num){ -// uint64_t size_buffer; -// int ret = endpoint->post_recv(&size_buffer, sizeof(uint64_t), kLength, -// FI_ADDR_UNSPEC); - -// } - bool Reactor::pollOnce() { std::array cq_entries; std::array src_addrs; From 6e832180cef27324aab9431ad648ae0c53b34a6e Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 17 Sep 2021 14:09:02 +0100 Subject: [PATCH 09/19] remove dependencys --- tensorpipe/CMakeLists.txt | 1 - tensorpipe/common/efa.cc | 166 --------------- tensorpipe/common/efa.h | 211 +++++++++++++++----- tensorpipe/common/efa_lib.h | 65 +++++- tensorpipe/transport/efa/connection_impl.cc | 9 +- tensorpipe/transport/efa/connection_impl.h | 2 +- tensorpipe/transport/efa/context_impl.cc | 42 +--- tensorpipe/transport/efa/context_impl.h | 2 +- 
tensorpipe/transport/efa/reactor.cc | 77 ++++--- tensorpipe/transport/efa/reactor.h | 109 +++++----- 10 files changed, 337 insertions(+), 347 deletions(-) delete mode 100644 tensorpipe/common/efa.cc diff --git a/tensorpipe/CMakeLists.txt b/tensorpipe/CMakeLists.txt index 34c42bb99..4397e9761 100644 --- a/tensorpipe/CMakeLists.txt +++ b/tensorpipe/CMakeLists.txt @@ -184,7 +184,6 @@ if(TP_ENABLE_EFA) transport/efa/factory.h transport/efa/utility.h) set(TENSORPIPE_HAS_EFA_TRANSPORT 1) - list(APPEND TP_LINK_LIBRARIES fabric) list(APPEND TP_INCLUDE_DIRS $) endif() diff --git a/tensorpipe/common/efa.cc b/tensorpipe/common/efa.cc deleted file mode 100644 index d3db4317e..000000000 --- a/tensorpipe/common/efa.cc +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace tensorpipe{ - -FabricContext::FabricContext(){ - UniqueFabricPtr fabinfo = getFabricInfo(); - struct fi_info *info = fabinfo.get(); - struct fi_av_attr av_attr = {}; - - // fi_fabric: create fabric - struct fid_fabric *fabric_; - int ret = fi_fabric(info->fabric_attr, &fabric_, nullptr); - TP_CHECK_EFA_RET(ret, "Couldn't open a fabric provider"); - fabric.reset(fabric_); - - // fi_domain: create domain - struct fid_domain *domain_; - ret = fi_domain(fabric.get(), info, &domain_, nullptr); - // LOG(INFO) << domain_-> - TP_CHECK_EFA_RET(ret, "Couldn't open a fabric access domain"); - domain.reset(domain_); - - // fi_av_open: create address vector - av_attr.type = FI_AV_TABLE; - struct fid_av *av_; - ret = fi_av_open(domain.get(), &av_attr, &av_, nullptr); - av.reset(av_); - TP_CHECK_EFA_RET(ret, "Couldn't open AV"); - - // fi_cq_open: open completion queue - struct fid_cq *cq_; - struct fi_cq_attr cq_attr = {}; - 
cq_attr.format = FI_CQ_FORMAT_TAGGED; - cq_attr.size = info->rx_attr->size; - ret = fi_cq_open(domain.get(), &cq_attr, &cq_, nullptr); - cq.reset(cq_); - TP_CHECK_EFA_RET(ret, "Couldn't open CQ"); - - // fi_endpoint: create transport level communication endpoint(s) - struct fid_ep *ep_; - ret = fi_endpoint(domain.get(), info, &ep_, nullptr); - ep.reset(ep_); - TP_CHECK_EFA_RET(ret, "Couldn't allocate endpoint"); - - // fi_ep_bind: bind CQ and AV to the endpoint - ret = fi_ep_bind(ep.get(), (fid_t)cq.get(), FI_RECV | FI_TRANSMIT); - TP_CHECK_EFA_RET(ret, "Couldn't bind EP-CQ"); - ret = fi_ep_bind(ep.get(), (fid_t)av.get(), 0); - TP_CHECK_EFA_RET(ret, "Couldn't bind EP-AV"); - - // fi_enable: enable endpoint for communication - ret = fi_enable(ep.get()); - TP_CHECK_EFA_RET(ret, "Couldn't enable endpoint"); - - // fi_getname: get endpoint name - ret = fi_getname((fid_t)ep.get(), addr.name, &addr.len); - TP_CHECK_EFA_RET(ret, "Call to fi_getname() failed"); - // set readable address name - fi_av_straddr(av.get(), addr.name, readable_addr.name, &readable_addr.len); -} - - -UniqueFabricPtr FabricContext::getFabricInfo(){ - UniqueFabricPtr hints(fi_allocinfo()); - hints->mode = FI_CONTEXT; - hints->ep_attr->type = FI_EP_RDM; // Reliable Datagram - hints->caps = FI_TAGGED | FI_MSG | FI_REMOTE_COMM | FI_DIRECTED_RECV | FI_LOCAL_COMM | FI_SOURCE; - hints->tx_attr->msg_order = FI_ORDER_SAS; - hints->rx_attr->msg_order = FI_ORDER_SAS; - hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; - hints->domain_attr->caps = - FI_LOCAL_COMM | FI_REMOTE_COMM; // Enable local loopback - hints->domain_attr->av_type = FI_AV_TABLE; - hints->fabric_attr->prov_name = strdup("efa"); - - UniqueFabricPtr info; - struct fi_info* info_; - int ret = - fi_getinfo(FABRIC_VERSION, nullptr, nullptr, 0, hints.get(), &info_); - info.reset(info_); - TP_CHECK_EFA_RET(ret, "fi_getinfo failed"); - // TP_THROW_ASSERT() << "Could not find any optimal 
provider. Return Code: " - // << ret << ". ERROR: " << fi_strerror(-ret); - return info; -} - -bool FabricEndpoint::isEfaAvailable(){ - UniqueFabricPtr hints(fi_allocinfo()); - hints->mode = FI_CONTEXT; - hints->ep_attr->type = FI_EP_RDM; // Reliable Datagram - hints->caps = FI_TAGGED | FI_MSG | FI_REMOTE_COMM | FI_DIRECTED_RECV | FI_LOCAL_COMM | FI_SOURCE; - hints->tx_attr->msg_order = FI_ORDER_SAS; - hints->rx_attr->msg_order = FI_ORDER_SAS; - hints->domain_attr->control_progress = FI_PROGRESS_AUTO; - hints->domain_attr->data_progress = FI_PROGRESS_AUTO; - hints->domain_attr->caps = - FI_LOCAL_COMM | FI_REMOTE_COMM; // Enable local loopback - hints->domain_attr->av_type = FI_AV_TABLE; - hints->fabric_attr->prov_name = strdup("efa"); - UniqueFabricPtr info; - struct fi_info* info_; - int ret = - fi_getinfo(FABRIC_VERSION, nullptr, nullptr, 0, hints.get(), &info_); - info.reset(info_); - return (ret == 0); -} - -FabricEndpoint::FabricEndpoint(){ - fabric_ctx = std::make_unique(); -} - -int FabricEndpoint::poll_cq(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count){ - int ret = fi_cq_readfrom(fabric_ctx->cq.get(), cq_entries, count, src_addrs); - return ret; -} - -int FabricEndpoint::post_send(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, void* context){ - int ret = fi_tsend(fabric_ctx->ep.get(), buffer, size, nullptr, dest_addr, tag, context); - if (ret < 0 && ret != -FI_EAGAIN) { - TP_CHECK_EFA_RET(ret, "Unable to do fi_tsend message"); - } - return ret; -} - -int FabricEndpoint::post_recv(void* buffer, size_t size, uint64_t tag, fi_addr_t dest_addr, uint64_t ignore, void* context){ - int ret = fi_trecv(fabric_ctx->ep.get(), buffer, size, nullptr, dest_addr, tag, ignore, context); - if (ret < 0 && ret != -FI_EAGAIN) { - TP_CHECK_EFA_RET(ret, "Unable to do fi_trecv message"); - } - return ret; -} - - -fi_addr_t FabricEndpoint::addPeerAddr(FabricAddr* addr){ - fi_addr_t peer_addr; - // TP_LOG_WARNING() << "Add: " << 
addr->DebugStr(); - int ret = - fi_av_insert(fabric_ctx->av.get(), addr->name, 1, &peer_addr, 0, nullptr); - TP_DCHECK_EQ(ret, 1); - TP_CHECK_EFA_RET(ret, "Unable to add address to endpoint"); - return peer_addr; -}; - - void FabricEndpoint::removePeerAddr(fi_addr_t peer_addr){ - int ret = - fi_av_remove(fabric_ctx->av.get(), &peer_addr, 1, 0); - TP_DCHECK_EQ(ret, 0); - }; - -} // namespace tensorpipe \ No newline at end of file diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index ff5e97ea0..303a9e9c0 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -9,7 +9,6 @@ #ifndef TENSORPIPE_COMMON_EFA_H_ #define TENSORPIPE_COMMON_EFA_H_ - #include #include #include @@ -19,64 +18,63 @@ #include #include -#include -#include #include +#include #include #include -static const int FABRIC_VERSION = FI_VERSION(1, 10); -static const int kMaxConcurrentWorkRequest = 4224; - namespace tensorpipe { -#define TP_CHECK_EFA_RET(ret, msg) \ - do { \ +static const int FABRIC_VERSION = FI_VERSION(1, 10); + +#define TP_CHECK_EFA_RET(ret, msg) \ + do { \ if (ret < 0) { \ - TP_THROW_ASSERT() << msg << ". Return Code: " << ret \ - << ". ERROR: " << fi_strerror(-ret); \ - } \ + TP_THROW_ASSERT() << msg << ". 
Return Code: " << ret; \ + } \ } while (false) struct FabricDeleter { void operator()(fi_info* info) { if (info) - fi_freeinfo(info); + efaLib->fi_freeinfo_op(info); } void operator()(fid* fid) { if (fid) - fi_close(fid); + efaLib->fi_close_op(fid); } void operator()(fid_domain* fid) { if (fid) - fi_close((fid_t)fid); + efaLib->fi_close_op((fid_t)fid); } void operator()(fid_fabric* fid) { if (fid) - fi_close((fid_t)fid); + efaLib->fi_close_op((fid_t)fid); } void operator()(fid_cq* fid) { if (fid) - fi_close((fid_t)fid); + efaLib->fi_close_op((fid_t)fid); } void operator()(fid_av* fid) { if (fid) - fi_close((fid_t)fid); + efaLib->fi_close_op((fid_t)fid); } void operator()(fid_ep* fid) { if (fid) - fi_close((fid_t)fid); + efaLib->fi_close_op((fid_t)fid); } void operator()(fid_eq* fid) { if (fid) - fi_close((fid_t)fid); + efaLib->fi_close_op((fid_t)fid); } + + EfaLib* efaLib; }; template using UniqueFabricPtr = std::unique_ptr; -struct FabricAddr { +struct EfaAddress { // endpoint name char name[64] = {}; // length of endpoint name @@ -107,48 +105,159 @@ struct FabricAddr { } }; -class FabricContext { - public: - // fabric top-level object - UniqueFabricPtr fabric; - // domains which maps to a specific local network interface adapter - UniqueFabricPtr domain; - // completion queue - UniqueFabricPtr cq; - // address vector - UniqueFabricPtr av; - // the endpoint - UniqueFabricPtr ep; - // endpoint name - struct FabricAddr addr; - // readable endpoint name - struct FabricAddr readable_addr; +inline EfaLib::device* getEfaDevices(EfaLib& efaLib) { + EfaLib::device* hints = efaLib.fi_allocinfo_op(); + hints->mode = FI_CONTEXT; + hints->ep_attr->type = FI_EP_RDM; // Reliable Datagram + hints->caps = FI_TAGGED | FI_MSG | FI_REMOTE_COMM | FI_DIRECTED_RECV | + FI_LOCAL_COMM | FI_SOURCE; + hints->tx_attr->msg_order = FI_ORDER_SAS; + hints->rx_attr->msg_order = FI_ORDER_SAS; + hints->domain_attr->control_progress = FI_PROGRESS_AUTO; + hints->domain_attr->data_progress = 
FI_PROGRESS_AUTO; + hints->domain_attr->caps = + FI_LOCAL_COMM | FI_REMOTE_COMM; // Enable local loopback + hints->domain_attr->av_type = FI_AV_TABLE; + hints->fabric_attr->prov_name = strdup("efa"); + UniqueFabricPtr info; + // info. + struct fi_info* info_; + int ret = + efaLib.fi_getinfo_op(FABRIC_VERSION, nullptr, nullptr, 0, hints, &info_); + return info_; +} - public: - explicit FabricContext(); +using EfaFabric = UniqueFabricPtr; +inline EfaFabric createEfaFabric(EfaLib& efaLib, EfaLib::device* info) { + struct fid_fabric* fabric_; + int ret = efaLib.fi_fabric_op(info->fabric_attr, &fabric_, nullptr); + TP_CHECK_EFA_RET(ret, "Couldn't open a fabric provider"); + return EfaFabric(fabric_, FabricDeleter{&efaLib}); +} + +using EfaDomain = UniqueFabricPtr; +inline EfaDomain createEfaDomain( + EfaLib& efaLib, + EfaFabric& fabric, + EfaLib::device* info) { + struct fid_domain* domain_; + int ret = efaLib.fi_domain_op(fabric.get(), info, &domain_, nullptr); + TP_CHECK_EFA_RET(ret, "Couldn't open a fabric access domain"); + return EfaDomain(domain_, FabricDeleter{&efaLib}); +} + +using EfaEndpoint = UniqueFabricPtr; +inline EfaEndpoint createEfaEndpoint( + EfaLib& efaLib, + EfaDomain& domain, + EfaLib::device* info) { + struct fid_ep* ep_; + int ret = efaLib.fi_endpoint_op(domain.get(), info, &ep_, nullptr); + TP_CHECK_EFA_RET(ret, "Couldn't allocate endpoint"); + return EfaEndpoint(ep_, FabricDeleter{&efaLib}); +} + +using EfaCompletionQueue = UniqueFabricPtr; +inline EfaCompletionQueue createEfaCompletionQueue( + EfaLib& efaLib, + EfaDomain& domain, + EfaLib::device* info) { + struct fid_cq* cq_; + struct fi_cq_attr cq_attr = {}; + cq_attr.format = FI_CQ_FORMAT_TAGGED; + cq_attr.size = info->rx_attr->size; + int ret = efaLib.fi_cq_open_op(domain.get(), &cq_attr, &cq_, nullptr); + TP_CHECK_EFA_RET(ret, "Couldn't open CQ"); + return EfaCompletionQueue(cq_, FabricDeleter{&efaLib}); +} + +using EfaAdressVector = UniqueFabricPtr; +inline EfaAdressVector 
createEfaAdressVector( + EfaLib& efaLib, + EfaDomain& domain) { + struct fi_av_attr av_attr = {}; + struct fid_av* av_; + int ret = efaLib.fi_av_open_op(domain.get(), &av_attr, &av_, nullptr); + TP_CHECK_EFA_RET(ret, "Couldn't open AV"); + return EfaAdressVector(av_, FabricDeleter{&efaLib}); +} + +inline EfaAddress enableEndpoint( + EfaLib& efaLib, + EfaEndpoint& ep, + EfaAdressVector& av, + EfaCompletionQueue& cq) { + // fi_ep_bind: bind CQ and AV to the endpoint + int ret; + ret = efaLib.fi_ep_bind_op(ep.get(), (fid_t)cq.get(), FI_RECV | FI_TRANSMIT); + TP_CHECK_EFA_RET(ret, "Couldn't bind EP-CQ"); + ret = efaLib.fi_ep_bind_op(ep.get(), (fid_t)av.get(), 0); + TP_CHECK_EFA_RET(ret, "Couldn't bind EP-AV"); + // fi_enable: enable endpoint for communication + ret = efaLib.fi_enable_op(ep.get()); + TP_CHECK_EFA_RET(ret, "Couldn't enable endpoint"); + + // fi_getname: get endpoint name + EfaAddress addr; + ret = efaLib.fi_getname_op((fid_t)ep.get(), addr.name, &addr.len); + TP_CHECK_EFA_RET(ret, "Call to fi_getname() failed"); + return addr; +} + +class EfaDeviceList { private: - UniqueFabricPtr getFabricInfo(); -}; + EfaDeviceList(EfaLib& efaLib, EfaLib::device* ptr, int size) + : deviceList_(ptr, Deleter{&efaLib}), size_(size) {} -class FabricEndpoint { public: - FabricEndpoint(); + EfaDeviceList() = default; - fi_addr_t addPeerAddr(FabricAddr* addr); - void removePeerAddr(fi_addr_t peer_addr); + static std::tuple create(EfaLib& efaLib) { + int size; + EfaLib::device* ptr = getEfaDevices(efaLib); + EfaLib::device* first_ptr = ptr; + if (ptr == nullptr) { + return std::make_tuple( + TP_CREATE_ERROR(SystemError, "fi_getinfo", -1), EfaDeviceList()); + } + size = 1; + while (ptr->next != nullptr) { + ptr = ptr->next; + size++; + }; + return std::make_tuple( + Error::kSuccess, EfaDeviceList(efaLib, first_ptr, size)); + } + + int size() { + return size_; + } - int post_send(void* buffer, size_t size, uint64_t tag, fi_addr_t dst_addr, void* context = nullptr); - int 
post_recv(void* buffer, size_t size, uint64_t tag, fi_addr_t src_addr, uint64_t ignore, void* context = nullptr); + EfaLib::device& operator[](int i) { + EfaLib::device* ptr = deviceList_.get(); + for (int j = 0; j < i; j++) { + ptr = ptr->next; + } + return *ptr; + } - int poll_cq(struct fi_cq_tagged_entry* cq_entries, fi_addr_t* src_addrs, size_t count); + void reset() { + deviceList_.reset(); + } - static bool isEfaAvailable(); + private: + struct Deleter { + void operator()(EfaLib::device* ptr) { + efaLib->fi_freeinfo_op(ptr); + } - // Fabric Context contains everything - std::unique_ptr fabric_ctx; + EfaLib* efaLib; + }; + std::unique_ptr deviceList_; + int size_; }; -} // namespace tensorpipe +} // namespace tensorpipe #endif // TENSORPIPE_COMMON_EFA_H_ diff --git a/tensorpipe/common/efa_lib.h b/tensorpipe/common/efa_lib.h index 1e299e024..311b77c8b 100644 --- a/tensorpipe/common/efa_lib.h +++ b/tensorpipe/common/efa_lib.h @@ -8,25 +8,69 @@ #pragma once -#include - +#include +#include +#include +#include +#include +#include #include #include +#include namespace tensorpipe { -// Wrapper for libibverbs. +#define TP_FORALL_FABRIC_SYMBOLS(_) \ + _(fi_freeinfo) \ + _(fi_allocinfo) \ + _(fi_fabric) \ + _(fi_domain) \ + _(fi_strerror) \ + _(fi_av_open) \ + _(fi_cq_open) \ + _(fi_endpoint) \ + _(fi_ep_bind) \ + _(fi_enable) \ + _(fi_getname) \ + _(fi_cq_readfrom) \ + _(fi_tsendmsg) \ + _(fi_trecvmsg) \ + _(fi_av_insert) \ + _(fi_av_remove) \ + _(fi_close) \ + _(fi_getinfo) \ + _(fi_av_straddr) + +// Wrapper for libfabric. 
class EfaLib { public: + using device = struct fi_info; + private: explicit EfaLib(DynamicLibraryHandle dlhandle) : dlhandle_(std::move(dlhandle)) {} DynamicLibraryHandle dlhandle_; + + decltype(&fi_allocinfo) fi_allocptr = nullptr; + +#define TP_DECLARE_FIELD(function_name) \ + decltype(&function_name) function_name##_ptr_ = nullptr; + TP_FORALL_FABRIC_SYMBOLS(TP_DECLARE_FIELD) +#undef TP_DECLARE_FIELD + public: EfaLib() = default; +#define TP_FORWARD_CALL(function_name) \ + template \ + auto function_name##_op(Args&&... args) { \ + return (*function_name##_ptr_)(std::forward(args)...); \ + } + TP_FORALL_FABRIC_SYMBOLS(TP_FORWARD_CALL) +#undef TP_FORWARD_CALL + static std::tuple create() { Error error; DynamicLibraryHandle dlhandle; @@ -51,10 +95,21 @@ class EfaLib { return "Found shared library libfabric.so at " + filename; }(); EfaLib lib(std::move(dlhandle)); +#define TP_LOAD_SYMBOL(function_name) \ + { \ + void* ptr; \ + std::tie(error, ptr) = lib.dlhandle_.loadSymbol(function_name); \ + if (error) { \ + return std::make_tuple(std::move(error), EfaLib()); \ + } \ + TP_THROW_ASSERT_IF(ptr == nullptr); \ + lib.function_name##_ptr_ = \ + reinterpret_cast(ptr); \ + } \ + TP_FORALL_FABRIC_SYMBOLS(TP_LOAD_SYMBOL) +#undef TP_LOAD_SYMBOL return std::make_tuple(Error::kSuccess, std::move(lib)); } - }; - } // namespace tensorpipe diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index ee858cfb1..09a22a895 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -57,7 +57,6 @@ void ConnectionImpl::initImplFromLoop() { context_->enroll(*this); Error error; // The connection either got a socket or an address, but not both. 
- endpoint = context_->getReactor().endpoint; TP_DCHECK(socket_.hasValue() ^ sockaddr_.has_value()); if (!socket_.hasValue()) { @@ -170,7 +169,7 @@ void ConnectionImpl::handleEventsFromLoop(int events) { void ConnectionImpl::handleEventInFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == RECV_ADDR) { - struct FabricAddr addr; + struct EfaAddress addr; // auto x = &addr.name; auto err = socket_.read(addr.name, sizeof(addr.name)); // Crossing our fingers that the exchange information is small enough that @@ -180,7 +179,7 @@ void ConnectionImpl::handleEventInFromLoop() { return; } - peer_addr = endpoint->addPeerAddr(&addr); + peer_addr = context_->getReactor().addPeerAddr(addr); // The connection is usable now. context_->getReactor().registerHandler(peer_addr, shared_from_this()); @@ -208,7 +207,7 @@ void ConnectionImpl::handleEventInFromLoop() { void ConnectionImpl::handleEventOutFromLoop() { TP_DCHECK(context_->inLoop()); if (state_ == SEND_ADDR) { - FabricAddr addr = endpoint->fabric_ctx->addr; + EfaAddress addr = context_->getReactor().getEfaAddress(); auto err = socket_.write(reinterpret_cast(addr.name), sizeof(addr.name)); // Crossing our fingers that the exchange information is small enough that @@ -346,7 +345,7 @@ void ConnectionImpl::cleanup() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " << id_ << " is cleaning up"; context_->getReactor().unregisterHandler(peer_addr); - endpoint->removePeerAddr(peer_addr); + context_->getReactor().removePeerAddr(peer_addr); } } // namespace efa diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h index a93f1fc07..453987eb0 100644 --- a/tensorpipe/transport/efa/connection_impl.h +++ b/tensorpipe/transport/efa/connection_impl.h @@ -97,7 +97,7 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< State state_{INITIALIZING}; Socket socket_; optional sockaddr_; - std::shared_ptr endpoint; + fi_addr_t peer_addr; uint32_t sendIdx = 0; diff --git 
a/tensorpipe/transport/efa/context_impl.cc b/tensorpipe/transport/efa/context_impl.cc index be139c497..e738f07b5 100644 --- a/tensorpipe/transport/efa/context_impl.cc +++ b/tensorpipe/transport/efa/context_impl.cc @@ -45,45 +45,25 @@ std::shared_ptr ContextImpl::create() { return nullptr; } - // bool isEfaAvailable = FabricEndpoint::isEfaAvailable(); - if (!FabricEndpoint::isEfaAvailable()){ - TP_VLOG(7) - << "libfabric cannot find efa provider."; + EfaDeviceList deviceList; + std::tie(error, deviceList) = EfaDeviceList::create(efaLib); + if (error) { + TP_VLOG(7) << "EFA transport is not viable because it couldn't find any" + << "EFA devices"; return nullptr; } + TP_THROW_ASSERT_IF(error) + << "Couldn't get list of EFA devices: " << error.what(); - // efaDeviceList deviceList; - // std::tie(error, deviceList) = efaDeviceList::create(efaLib); - // if (error && error.isOfType() && - // error.castToType()->errorCode() == ENOSYS) { - // TP_VLOG(7) << "efa transport is not viable because it couldn't get list of " - // << "InfiniBand devices because the kernel module isn't loaded"; - // return nullptr; - // } - // TP_THROW_ASSERT_IF(error) - // << "Couldn't get list of InfiniBand devices: " << error.what(); - - // if (deviceList.size() == 0) { - // TP_VLOG(7) << "efa transport is not viable because it couldn't find any " - // << "InfiniBand NICs"; - // return nullptr; - // } - - // return std::make_shared( - // std::move(efaLib), std::move(deviceList)); - return std::make_shared(); + return std::make_shared(std::move(efaLib), std::move(deviceList)); } -ContextImpl::ContextImpl() +ContextImpl::ContextImpl(EfaLib efaLib, EfaDeviceList deviceList) : ContextImplBoilerplate( - generateDomainDescriptor()) { - } + generateDomainDescriptor()), + reactor_(std::move(efaLib), std::move(deviceList)) {} -// ContextImpl::ContextImpl(efaLib efaLib, efaDeviceList deviceList) -// : ContextImplBoilerplate( -// generateDomainDescriptor()), -// reactor_(std::move(efaLib), 
std::move(deviceList)) {} void ContextImpl::handleErrorImpl() { loop_.close(); diff --git a/tensorpipe/transport/efa/context_impl.h b/tensorpipe/transport/efa/context_impl.h index f2267e142..813ce0c6b 100644 --- a/tensorpipe/transport/efa/context_impl.h +++ b/tensorpipe/transport/efa/context_impl.h @@ -29,7 +29,7 @@ class ContextImpl final public: static std::shared_ptr create(); -// ContextImpl(efaLib efaLib, efaDeviceList deviceList); + ContextImpl(EfaLib efaLib, EfaDeviceList deviceList); ContextImpl(); // Implement the DeferredExecutor interface. diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index 739530046..a1a50b80c 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -16,9 +16,16 @@ namespace tensorpipe { namespace transport { namespace efa { -Reactor::Reactor() { - // postRecvRequests(kNumPendingRecvReqs); - endpoint = std::make_shared(); +Reactor::Reactor(EfaLib efaLib, EfaDeviceList efaDeviceList) { + efaLib_ = std::move(efaLib); + // AWS p4d instances may have multiple EFAs. 
Only use device 0 for now + EfaLib::device* device = &efaDeviceList[0]; + fabric_ = createEfaFabric(efaLib, device); + domain_ = createEfaDomain(efaLib, fabric_, device); + ep_ = createEfaEndpoint(efaLib, domain_, device); + av_ = createEfaAdressVector(efaLib, domain_); + cq_ = createEfaCompletionQueue(efaLib, domain_, device); + addr_ = enableEndpoint(efaLib, ep_, av_, cq_); startThread("TP_efa_reactor"); } @@ -28,7 +35,13 @@ void Reactor::postSend( uint64_t tag, fi_addr_t peer_addr, void* context) { - pendingSends_.push_back({buffer, size, tag, peer_addr, context}); + pendingSends_.emplace_back(EfaEvent(new + fi_msg_tagged{ + .msg_iov = new iovec{.iov_base = buffer, .iov_len = size}, + .iov_count = 1, + .addr = peer_addr, + .tag = tag, + .context = context})); postPendingRecvs(); } @@ -39,21 +52,21 @@ void Reactor::postRecv( fi_addr_t dest_addr, uint64_t ignore, void* context) { - // First try send all messages in pending queue - pendingRecvs_.push_back({buffer, size, tag, dest_addr, ignore, context}); + pendingRecvs_.emplace_back(EfaEvent(new + fi_msg_tagged{ + .msg_iov = new iovec{.iov_base = buffer, .iov_len = size}, + .iov_count = 1, + .addr = dest_addr, + .tag = tag, + .ignore = ignore, + .context = context})); postPendingRecvs(); } int Reactor::postPendingSends() { - // TP_LOG_WARNING() << "Post send event"; while (!pendingSends_.empty()) { - EFASendEvent sevent = pendingSends_.front(); - int ret = this->endpoint->post_send( - sevent.buffer, - sevent.size, - sevent.tag, - sevent.peer_addr, - sevent.context); // ignore low 32 bits on tag matching + fi_msg_tagged* sevent = pendingSends_.front().get(); + int ret = efaLib_.fi_tsendmsg_op(ep_.get(), sevent, 0); if (ret == 0) { // Send successfully, pop out events pendingSends_.pop_front(); @@ -68,16 +81,24 @@ int Reactor::postPendingSends() { return 0; } +fi_addr_t Reactor::addPeerAddr(EfaAddress& addr) { + fi_addr_t peer_addr; + int ret = + efaLib_.fi_av_insert_op(av_.get(), addr.name, 1, &peer_addr, 0, 
nullptr); + TP_THROW_ASSERT_IF(ret != 1) << "Unable to add address to endpoint"; + TP_CHECK_EFA_RET(ret, "Unable to add address to endpoint"); + return peer_addr; +} + +void Reactor::removePeerAddr(fi_addr_t faddr) { + int ret = efaLib_.fi_av_remove_op(av_.get(), &faddr, 1, 0); + TP_CHECK_EFA_RET(ret, "Unable to remove address from endpoint"); +}; + int Reactor::postPendingRecvs() { while (!pendingRecvs_.empty()) { - EFARecvEvent revent = pendingRecvs_.front(); - int ret = this->endpoint->post_recv( - revent.buffer, - revent.size, - revent.tag, - revent.peer_addr, - revent.ignore, - revent.context); // ignore low 32 bits on tag matching + fi_msg_tagged* revent = pendingRecvs_.front().get(); + int ret = efaLib_.fi_trecvmsg_op(ep_.get(), revent, 0); if (ret == 0) { // Send successfully, pop out events pendingRecvs_.pop_front(); @@ -119,10 +140,8 @@ bool Reactor::pollOnce() { postPendingSends(); postPendingRecvs(); - - auto rv = - endpoint->poll_cq(cq_entries.data(), src_addrs.data(), cq_entries.size()); - + int rv = efaLib_.fi_cq_readfrom_op( + cq_.get(), cq_entries.data(), cq_entries.size(), src_addrs.data()); if (rv == 0 || rv == -FI_EAGAIN) { return false; } else { @@ -186,11 +205,13 @@ bool Reactor::readyToClose() { return efaEventHandler_.size() == 0; } -void Reactor::registerHandler(fi_addr_t peer_addr, std::shared_ptr eventHandler) { +void Reactor::registerHandler( + fi_addr_t peer_addr, + std::shared_ptr eventHandler) { efaEventHandler_.emplace(peer_addr, std::move(eventHandler)); } -void Reactor::unregisterHandler(fi_addr_t peer_addr){ +void Reactor::unregisterHandler(fi_addr_t peer_addr) { efaEventHandler_.erase(peer_addr); } diff --git a/tensorpipe/transport/efa/reactor.h b/tensorpipe/transport/efa/reactor.h index ac96ccbc0..f465cb9d0 100644 --- a/tensorpipe/transport/efa/reactor.h +++ b/tensorpipe/transport/efa/reactor.h @@ -19,8 +19,8 @@ #include #include -#include #include +#include #include #include @@ -37,7 +37,7 @@ class efaEventHandler { virtual 
~efaEventHandler() = default; }; -enum efaTag: uint64_t{ +enum efaTag : uint64_t { kLength = 1ULL << 32, kPayload = 1ULL << 33, }; @@ -52,37 +52,48 @@ enum efaTag: uint64_t{ // class Reactor final : public BusyPollingLoop { public: - // Reactor(efaLib efaLib, efaDeviceList deviceList); - Reactor(); - - // const efaLib& getefaLib() { - // return efaLib_; - // } + Reactor(EfaLib efaLib, EfaDeviceList efaDeviceList); - // efaProtectionDomain& getefaPd() { - // return pd_; - // } + const EfaLib& getefaLib() { + return efaLib_; + } - // efaCompletionQueue& getefaCq() { - // return cq_; - // } + EfaDomain& getefaDomain() { + return domain_; + } - // efaSharedReceiveQueue& getefaSrq() { - // return srq_; - // } + EfaCompletionQueue& getefaCq() { + return cq_; + } - // const efaAddress& getefaAddress() { - // return addr_; - // } + const EfaAddress& getEfaAddress() { + return addr_; + } - void registerHandler(fi_addr_t peer_addr, std::shared_ptr eventHandler); + void registerHandler( + fi_addr_t peer_addr, + std::shared_ptr eventHandler); void unregisterHandler(fi_addr_t peer_addr); - void postSend(void* buffer, size_t size, uint64_t tag, fi_addr_t peer_addr, void* context); + void postSend( + void* buffer, + size_t size, + uint64_t tag, + fi_addr_t peer_addr, + void* context); - void postRecv(void* buffer, size_t size, uint64_t tag, fi_addr_t peer_addr, uint64_t ignore, void* context); - // void postAck(efaQueuePair& qp, efaLib::send_wr& wr); + void postRecv( + void* buffer, + size_t size, + uint64_t tag, + fi_addr_t peer_addr, + uint64_t ignore, + void* context); + + fi_addr_t addPeerAddr(EfaAddress& addr); + + void removePeerAddr(fi_addr_t faddr); void setId(std::string id); @@ -92,49 +103,34 @@ class Reactor final : public BusyPollingLoop { ~Reactor(); - - std::shared_ptr endpoint; - protected: bool pollOnce() override; bool readyToClose() override; - struct EFASendEvent{ - void* buffer; - size_t size; - uint64_t tag; - fi_addr_t peer_addr; - void* context; + class 
EfaEventDeleter { + public: + void operator()(fi_msg_tagged* msg) { + delete msg->msg_iov; + } }; + using EfaEvent = std::unique_ptr; - struct EFARecvEvent{ - void* buffer; - size_t size; - uint64_t tag; - fi_addr_t peer_addr; - uint64_t ignore; - void* context; - }; private: - // InfiniBand stuff - // const efaLib efaLib_; - // efaContext ctx_; - // efaProtectionDomain pd_; - // efaCompletionQueue cq_; - // efaSharedReceiveQueue srq_; - // efaAddress addr_; + EfaLib efaLib_; + EfaFabric fabric_; + EfaDomain domain_; + EfaEndpoint ep_; + EfaCompletionQueue cq_; + EfaAdressVector av_; + EfaAddress addr_; + int postPendingRecvs(); int postPendingSends(); - - // void postRecvRequests(int num); - std::atomic closed_{false}; std::atomic joined_{false}; - std::array size_buffer; - // An identifier for the context, composed of the identifier for the context, // combined with the transport's name. It will only be used for logging and // debugging purposes. @@ -144,11 +140,8 @@ class Reactor final : public BusyPollingLoop { std::unordered_map> efaEventHandler_; - // uint32_t numAvailableWrites_{kNumPendingWriteReqs}; - // uint32_t numAvailableAcks_{kNumPendingAckReqs}; - std::deque pendingSends_; - std::deque pendingRecvs_; - // std::deque> pendingQpAcks_; + std::deque pendingSends_; + std::deque pendingRecvs_; }; } // namespace efa From efb8aa20badc424b36cfe95c3bc21687db34e77f Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 17 Sep 2021 14:59:12 +0100 Subject: [PATCH 10/19] fix --- tensorpipe/CMakeLists.txt | 1 - tensorpipe/common/efa.h | 35 ++++++++++++----------- tensorpipe/common/efa_lib.h | 43 ++++++++++------------------- tensorpipe/transport/efa/reactor.cc | 10 +++---- 4 files changed, 37 insertions(+), 52 deletions(-) diff --git a/tensorpipe/CMakeLists.txt b/tensorpipe/CMakeLists.txt index 4397e9761..acbdc6188 100644 --- a/tensorpipe/CMakeLists.txt +++ b/tensorpipe/CMakeLists.txt @@ -170,7 +170,6 @@ tp_conditional_backend( TP_ENABLE_EFA "Enable EFA transport" 
"LINUX") if(TP_ENABLE_EFA) list(APPEND TP_SRCS - common/efa.cc transport/efa/connection_impl.cc transport/efa/context_impl.cc transport/efa/error.cc diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index 303a9e9c0..8e218dcda 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -41,31 +41,31 @@ struct FabricDeleter { } void operator()(fid* fid) { if (fid) - efaLib->fi_close_op(fid); + fi_close(fid); } void operator()(fid_domain* fid) { if (fid) - efaLib->fi_close_op((fid_t)fid); + fi_close((fid_t)fid); } void operator()(fid_fabric* fid) { if (fid) - efaLib->fi_close_op((fid_t)fid); + fi_close((fid_t)fid); } void operator()(fid_cq* fid) { if (fid) - efaLib->fi_close_op((fid_t)fid); + fi_close((fid_t)fid); } void operator()(fid_av* fid) { if (fid) - efaLib->fi_close_op((fid_t)fid); + fi_close((fid_t)fid); } void operator()(fid_ep* fid) { if (fid) - efaLib->fi_close_op((fid_t)fid); + fi_close((fid_t)fid); } void operator()(fid_eq* fid) { if (fid) - efaLib->fi_close_op((fid_t)fid); + fi_close((fid_t)fid); } EfaLib* efaLib; @@ -106,7 +106,7 @@ struct EfaAddress { }; inline EfaLib::device* getEfaDevices(EfaLib& efaLib) { - EfaLib::device* hints = efaLib.fi_allocinfo_op(); + EfaLib::device* hints = efaLib.fi_dupinfo_op((const fi_info*)NULL); hints->mode = FI_CONTEXT; hints->ep_attr->type = FI_EP_RDM; // Reliable Datagram hints->caps = FI_TAGGED | FI_MSG | FI_REMOTE_COMM | FI_DIRECTED_RECV | @@ -119,7 +119,6 @@ inline EfaLib::device* getEfaDevices(EfaLib& efaLib) { FI_LOCAL_COMM | FI_REMOTE_COMM; // Enable local loopback hints->domain_attr->av_type = FI_AV_TABLE; hints->fabric_attr->prov_name = strdup("efa"); - UniqueFabricPtr info; // info. 
struct fi_info* info_; int ret = @@ -141,7 +140,7 @@ inline EfaDomain createEfaDomain( EfaFabric& fabric, EfaLib::device* info) { struct fid_domain* domain_; - int ret = efaLib.fi_domain_op(fabric.get(), info, &domain_, nullptr); + int ret = fi_domain(fabric.get(), info, &domain_, nullptr); TP_CHECK_EFA_RET(ret, "Couldn't open a fabric access domain"); return EfaDomain(domain_, FabricDeleter{&efaLib}); } @@ -152,7 +151,7 @@ inline EfaEndpoint createEfaEndpoint( EfaDomain& domain, EfaLib::device* info) { struct fid_ep* ep_; - int ret = efaLib.fi_endpoint_op(domain.get(), info, &ep_, nullptr); + int ret = fi_endpoint(domain.get(), info, &ep_, nullptr); TP_CHECK_EFA_RET(ret, "Couldn't allocate endpoint"); return EfaEndpoint(ep_, FabricDeleter{&efaLib}); } @@ -166,7 +165,7 @@ inline EfaCompletionQueue createEfaCompletionQueue( struct fi_cq_attr cq_attr = {}; cq_attr.format = FI_CQ_FORMAT_TAGGED; cq_attr.size = info->rx_attr->size; - int ret = efaLib.fi_cq_open_op(domain.get(), &cq_attr, &cq_, nullptr); + int ret = fi_cq_open(domain.get(), &cq_attr, &cq_, nullptr); TP_CHECK_EFA_RET(ret, "Couldn't open CQ"); return EfaCompletionQueue(cq_, FabricDeleter{&efaLib}); } @@ -176,8 +175,8 @@ inline EfaAdressVector createEfaAdressVector( EfaLib& efaLib, EfaDomain& domain) { struct fi_av_attr av_attr = {}; - struct fid_av* av_; - int ret = efaLib.fi_av_open_op(domain.get(), &av_attr, &av_, nullptr); + struct fid_av* av_; + int ret = fi_av_open(domain.get(), &av_attr, &av_, nullptr); TP_CHECK_EFA_RET(ret, "Couldn't open AV"); return EfaAdressVector(av_, FabricDeleter{&efaLib}); } @@ -189,18 +188,18 @@ inline EfaAddress enableEndpoint( EfaCompletionQueue& cq) { // fi_ep_bind: bind CQ and AV to the endpoint int ret; - ret = efaLib.fi_ep_bind_op(ep.get(), (fid_t)cq.get(), FI_RECV | FI_TRANSMIT); + ret = fi_ep_bind(ep.get(), (fid_t)cq.get(), FI_RECV | FI_TRANSMIT); TP_CHECK_EFA_RET(ret, "Couldn't bind EP-CQ"); - ret = efaLib.fi_ep_bind_op(ep.get(), (fid_t)av.get(), 0); + ret = 
fi_ep_bind(ep.get(), (fid_t)av.get(), 0); TP_CHECK_EFA_RET(ret, "Couldn't bind EP-AV"); // fi_enable: enable endpoint for communication - ret = efaLib.fi_enable_op(ep.get()); + ret = fi_enable(ep.get()); TP_CHECK_EFA_RET(ret, "Couldn't enable endpoint"); // fi_getname: get endpoint name EfaAddress addr; - ret = efaLib.fi_getname_op((fid_t)ep.get(), addr.name, &addr.len); + ret = fi_getname((fid_t)ep.get(), addr.name, &addr.len); TP_CHECK_EFA_RET(ret, "Call to fi_getname() failed"); return addr; } diff --git a/tensorpipe/common/efa_lib.h b/tensorpipe/common/efa_lib.h index 311b77c8b..a2293c39c 100644 --- a/tensorpipe/common/efa_lib.h +++ b/tensorpipe/common/efa_lib.h @@ -22,24 +22,10 @@ namespace tensorpipe { #define TP_FORALL_FABRIC_SYMBOLS(_) \ _(fi_freeinfo) \ - _(fi_allocinfo) \ + _(fi_dupinfo) \ _(fi_fabric) \ - _(fi_domain) \ _(fi_strerror) \ - _(fi_av_open) \ - _(fi_cq_open) \ - _(fi_endpoint) \ - _(fi_ep_bind) \ - _(fi_enable) \ - _(fi_getname) \ - _(fi_cq_readfrom) \ - _(fi_tsendmsg) \ - _(fi_trecvmsg) \ - _(fi_av_insert) \ - _(fi_av_remove) \ - _(fi_close) \ - _(fi_getinfo) \ - _(fi_av_straddr) + _(fi_getinfo) // Wrapper for libfabric. 
@@ -80,6 +66,7 @@ class EfaLib { std::tie(error, dlhandle) = DynamicLibraryHandle::create("libfabric.so", RTLD_LOCAL | RTLD_LAZY); if (error) { + TP_LOG_WARNING() << "Load so fail"; return std::make_tuple(std::move(error), EfaLib()); } // Log at level 9 as we can't know whether this will be used in a transport @@ -95,18 +82,18 @@ class EfaLib { return "Found shared library libfabric.so at " + filename; }(); EfaLib lib(std::move(dlhandle)); -#define TP_LOAD_SYMBOL(function_name) \ - { \ - void* ptr; \ - std::tie(error, ptr) = lib.dlhandle_.loadSymbol(function_name); \ - if (error) { \ - return std::make_tuple(std::move(error), EfaLib()); \ - } \ - TP_THROW_ASSERT_IF(ptr == nullptr); \ - lib.function_name##_ptr_ = \ - reinterpret_cast(ptr); \ - } \ - TP_FORALL_FABRIC_SYMBOLS(TP_LOAD_SYMBOL) +#define TP_LOAD_SYMBOL(function_name) \ + { \ + void* ptr; \ + std::tie(error, ptr) = lib.dlhandle_.loadSymbol(#function_name); \ + if (error) { \ + return std::make_tuple(std::move(error), EfaLib()); \ + } \ + TP_THROW_ASSERT_IF(ptr == nullptr); \ + lib.function_name##_ptr_ = \ + reinterpret_cast(ptr); \ + } + TP_FORALL_FABRIC_SYMBOLS(TP_LOAD_SYMBOL) #undef TP_LOAD_SYMBOL return std::make_tuple(Error::kSuccess, std::move(lib)); } diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index a1a50b80c..12fc7da3e 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -66,7 +66,7 @@ void Reactor::postRecv( int Reactor::postPendingSends() { while (!pendingSends_.empty()) { fi_msg_tagged* sevent = pendingSends_.front().get(); - int ret = efaLib_.fi_tsendmsg_op(ep_.get(), sevent, 0); + int ret = fi_tsendmsg(ep_.get(), sevent, 0); if (ret == 0) { // Send successfully, pop out events pendingSends_.pop_front(); @@ -84,21 +84,21 @@ int Reactor::postPendingSends() { fi_addr_t Reactor::addPeerAddr(EfaAddress& addr) { fi_addr_t peer_addr; int ret = - efaLib_.fi_av_insert_op(av_.get(), addr.name, 1, &peer_addr, 0, nullptr); 
+ fi_av_insert(av_.get(), addr.name, 1, &peer_addr, 0, nullptr); TP_THROW_ASSERT_IF(ret != 1) << "Unable to add address to endpoint"; TP_CHECK_EFA_RET(ret, "Unable to add address to endpoint"); return peer_addr; } void Reactor::removePeerAddr(fi_addr_t faddr) { - int ret = efaLib_.fi_av_remove_op(av_.get(), &faddr, 1, 0); + int ret = fi_av_remove(av_.get(), &faddr, 1, 0); TP_CHECK_EFA_RET(ret, "Unable to remove address from endpoint"); }; int Reactor::postPendingRecvs() { while (!pendingRecvs_.empty()) { fi_msg_tagged* revent = pendingRecvs_.front().get(); - int ret = efaLib_.fi_trecvmsg_op(ep_.get(), revent, 0); + int ret = fi_trecvmsg(ep_.get(), revent, 0); if (ret == 0) { // Send successfully, pop out events pendingRecvs_.pop_front(); @@ -140,7 +140,7 @@ bool Reactor::pollOnce() { postPendingSends(); postPendingRecvs(); - int rv = efaLib_.fi_cq_readfrom_op( + int rv = fi_cq_readfrom( cq_.get(), cq_entries.data(), cq_entries.size(), src_addrs.data()); if (rv == 0 || rv == -FI_EAGAIN) { return false; From 30b0fc9e33d7409ae77fb66319cfadcd3408636f Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 17 Sep 2021 17:54:51 +0100 Subject: [PATCH 11/19] fix --- tensorpipe/common/efa.h | 2 +- tensorpipe/transport/efa/reactor.cc | 36 +++++++++++++++-------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index 8e218dcda..c54c36098 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -175,7 +175,7 @@ inline EfaAdressVector createEfaAdressVector( EfaLib& efaLib, EfaDomain& domain) { struct fi_av_attr av_attr = {}; - struct fid_av* av_; + struct fid_av* av_; int ret = fi_av_open(domain.get(), &av_attr, &av_, nullptr); TP_CHECK_EFA_RET(ret, "Couldn't open AV"); return EfaAdressVector(av_, FabricDeleter{&efaLib}); diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index 12fc7da3e..151af4ae2 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ 
b/tensorpipe/transport/efa/reactor.cc @@ -35,13 +35,15 @@ void Reactor::postSend( uint64_t tag, fi_addr_t peer_addr, void* context) { - pendingSends_.emplace_back(EfaEvent(new - fi_msg_tagged{ - .msg_iov = new iovec{.iov_base = buffer, .iov_len = size}, - .iov_count = 1, - .addr = peer_addr, - .tag = tag, - .context = context})); + pendingSends_.emplace_back(EfaEvent((new fi_msg_tagged{ + /* msg_iov */ new iovec{.iov_base = buffer, .iov_len = size}, + /* desc */ 0, + /* iov_count */ 1, + /* peer addr */ peer_addr, + /* tag */ tag, + /* ignore */ 0, + /* context */ context, + /* data */ 0}))); postPendingRecvs(); } @@ -52,14 +54,15 @@ void Reactor::postRecv( fi_addr_t dest_addr, uint64_t ignore, void* context) { - pendingRecvs_.emplace_back(EfaEvent(new - fi_msg_tagged{ - .msg_iov = new iovec{.iov_base = buffer, .iov_len = size}, - .iov_count = 1, - .addr = dest_addr, - .tag = tag, - .ignore = ignore, - .context = context})); + pendingRecvs_.emplace_back(EfaEvent(new fi_msg_tagged{ + /* msg_iov */ new iovec{.iov_base = buffer, .iov_len = size}, + /* desc */ 0, + /* iov_count */ 1, + /* peer addr */ dest_addr, + /* tag */ tag, + /* ignore */ ignore, + /* context */ context, + /* data */ 0})); postPendingRecvs(); } @@ -83,8 +86,7 @@ int Reactor::postPendingSends() { fi_addr_t Reactor::addPeerAddr(EfaAddress& addr) { fi_addr_t peer_addr; - int ret = - fi_av_insert(av_.get(), addr.name, 1, &peer_addr, 0, nullptr); + int ret = fi_av_insert(av_.get(), addr.name, 1, &peer_addr, 0, nullptr); TP_THROW_ASSERT_IF(ret != 1) << "Unable to add address to endpoint"; TP_CHECK_EFA_RET(ret, "Unable to add address to endpoint"); return peer_addr; From a8dca10ae48307fb3da71f7d905d3ac7cae3a809 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 17 Sep 2021 17:57:12 +0100 Subject: [PATCH 12/19] lint --- tensorpipe/transport/efa/connection_impl.h | 1 - tensorpipe/transport/efa/constants.h | 3 +-- tensorpipe/transport/efa/context_impl.cc | 9 ++++----- 
tensorpipe/transport/efa/listener_impl.cc | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h index 453987eb0..4ab9ef0f6 100644 --- a/tensorpipe/transport/efa/connection_impl.h +++ b/tensorpipe/transport/efa/connection_impl.h @@ -37,7 +37,6 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< ConnectionImpl>, public efaEventHandler, public EpollLoop::EventHandler { - enum State { INITIALIZING = 1, SEND_ADDR, diff --git a/tensorpipe/transport/efa/constants.h b/tensorpipe/transport/efa/constants.h index ae03cfa0d..920b40efd 100644 --- a/tensorpipe/transport/efa/constants.h +++ b/tensorpipe/transport/efa/constants.h @@ -42,8 +42,7 @@ constexpr uint32_t kNumPendingWriteReqs = 1024; // will be either the completed receive requests of the SRQ, or the completed // send requests from a connection's queue pair. We can bound the former value // but not the latter, so we try to add some margin. -constexpr int kCompletionQueueSize = - kNumPendingRecvReqs + kNumPendingWriteReqs; +constexpr int kCompletionQueueSize = kNumPendingRecvReqs + kNumPendingWriteReqs; // How many work completions to poll from the completion queue at each reactor // iteration. diff --git a/tensorpipe/transport/efa/context_impl.cc b/tensorpipe/transport/efa/context_impl.cc index e738f07b5..3e14c873f 100644 --- a/tensorpipe/transport/efa/context_impl.cc +++ b/tensorpipe/transport/efa/context_impl.cc @@ -6,10 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include -#include #include +#include #include +#include #include namespace tensorpipe { @@ -55,16 +55,15 @@ std::shared_ptr ContextImpl::create() { TP_THROW_ASSERT_IF(error) << "Couldn't get list of EFA devices: " << error.what(); - return std::make_shared(std::move(efaLib), std::move(deviceList)); + return std::make_shared( + std::move(efaLib), std::move(deviceList)); } - ContextImpl::ContextImpl(EfaLib efaLib, EfaDeviceList deviceList) : ContextImplBoilerplate( generateDomainDescriptor()), reactor_(std::move(efaLib), std::move(deviceList)) {} - void ContextImpl::handleErrorImpl() { loop_.close(); reactor_.close(); diff --git a/tensorpipe/transport/efa/listener_impl.cc b/tensorpipe/transport/efa/listener_impl.cc index ee6b6d147..e14428b47 100644 --- a/tensorpipe/transport/efa/listener_impl.cc +++ b/tensorpipe/transport/efa/listener_impl.cc @@ -17,10 +17,10 @@ #include #include #include -#include #include #include #include +#include namespace tensorpipe { namespace transport { From c0596b63f3999c6232514c2f74c2e3430fc84b69 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 17 Sep 2021 17:59:01 +0100 Subject: [PATCH 13/19] lint --- tensorpipe/benchmark/transport_registry.cc | 1 - tensorpipe/transport/efa/listener_impl.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorpipe/benchmark/transport_registry.cc b/tensorpipe/benchmark/transport_registry.cc index 40c999f08..d033a5755 100644 --- a/tensorpipe/benchmark/transport_registry.cc +++ b/tensorpipe/benchmark/transport_registry.cc @@ -52,7 +52,6 @@ std::shared_ptr makeEfaContext() { TP_REGISTER_CREATOR(TensorpipeTransportRegistry, efa, makeEfaContext); #endif // TENSORPIPE_HAS_EFA_TRANSPORT - void validateTransportContext( std::shared_ptr context) { if (!context) { diff --git a/tensorpipe/transport/efa/listener_impl.h b/tensorpipe/transport/efa/listener_impl.h index a85e6e44a..e4c09ef8b 100644 --- a/tensorpipe/transport/efa/listener_impl.h +++ b/tensorpipe/transport/efa/listener_impl.h @@ 
-17,8 +17,8 @@ #include #include #include -#include #include +#include #include namespace tensorpipe { From f99da62eb6e80826c2d13f05760d1064fb12cfab Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 17 Sep 2021 18:01:18 +0100 Subject: [PATCH 14/19] merge --- tensorpipe/transport/efa/listener_impl.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorpipe/transport/efa/listener_impl.cc b/tensorpipe/transport/efa/listener_impl.cc index e14428b47..d01aff347 100644 --- a/tensorpipe/transport/efa/listener_impl.cc +++ b/tensorpipe/transport/efa/listener_impl.cc @@ -68,6 +68,15 @@ void ListenerImpl::initImplFromLoop() { setError(std::move(error)); return; } + + struct sockaddr_storage addr; + socklen_t addrlen; + std::tie(error, addr, addrlen) = socket_.getSockName(); + if (error) { + setError(std::move(error)); + return; + } + sockaddr_ = Sockaddr(reinterpret_cast(&addr), addrlen); } void ListenerImpl::handleErrorImpl() { @@ -95,12 +104,7 @@ void ListenerImpl::acceptImplFromLoop(accept_callback_fn fn) { } std::string ListenerImpl::addrImplFromLoop() const { - struct sockaddr_storage ss; - struct sockaddr* addr = reinterpret_cast(&ss); - socklen_t addrlen = sizeof(ss); - int rv = getsockname(socket_.fd(), addr, &addrlen); - TP_THROW_SYSTEM_IF(rv < 0, errno); - return Sockaddr(addr, addrlen).str(); + return sockaddr_.str(); } void ListenerImpl::handleEventsFromLoop(int events) { From fb38536e60da45540188472f19b2c693f788a5ba Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Wed, 22 Sep 2021 12:29:21 +0100 Subject: [PATCH 15/19] fix --- tensorpipe/common/efa_lib.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorpipe/common/efa_lib.h b/tensorpipe/common/efa_lib.h index a2293c39c..8f4281a7a 100644 --- a/tensorpipe/common/efa_lib.h +++ b/tensorpipe/common/efa_lib.h @@ -39,8 +39,6 @@ class EfaLib { DynamicLibraryHandle dlhandle_; - decltype(&fi_allocinfo) fi_allocptr = nullptr; - #define TP_DECLARE_FIELD(function_name) \ 
decltype(&function_name) function_name##_ptr_ = nullptr; TP_FORALL_FABRIC_SYMBOLS(TP_DECLARE_FIELD) From 1a50dc1806ace051cb1c9a62cc7138b40194a69d Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 5 Nov 2021 14:54:52 +0000 Subject: [PATCH 16/19] fix bugs --- tensorpipe/common/efa.h | 18 ++--- tensorpipe/common/efa_read_write_ops.h | 51 ++++++++---- tensorpipe/transport/efa/connection_impl.cc | 45 ++++++----- tensorpipe/transport/efa/connection_impl.h | 10 +-- tensorpipe/transport/efa/error.cc | 2 +- tensorpipe/transport/efa/error.h | 4 +- tensorpipe/transport/efa/reactor.cc | 87 ++++++++++----------- tensorpipe/transport/efa/reactor.h | 23 ++---- 8 files changed, 124 insertions(+), 116 deletions(-) diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index c54c36098..bee6b12ab 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ -#ifndef TENSORPIPE_COMMON_EFA_H_ -#define TENSORPIPE_COMMON_EFA_H_ +#ifndef COMMON_EFA_H_ +#define COMMON_EFA_H_ #include #include @@ -94,14 +94,14 @@ struct EfaAddress { return std::string(name, len); } - void CopyFrom(void* ep_name, const size_t ep_name_len) { - len = ep_name_len; - memcpy(name, ep_name, sizeof(name)); + void copyFrom(void* epName, const size_t epNameLen) { + len = epNameLen; + memcpy(name, epName, sizeof(name)); } - void CopyTo(char* ep_name, size_t* ep_name_len) { - *(ep_name_len) = len; - memcpy(ep_name, name, sizeof(name)); + void copyTo(char* epName, size_t* epNameLen) { + *(epNameLen) = len; + memcpy(epName, name, sizeof(name)); } }; @@ -259,4 +259,4 @@ class EfaDeviceList { } // namespace tensorpipe -#endif // TENSORPIPE_COMMON_EFA_H_ +#endif // COMMON_EFA_H_ diff --git a/tensorpipe/common/efa_read_write_ops.h b/tensorpipe/common/efa_read_write_ops.h index 9662aa9e0..72db31cd9 100644 --- a/tensorpipe/common/efa_read_write_ops.h +++ b/tensorpipe/common/efa_read_write_ops.h @@ -9,12 +9,14 @@ #pragma once #include 
+#include #include #include #include #include #include +#include #include #include @@ -39,9 +41,13 @@ class EFAReadOperation { using read_callback_fn = std::function; - explicit inline EFAReadOperation(read_callback_fn fn); + explicit inline EFAReadOperation(void* opContext, read_callback_fn fn); - inline EFAReadOperation(void* ptr, size_t length, read_callback_fn fn); + inline EFAReadOperation( + void* ptr, + size_t length, + void* opContext, + read_callback_fn fn); // Called when a buffer is needed to read data from stream. inline void allocFromLoop(); @@ -62,12 +68,16 @@ class EFAReadOperation { inline size_t* getLengthPtr(); inline char* getBufferPtr(); + // Get op context + inline void* getOpContext(); + // Invoke user callback. inline void callbackFromLoop(const Error& error); private: Mode mode_{WAIT_TO_POST}; char* ptr_{nullptr}; + void* opContext_{nullptr}; // Number of bytes as specified by the user (if applicable). optional givenLength_; @@ -86,13 +96,18 @@ class EFAReadOperation { read_callback_fn fn_; }; -EFAReadOperation::EFAReadOperation(read_callback_fn fn) : fn_(std::move(fn)) {} +EFAReadOperation::EFAReadOperation(void* opContext, read_callback_fn fn) + : opContext_(opContext), fn_(std::move(fn)) {} EFAReadOperation::EFAReadOperation( void* ptr, size_t length, + void* opContext, read_callback_fn fn) - : ptr_(static_cast(ptr)), givenLength_(length), fn_(std::move(fn)) {} + : ptr_(static_cast(ptr)), + givenLength_(length), + opContext_(opContext), + fn_(std::move(fn)) {} void EFAReadOperation::allocFromLoop() { if (givenLength_.has_value()) { @@ -136,6 +151,10 @@ void EFAReadOperation::callbackFromLoop(const Error& error) { fn_(error, ptr_, readLength_); } +void* EFAReadOperation::getOpContext() { + return opContext_; +} + // The write operation captures all state associated with writing a // fixed length chunk of data from the underlying connection. 
The // write includes a word-sized header containing the length of the @@ -155,6 +174,7 @@ class EFAWriteOperation { inline EFAWriteOperation( const void* ptr, size_t length, + void* opContext, write_callback_fn fn); struct Buf { @@ -175,18 +195,17 @@ class EFAWriteOperation { inline bool completed() const; // set mode to complete inline void setCompleted(); - // get peer address - inline fi_addr_t getPeerAddr(); - // set peer address - inline void setPeerAddr(fi_addr_t peer_addr); // get length inline size_t getLength() const; + // get op context + inline void* getOpContext(); private: Mode mode_{WAIT_TO_POST}; const char* ptr_; const size_t length_; - fi_addr_t peer_addr_; + fi_addr_t peerAddr_; + void* opContext_{nullptr}; // Buffers (structs with pointers and lengths) to write to stream. std::array bufs_; @@ -198,8 +217,12 @@ class EFAWriteOperation { EFAWriteOperation::EFAWriteOperation( const void* ptr, size_t length, + void* opContext, write_callback_fn fn) - : ptr_(static_cast(ptr)), length_(length), fn_(std::move(fn)) { + : ptr_(static_cast(ptr)), + length_(length), + opContext_(opContext), + fn_(std::move(fn)) { bufs_[0].base = const_cast(reinterpret_cast(&length_)); bufs_[0].len = sizeof(length_); bufs_[1].base = const_cast(ptr_); @@ -235,12 +258,8 @@ bool EFAWriteOperation::completed() const { return mode_ == COMPLETE; } -void EFAWriteOperation::setPeerAddr(fi_addr_t peer_addr) { - peer_addr_ = peer_addr; -} - -fi_addr_t EFAWriteOperation::getPeerAddr() { - return peer_addr_; +void* EFAWriteOperation::getOpContext() { + return opContext_; } } // namespace tensorpipe diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index 09a22a895..28c6a13d1 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -91,7 +91,7 @@ void ConnectionImpl::initImplFromLoop() { } void ConnectionImpl::readImplFromLoop(read_callback_fn fn) { - 
readOperations_.emplace_back(std::move(fn)); + readOperations_.emplace_back(this, std::move(fn)); processReadOperationsFromLoop(); } @@ -100,7 +100,7 @@ void ConnectionImpl::readImplFromLoop( void* ptr, size_t length, read_callback_fn fn) { - readOperations_.emplace_back(ptr, length, std::move(fn)); + readOperations_.emplace_back(ptr, length, this, std::move(fn)); // If the inbox already contains some data, we may be able to process this // operation right away. @@ -111,7 +111,7 @@ void ConnectionImpl::writeImplFromLoop( const void* ptr, size_t length, write_callback_fn fn) { - writeOperations_.emplace_back(ptr, length, std::move(fn)); + writeOperations_.emplace_back(ptr, length, this, std::move(fn)); // If the outbox has some free space, we may be able to process this operation // right away. @@ -179,11 +179,9 @@ void ConnectionImpl::handleEventInFromLoop() { return; } - peer_addr = context_->getReactor().addPeerAddr(addr); + peerAddr_ = context_->getReactor().addPeerAddr(addr); // The connection is usable now. 
- context_->getReactor().registerHandler(peer_addr, shared_from_this()); - state_ = ESTABLISHED; processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote @@ -242,11 +240,12 @@ void ConnectionImpl::processReadOperationsFromLoop() { context_->getReactor().postRecv( readOperation.getLengthPtr(), sizeof(size_t), - kLength, - peer_addr, - 0xffffffff, // ignore lower bits for msg index + kLength | recvIdx_, + peerAddr_, + 0, &readOperation); readOperation.setWaitToCompleted(); + recvIdx_++; } else { // if the operation is posted, all operations back should be posted // we can skip more checks @@ -289,27 +288,28 @@ void ConnectionImpl::processWriteOperationsFromLoop() { for (int i = 0; i < writeOperations_.size(); i++) { EFAWriteOperation& writeOperation = writeOperations_[i]; if (!writeOperation.posted()) { - EFAWriteOperation::Buf* buf_array; + EFAWriteOperation::Buf* bufArray; size_t size; - std::tie(buf_array, size) = writeOperation.getBufs(); - writeOperation.setPeerAddr(peer_addr); + std::tie(bufArray, size) = writeOperation.getBufs(); + // writeOperation.setPeerAddr(peerAddr_); // auto size_buf = std::get<0>(writeOperation.getBufs()); // auto payload_buf = std::get<1>(writeOperation.getBufs()); context_->getReactor().postSend( - buf_array[0].base, - buf_array[0].len, - kLength | sendIdx, - peer_addr, + bufArray[0].base, + bufArray[0].len, + kLength | sendIdx_, + peerAddr_, &writeOperation); if (size > 1) { context_->getReactor().postSend( - buf_array[1].base, - buf_array[1].len, - kPayload | sendIdx, - peer_addr, + bufArray[1].base, + bufArray[1].len, + kPayload | sendIdx_, + peerAddr_, &writeOperation); } - sendIdx++; + writeOperation.setWaitComplete(); + sendIdx_++; } else { // if the operation is posted, all operations back should be posted // we can skip more checks @@ -344,8 +344,7 @@ void ConnectionImpl::handleErrorImpl() { void ConnectionImpl::cleanup() { TP_DCHECK(context_->inLoop()); TP_VLOG(8) << "Connection " 
<< id_ << " is cleaning up"; - context_->getReactor().unregisterHandler(peer_addr); - context_->getReactor().removePeerAddr(peer_addr); + context_->getReactor().removePeerAddr(peerAddr_); } } // namespace efa diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h index 4ab9ef0f6..c2299df33 100644 --- a/tensorpipe/transport/efa/connection_impl.h +++ b/tensorpipe/transport/efa/connection_impl.h @@ -35,7 +35,6 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< ContextImpl, ListenerImpl, ConnectionImpl>, - public efaEventHandler, public EpollLoop::EventHandler { enum State { INITIALIZING = 1, @@ -65,8 +64,8 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< // Implementation of efaEventHandler. // void onRemoteProducedData(uint32_t length) override; // void onRemoteConsumedData(uint32_t length) override; - void onWriteCompleted() override; - void onReadCompleted() override; + void onWriteCompleted(); + void onReadCompleted(); // void onAckCompleted() override; // void onError(efaLib::wc_status status, uint64_t wrId) override; @@ -97,9 +96,10 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< Socket socket_; optional sockaddr_; - fi_addr_t peer_addr; + fi_addr_t peerAddr_; - uint32_t sendIdx = 0; + uint32_t sendIdx_ = 0; + uint32_t recvIdx_ = 0; // Pending read operations. 
std::deque readOperations_; diff --git a/tensorpipe/transport/efa/error.cc b/tensorpipe/transport/efa/error.cc index cf91931b6..17c8f01ba 100644 --- a/tensorpipe/transport/efa/error.cc +++ b/tensorpipe/transport/efa/error.cc @@ -18,7 +18,7 @@ namespace tensorpipe { namespace transport { namespace efa { -std::string efaError::what() const { +std::string EfaError::what() const { return error_; } diff --git a/tensorpipe/transport/efa/error.h b/tensorpipe/transport/efa/error.h index 409b4bb70..a37dc632e 100644 --- a/tensorpipe/transport/efa/error.h +++ b/tensorpipe/transport/efa/error.h @@ -16,9 +16,9 @@ namespace tensorpipe { namespace transport { namespace efa { -class efaError final : public BaseError { +class EfaError final : public BaseError { public: - explicit efaError(std::string error) : error_(error) {} + explicit EfaError(std::string error) : error_(error) {} std::string what() const override; diff --git a/tensorpipe/transport/efa/reactor.cc b/tensorpipe/transport/efa/reactor.cc index 151af4ae2..cefcb8dcb 100644 --- a/tensorpipe/transport/efa/reactor.cc +++ b/tensorpipe/transport/efa/reactor.cc @@ -10,6 +10,7 @@ #include #include +#include #include namespace tensorpipe { @@ -33,13 +34,13 @@ void Reactor::postSend( void* buffer, size_t size, uint64_t tag, - fi_addr_t peer_addr, + fi_addr_t peerAddr, void* context) { pendingSends_.emplace_back(EfaEvent((new fi_msg_tagged{ /* msg_iov */ new iovec{.iov_base = buffer, .iov_len = size}, /* desc */ 0, /* iov_count */ 1, - /* peer addr */ peer_addr, + /* peer addr */ peerAddr, /* tag */ tag, /* ignore */ 0, /* context */ context, @@ -51,14 +52,14 @@ void Reactor::postRecv( void* buffer, size_t size, uint64_t tag, - fi_addr_t dest_addr, + fi_addr_t peerAddr, uint64_t ignore, void* context) { pendingRecvs_.emplace_back(EfaEvent(new fi_msg_tagged{ /* msg_iov */ new iovec{.iov_base = buffer, .iov_len = size}, /* desc */ 0, /* iov_count */ 1, - /* peer addr */ dest_addr, + /* peer addr */ peerAddr, /* tag */ tag, /* 
ignore */ ignore, /* context */ context, @@ -85,16 +86,18 @@ int Reactor::postPendingSends() { } fi_addr_t Reactor::addPeerAddr(EfaAddress& addr) { - fi_addr_t peer_addr; - int ret = fi_av_insert(av_.get(), addr.name, 1, &peer_addr, 0, nullptr); + fi_addr_t peerAddr; + int ret = fi_av_insert(av_.get(), addr.name, 1, &peerAddr, 0, nullptr); TP_THROW_ASSERT_IF(ret != 1) << "Unable to add address to endpoint"; TP_CHECK_EFA_RET(ret, "Unable to add address to endpoint"); - return peer_addr; + efaAddrSet_.emplace(peerAddr); + return peerAddr; } void Reactor::removePeerAddr(fi_addr_t faddr) { int ret = fi_av_remove(av_.get(), &faddr, 1, 0); TP_CHECK_EFA_RET(ret, "Unable to remove address from endpoint"); + efaAddrSet_.erase(faddr); }; int Reactor::postPendingRecvs() { @@ -137,13 +140,13 @@ Reactor::~Reactor() { } bool Reactor::pollOnce() { - std::array cq_entries; - std::array src_addrs; + std::array cqEntries; + std::array srcAddrs; postPendingSends(); postPendingRecvs(); int rv = fi_cq_readfrom( - cq_.get(), cq_entries.data(), cq_entries.size(), src_addrs.data()); + cq_.get(), cqEntries.data(), cqEntries.size(), srcAddrs.data()); if (rv == 0 || rv == -FI_EAGAIN) { return false; } else { @@ -154,48 +157,52 @@ bool Reactor::pollOnce() { int numWrites = 0; int numAcks = 0; for (int cqIdx = 0; cqIdx < rv; cqIdx++) { - struct fi_cq_tagged_entry& cq = cq_entries[cqIdx]; - fi_addr_t& src_addr = src_addrs[cqIdx]; - uint32_t msg_idx = static_cast(cq.tag); + struct fi_cq_tagged_entry& cq = cqEntries[cqIdx]; + fi_addr_t& srcAddr = srcAddrs[cqIdx]; + uint32_t msgIdx = static_cast(cq.tag); if (cq.flags & FI_SEND) { // Send event if (cq.tag & kLength) { // Send size finished, check whether it's zero sized message - auto* operation_ptr = static_cast(cq.op_context); - if (operation_ptr->getLength() == 0) { - operation_ptr->setCompleted(); - efaEventHandler_[operation_ptr->getPeerAddr()]->onWriteCompleted(); + auto* operationPtr = static_cast(cq.op_context); + if 
(operationPtr->getLength() == 0) { + operationPtr->setCompleted(); + reinterpret_cast(operationPtr->getOpContext()) + ->onWriteCompleted(); } } else if (cq.tag & kPayload) { - auto* operation_ptr = static_cast(cq.op_context); - operation_ptr->setCompleted(); - efaEventHandler_[operation_ptr->getPeerAddr()]->onWriteCompleted(); + auto* operationPtr = static_cast(cq.op_context); + operationPtr->setCompleted(); + reinterpret_cast(operationPtr->getOpContext()) + ->onWriteCompleted(); } } else if (cq.flags & FI_RECV) { // Receive event if (cq.tag & kLength) { // Received length information - auto* operation_ptr = static_cast(cq.op_context); - if (operation_ptr->getReadLength() == 0) { - operation_ptr->setCompleted(); - efaEventHandler_[src_addr]->onReadCompleted(); + auto* operationPtr = static_cast(cq.op_context); + if (operationPtr->getReadLength() == 0) { + operationPtr->setCompleted(); + reinterpret_cast(operationPtr->getOpContext()) + ->onReadCompleted(); } else { // operation_ptr->mode_ = EFAReadOperation::Mode::READ_PAYLOAD; - operation_ptr->allocFromLoop(); + operationPtr->allocFromLoop(); postRecv( - operation_ptr->getBufferPtr(), - operation_ptr->getReadLength(), - kPayload | msg_idx, - src_addr, + operationPtr->getBufferPtr(), + operationPtr->getReadLength(), + kPayload | msgIdx, + srcAddr, 0, // Exact match of tag - operation_ptr); - operation_ptr->setWaitToCompleted(); + operationPtr); + operationPtr->setWaitToCompleted(); } } else if (cq.tag & kPayload) { // Received payload - auto* operation_ptr = static_cast(cq.op_context); - operation_ptr->setCompleted(); - efaEventHandler_[src_addr]->onReadCompleted(); + auto* operationPtr = static_cast(cq.op_context); + operationPtr->setCompleted(); + reinterpret_cast(operationPtr->getOpContext()) + ->onReadCompleted(); } } } @@ -204,17 +211,7 @@ bool Reactor::pollOnce() { } bool Reactor::readyToClose() { - return efaEventHandler_.size() == 0; -} - -void Reactor::registerHandler( - fi_addr_t peer_addr, - 
std::shared_ptr eventHandler) { - efaEventHandler_.emplace(peer_addr, std::move(eventHandler)); -} - -void Reactor::unregisterHandler(fi_addr_t peer_addr) { - efaEventHandler_.erase(peer_addr); + return efaAddrSet_.size() == 0; } } // namespace efa diff --git a/tensorpipe/transport/efa/reactor.h b/tensorpipe/transport/efa/reactor.h index f465cb9d0..cd79bfe57 100644 --- a/tensorpipe/transport/efa/reactor.h +++ b/tensorpipe/transport/efa/reactor.h @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include @@ -28,16 +28,16 @@ namespace tensorpipe { namespace transport { namespace efa { -class efaEventHandler { +class EfaEventHandler { public: virtual void onWriteCompleted() = 0; virtual void onReadCompleted() = 0; - virtual ~efaEventHandler() = default; + virtual ~EfaEventHandler() = default; }; -enum efaTag : uint64_t { +enum EfaTag : uint64_t { kLength = 1ULL << 32, kPayload = 1ULL << 33, }; @@ -70,24 +70,18 @@ class Reactor final : public BusyPollingLoop { return addr_; } - void registerHandler( - fi_addr_t peer_addr, - std::shared_ptr eventHandler); - - void unregisterHandler(fi_addr_t peer_addr); - void postSend( void* buffer, size_t size, uint64_t tag, - fi_addr_t peer_addr, + fi_addr_t peerAddr, void* context); void postRecv( void* buffer, size_t size, uint64_t tag, - fi_addr_t peer_addr, + fi_addr_t peerAddr, uint64_t ignore, void* context); @@ -136,9 +130,8 @@ class Reactor final : public BusyPollingLoop { // debugging purposes. std::string id_{"N/A"}; - // The registered event handlers for each queue pair. - std::unordered_map> - efaEventHandler_; + // The registered connections for each queue pair. 
+ std::unordered_set efaAddrSet_; std::deque pendingSends_; std::deque pendingRecvs_; From 8ff3268a1ee99f67adf6ba447fe5df87a0260f43 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 5 Nov 2021 15:02:55 +0000 Subject: [PATCH 17/19] remove bad header --- tensorpipe/common/efa.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tensorpipe/common/efa.h b/tensorpipe/common/efa.h index bee6b12ab..215dd0180 100644 --- a/tensorpipe/common/efa.h +++ b/tensorpipe/common/efa.h @@ -6,8 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#ifndef COMMON_EFA_H_ -#define COMMON_EFA_H_ +#pragma once #include #include @@ -80,7 +79,7 @@ struct EfaAddress { // length of endpoint name size_t len = sizeof(name); - std::string DebugStr() const { + std::string debugStr() const { std::stringstream ss; ss << "["; for (size_t i = 0; i < len; i++) { @@ -215,7 +214,7 @@ class EfaDeviceList { static std::tuple create(EfaLib& efaLib) { int size; EfaLib::device* ptr = getEfaDevices(efaLib); - EfaLib::device* first_ptr = ptr; + EfaLib::device* firstDevice = ptr; if (ptr == nullptr) { return std::make_tuple( TP_CREATE_ERROR(SystemError, "fi_getinfo", -1), EfaDeviceList()); @@ -226,16 +225,16 @@ class EfaDeviceList { size++; }; return std::make_tuple( - Error::kSuccess, EfaDeviceList(efaLib, first_ptr, size)); + Error::kSuccess, EfaDeviceList(efaLib, firstDevice, size)); } int size() { return size_; } - EfaLib::device& operator[](int i) { + EfaLib::device& operator[](int index) { EfaLib::device* ptr = deviceList_.get(); - for (int j = 0; j < i; j++) { + for (int j = 0; j < index; j++) { ptr = ptr->next; } return *ptr; @@ -257,6 +256,4 @@ class EfaDeviceList { int size_; }; -} // namespace tensorpipe - -#endif // COMMON_EFA_H_ +} // namespace tensorpipe \ No newline at end of file From a0957d0beb2ddf25b5292d0f6b3a9010040fb9a5 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Fri, 5 Nov 2021 18:26:51 +0000 Subject: [PATCH 18/19] fix --- 
tensorpipe/transport/efa/connection_impl.cc | 6 +++--- tensorpipe/transport/efa/reactor.h | 9 --------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index 28c6a13d1..06208dd75 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -183,11 +183,11 @@ void ConnectionImpl::handleEventInFromLoop() { // The connection is usable now. state_ = ESTABLISHED; - processWriteOperationsFromLoop(); // Trigger read operations in case a pair of local read() and remote // write() happened before connection is established. Otherwise read() // callback would lose if it's the only read() request. processReadOperationsFromLoop(); + processWriteOperationsFromLoop(); return; } @@ -249,7 +249,7 @@ void ConnectionImpl::processReadOperationsFromLoop() { } else { // if the operation is posted, all operations back should be posted // we can skip more checks - break; + // break; } } } @@ -313,7 +313,7 @@ void ConnectionImpl::processWriteOperationsFromLoop() { } else { // if the operation is posted, all operations back should be posted // we can skip more checks - break; + // break; } } } diff --git a/tensorpipe/transport/efa/reactor.h b/tensorpipe/transport/efa/reactor.h index cd79bfe57..a3b96b361 100644 --- a/tensorpipe/transport/efa/reactor.h +++ b/tensorpipe/transport/efa/reactor.h @@ -28,15 +28,6 @@ namespace tensorpipe { namespace transport { namespace efa { -class EfaEventHandler { - public: - virtual void onWriteCompleted() = 0; - - virtual void onReadCompleted() = 0; - - virtual ~EfaEventHandler() = default; -}; - enum EfaTag : uint64_t { kLength = 1ULL << 32, kPayload = 1ULL << 33, From 184d923885fea9b23e7268c59759d95b546b9204 Mon Sep 17 00:00:00 2001 From: VoVAllen Date: Sun, 7 Nov 2021 10:16:24 +0000 Subject: [PATCH 19/19] remove unused codes --- tensorpipe/transport/efa/connection_impl.cc | 3 --- 
tensorpipe/transport/efa/connection_impl.h | 6 ------ 2 files changed, 9 deletions(-) diff --git a/tensorpipe/transport/efa/connection_impl.cc b/tensorpipe/transport/efa/connection_impl.cc index 06208dd75..834e0fb03 100644 --- a/tensorpipe/transport/efa/connection_impl.cc +++ b/tensorpipe/transport/efa/connection_impl.cc @@ -291,9 +291,6 @@ void ConnectionImpl::processWriteOperationsFromLoop() { EFAWriteOperation::Buf* bufArray; size_t size; std::tie(bufArray, size) = writeOperation.getBufs(); - // writeOperation.setPeerAddr(peerAddr_); - // auto size_buf = std::get<0>(writeOperation.getBufs()); - // auto payload_buf = std::get<1>(writeOperation.getBufs()); context_->getReactor().postSend( bufArray[0].base, bufArray[0].len, diff --git a/tensorpipe/transport/efa/connection_impl.h b/tensorpipe/transport/efa/connection_impl.h index c2299df33..57ae9aff8 100644 --- a/tensorpipe/transport/efa/connection_impl.h +++ b/tensorpipe/transport/efa/connection_impl.h @@ -61,16 +61,10 @@ class ConnectionImpl final : public ConnectionImplBoilerplate< // Implementation of EventHandler. void handleEventsFromLoop(int events) override; - // Implementation of efaEventHandler. - // void onRemoteProducedData(uint32_t length) override; - // void onRemoteConsumedData(uint32_t length) override; void onWriteCompleted(); void onReadCompleted(); - // void onAckCompleted() override; - // void onError(efaLib::wc_status status, uint64_t wrId) override; protected: - // Implement the entry points called by ConnectionImplBoilerplate. // Implement the entry points called by ConnectionImplBoilerplate. void initImplFromLoop() override; void readImplFromLoop(read_callback_fn fn) override;