Skip to content

Commit d0ab197

Browse files
committed
rpc : do not wait for response when sending RPC_CMD_SET_TENSOR
RPC_CMD_SET_TENSOR always returns an empty response, and we send this command 4 times per token. We can improve text-generation (TG) speed by not waiting for this empty response. The performance impact of this change depends on the network latency.
1 parent: 75afa0a · commit: d0ab197

File tree

1 file changed

+15
-16
lines changed

1 file changed

+15
-16
lines changed

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 15 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -372,7 +372,7 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
372372

373373
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
374374
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
375-
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
375+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size, bool noresp = false) {
376376
uint8_t cmd_byte = cmd;
377377
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
378378
return false;
@@ -383,17 +383,19 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
383383
if (!send_data(sock->fd, input, input_size)) {
384384
return false;
385385
}
386-
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
387-
// even if we do, we can skip sending output_size from the server for commands with known output size
388-
uint64_t out_size;
389-
if (!recv_data(sock->fd, &out_size, sizeof(out_size))) {
390-
return false;
391-
}
392-
if (out_size != output_size) {
393-
return false;
394-
}
395-
if (!recv_data(sock->fd, output, output_size)) {
396-
return false;
386+
if (!noresp) {
387+
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
388+
// even if we do, we can skip sending output_size from the server for commands with known output size
389+
uint64_t out_size;
390+
if (!recv_data(sock->fd, &out_size, sizeof(out_size))) {
391+
return false;
392+
}
393+
if (out_size != output_size) {
394+
return false;
395+
}
396+
if (!recv_data(sock->fd, output, output_size)) {
397+
return false;
398+
}
397399
}
398400
return true;
399401
}
@@ -531,7 +533,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
531533
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
532534
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
533535
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
534-
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
536+
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0, true);
535537
GGML_ASSERT(status);
536538
}
537539

@@ -1376,9 +1378,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
13761378
if (!server.set_tensor(input)) {
13771379
return;
13781380
}
1379-
if (!send_msg(sockfd, nullptr, 0)) {
1380-
return;
1381-
}
13821381
break;
13831382
}
13841383
case RPC_CMD_SET_TENSOR_HASH: {

0 commit comments

Comments
 (0)