From b4bf42a9fc430e1fed1d9054ab2d8a134f42b8ba Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 13 May 2024 10:11:46 +0300 Subject: [PATCH] Address review comments --- examples/rpc/README.md | 14 ++- examples/rpc/rpc-server.cpp | 91 ++--------------- ggml-rpc.cpp | 193 ++++++++++++++++++++++++++++++------ ggml-rpc.h | 65 +----------- llama.cpp | 6 +- 5 files changed, 191 insertions(+), 178 deletions(-) diff --git a/examples/rpc/README.md b/examples/rpc/README.md index 2c974d1a5..325d0abc4 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -1,6 +1,6 @@ ## Overview -The `rpc-server` allows running a `ggml` backend on a remote host. +The `rpc-server` allows running `ggml` backend on a remote host. The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. This can be used for distributed LLM inference with `llama.cpp` in the following way: @@ -25,6 +25,7 @@ flowchart TD ``` Each host can run a different backend, e.g. one with CUDA and another with Metal. +You can also run multiple `rpc-server` instances on the same host, each with a different backend. ## Usage @@ -35,7 +36,7 @@ For example, to build the CUDA backend with RPC support: mkdir build-rpc-cuda cd build-rpc-cuda cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON -make -j +cmake --build . --config Release ``` Then, start the `rpc-server` with the backend: @@ -50,13 +51,20 @@ ggml_cuda_init: found 1 CUDA devices: Starting RPC server on 0.0.0.0:50052 ``` +When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.: +```bash +$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052 +``` +This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. + + On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`: ```bash mkdir build-rpc cd build-rpc cmake .. -DLLAMA_RPC=ON -make -j +cmake --build . --config Release ``` Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index b81c4184b..be5d18058 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -7,19 +7,8 @@ #endif #include "ggml-rpc.h" -#include #include -#ifndef _WIN32 -# include -# include -# include -# include -# include -# include -# include -#endif #include -#include static ggml_backend_t create_backend() { ggml_backend_t backend = NULL; @@ -55,91 +44,27 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) { #endif } -static std::shared_ptr make_socket(sockfd_t fd) { -#ifdef _WIN32 - if (fd == INVALID_SOCKET) { - return nullptr; - } -#else - if (fd < 0) { - return nullptr; - } -#endif - return std::make_shared(fd); -} - -static std::shared_ptr create_server_socket(const char * host, int port) { - auto sockfd = socket(AF_INET, SOCK_STREAM, 0); - auto sock = make_socket(sockfd); - if (sock == nullptr) { - return nullptr; - } - - struct sockaddr_in serv_addr; - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = inet_addr(host); - serv_addr.sin_port = htons(port); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) { - return nullptr; - } - if (listen(sockfd, 5) < 0) { - return nullptr; - } - return sock; -} - int main(int argc, char * argv[]) { -#ifdef _WIN32 - WSADATA wsaData; - int res = WSAStartup(MAKEWORD(2, 2), &wsaData); - if (res != 0) { - fprintf(stderr, "WSAStartup failed: %d\n", res); - return 1; - } -#endif if (argc < 3) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } const char * host = argv[1]; int port = std::stoi(argv[2]); - + if (port <= 0 || port > 65535) { + fprintf(stderr, "Invalid port number: %d\n", port); + return 1; + } ggml_backend_t backend = create_backend(); if (!backend) { fprintf(stderr, "Failed to create backend\n"); return 1; } - printf("Starting RPC server on %s:%d\n", host, port); - auto server_socket = create_server_socket(host, port); - if (server_socket == nullptr) { - fprintf(stderr, "Failed to create server socket\n"); - return 1; - } - while (true) { - auto client_socket_fd = accept(server_socket->fd, NULL, NULL); - auto client_socket = make_socket(client_socket_fd); - if (client_socket == nullptr) { - fprintf(stderr, "Failed to accept client connection\n"); - return 1; - } - // set TCP_NODELAY to disable Nagle's algorithm - int flag = 1; - int ret = setsockopt(client_socket->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); - if (ret < 0) { - fprintf(stderr, "Failed to set TCP_NODELAY\n"); - continue; - } - size_t free_mem, total_mem; - get_backend_memory(&free_mem, &total_mem); - printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem); - rpc_serve_client(backend, client_socket->fd, free_mem, total_mem); - printf("Client connection closed\n"); - } -#ifdef _WIN32 - WSACleanup(); -#endif + size_t free_mem, total_mem; + get_backend_memory(&free_mem, &total_mem); + std::string endpoint = std::string(host) + ":" + std::to_string(port); + start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); return 0; } diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 84768509c..62c054bbb 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -4,9 +4,18 @@ #include #include +#include #include #include -#ifndef _WIN32 +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include # include # include # include @@ -26,9 +35,57 @@ #endif #ifdef _WIN32 +typedef SOCKET sockfd_t; using ssize_t = __int64; +#else +typedef int sockfd_t; #endif +// cross-platform socket +struct socket_t { + sockfd_t fd; + socket_t(sockfd_t fd) : fd(fd) {} + ~socket_t() { +#ifdef _WIN32 + closesocket(this->fd); +#else + close(this->fd); +#endif + } +}; + +// ggml_tensor is serialized into rpc_tensor +struct rpc_tensor { + uint64_t id; + uint32_t type; + uint64_t buffer; + uint32_t ne[GGML_MAX_DIMS]; + uint32_t nb[GGML_MAX_DIMS]; + uint32_t op; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + int32_t flags; + uint64_t src[GGML_MAX_SRC]; + uint64_t view_src; + uint64_t view_offs; + uint64_t data; + char name[GGML_MAX_NAME]; +}; + +// RPC commands +enum rpc_cmd { + ALLOC_BUFFER = 0, + GET_ALIGNMENT, + GET_MAX_SIZE, + BUFFER_GET_BASE, + FREE_BUFFER, + BUFFER_CLEAR, + SET_TENSOR, + GET_TENSOR, + COPY_TENSOR, + GRAPH_COMPUTE, + GET_DEVICE_MEMORY, +}; + // RPC data structures static ggml_guid_t ggml_backend_rpc_guid() { @@ -59,14 +116,6 @@ struct ggml_backend_rpc_buffer_context { // RPC helper functions -socket_t::~socket_t() { -#ifdef _WIN32 - closesocket(this->fd); -#else - close(this->fd); -#endif -} - static std::shared_ptr make_socket(sockfd_t fd) { #ifdef _WIN32 if (fd == INVALID_SOCKET) { @@ -80,6 +129,13 @@ static std::shared_ptr make_socket(sockfd_t fd) { return std::make_shared(fd); } +static bool set_no_delay(sockfd_t sockfd) { + int flag = 1; + // set TCP_NODELAY to disable Nagle's algorithm + int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int)); + return ret >= 0; +} + static std::shared_ptr socket_connect(const char * host, int port) { struct sockaddr_in addr; auto sockfd = socket(AF_INET, SOCK_STREAM, 0); @@ -87,10 +143,8 @@ static std::shared_ptr socket_connect(const char * host, int port) { if (sock_ptr == nullptr) { return nullptr; } - // set TCP_NODELAY to disable Nagle's algorithm - int flag = 1; - int ret = setsockopt(sock_ptr->fd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int)); - if (ret < 0) { + if (!set_no_delay(sockfd)) { + fprintf(stderr, "Failed to set TCP_NODELAY\n"); return nullptr; } addr.sin_family = AF_INET; @@ -107,6 +161,40 @@ static std::shared_ptr socket_connect(const char * host, int port) { return sock_ptr; } +static std::shared_ptr socket_accept(sockfd_t srv_sockfd) { + auto client_socket_fd = accept(srv_sockfd, NULL, NULL); + auto client_socket = make_socket(client_socket_fd); + if (client_socket == nullptr) { + return nullptr; + } + if (!set_no_delay(client_socket_fd)) { + fprintf(stderr, "Failed to set TCP_NODELAY\n"); + return nullptr; + } + return client_socket; +} + +static std::shared_ptr create_server_socket(const char * host, int port) { + auto sockfd = socket(AF_INET, SOCK_STREAM, 0); + auto sock = make_socket(sockfd); + if (sock == nullptr) { + return nullptr; + } + + struct sockaddr_in serv_addr; + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = inet_addr(host); + serv_addr.sin_port = htons(port); + + if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) { + return nullptr; + } + if (listen(sockfd, 1) < 0) { + return nullptr; + } + return sock; +} + static bool send_data(sockfd_t sockfd, const void * data, size_t size) { size_t bytes_sent = 0; while (bytes_sent < size) { @@ -131,6 +219,17 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) { return true; } +static bool parse_endpoint(const char * endpoint, std::string & host, int & port) { + std::string str(endpoint); + size_t pos = str.find(':'); + if (pos == std::string::npos) { + return false; + } + host = str.substr(0, pos); + port = std::stoi(str.substr(pos + 1)); + return true; +} + // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | // RPC response: | response_size (8 bytes) | response_data (response_size bytes) | static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const std::vector & input, std::vector & output) { @@ -244,7 +343,7 @@ static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_ten } result->flags = tensor->flags; result->data = reinterpret_cast(tensor->data); - snprintf(result->name, GGML_MAX_NAME, "%s", tensor->name); + ggml_set_name(result, tensor->name); return result; } @@ -259,7 +358,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context; // input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) | - int input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size; + size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size; std::vector input(input_size, 0); rpc_tensor rpc_tensor = serialize_tensor(tensor); memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); @@ -530,14 +629,15 @@ static ggml_backend_i ggml_backend_rpc_interface = { static std::unordered_map instances; -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const std::string & endpoint) { +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { ggml_backend_t backend = ggml_backend_rpc_init(endpoint); return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr; } -GGML_CALL ggml_backend_t ggml_backend_rpc_init(const std::string & endpoint) { - if (instances.find(endpoint) != instances.end()) { - return instances[endpoint]; +GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { + std::string endpoint_str(endpoint); + if (instances.find(endpoint_str) != instances.end()) { + return instances[endpoint_str]; } #ifdef _WIN32 { @@ -548,11 +648,12 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const std::string & endpoint) { } } #endif - GGML_PRINT_DEBUG("Connecting to %s\n", endpoint.c_str()); - // split the endpoint into host and port - size_t pos = endpoint.find(":"); - std::string host = endpoint.substr(0, pos); - int port = std::stoi(endpoint.substr(pos + 1)); + GGML_PRINT_DEBUG("Connecting to %s\n", endpoint); + std::string host; + int port; + if (!parse_endpoint(endpoint, host, port)) { + return nullptr; + } auto sock = socket_connect(host.c_str(), port); if (sock == nullptr) { return nullptr; @@ -607,7 +708,7 @@ static void get_device_memory(const std::shared_ptr & sock, size_t * f *total = total_mem; } -GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const std::string & endpoint, size_t * free, size_t * total) { +GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { ggml_backend_t backend = ggml_backend_rpc_init(endpoint); if (backend == nullptr) { *free = 0; @@ -781,7 +882,7 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors)); GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors); - static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead(); + static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); struct ggml_init_params params = { /*.mem_size =*/ buf_size, /*.mem_buffer =*/ NULL, @@ -805,7 +906,7 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector ggml_free(ctx); } -void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) { +static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) { while (true) { uint8_t cmd; if (!recv_data(sockfd, &cmd, 1)) { @@ -871,7 +972,7 @@ void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, } default: { fprintf(stderr, "Unknown command: %d\n", cmd); - break; + return; } } uint64_t output_size = output.size(); @@ -883,3 +984,39 @@ void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, } } } + +void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { + std::string host; + int port; + if (!parse_endpoint(endpoint, host, port)) { + return; + } +#ifdef _WIN32 + { + WSADATA wsaData; + int res = WSAStartup(MAKEWORD(2, 2), &wsaData); + if (res != 0) { + fprintf(stderr, "WSAStartup failed: %d\n", res); + return 1; + } + } +#endif + auto server_socket = create_server_socket(host.c_str(), port); + if (server_socket == nullptr) { + fprintf(stderr, "Failed to create server socket\n"); + return; + } + while (true) { + auto client_socket = socket_accept(server_socket->fd); + if (client_socket == nullptr) { + fprintf(stderr, "Failed to accept client connection\n"); + return; + } + printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem); + rpc_serve_client(backend, client_socket->fd, free_mem, total_mem); + printf("Client connection closed\n"); + } +#ifdef _WIN32 + WSACleanup(); +#endif +} diff --git a/ggml-rpc.h b/ggml-rpc.h index 8472d19df..aa144832a 100644 --- a/ggml-rpc.h +++ b/ggml-rpc.h @@ -2,79 +2,22 @@ #include "ggml.h" #include "ggml-backend.h" -#include -#include -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -# include -#endif #ifdef __cplusplus extern "C" { #endif -// cross-platform socket fd -#ifdef _WIN32 -typedef SOCKET sockfd_t; -#else -typedef int sockfd_t; -#endif - -// cross-platform socket -struct socket_t { - sockfd_t fd; - socket_t(sockfd_t fd) : fd(fd) {} - ~socket_t(); -}; - - -// ggml_tensor is serialized into rpc_tensor -struct rpc_tensor { - uint64_t id; - uint32_t type; - uint64_t buffer; - uint32_t ne[GGML_MAX_DIMS]; - uint32_t nb[GGML_MAX_DIMS]; - uint32_t op; - int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; - int32_t flags; - uint64_t src[GGML_MAX_SRC]; - uint64_t view_src; - uint64_t view_offs; - uint64_t data; - char name[GGML_MAX_NAME]; -}; - -// RPC commands -enum rpc_cmd { - ALLOC_BUFFER = 0, - GET_ALIGNMENT, - GET_MAX_SIZE, - BUFFER_GET_BASE, - FREE_BUFFER, - BUFFER_CLEAR, - SET_TENSOR, - GET_TENSOR, - COPY_TENSOR, - GRAPH_COMPUTE, - GET_DEVICE_MEMORY, -}; - #define GGML_RPC_MAX_SERVERS 16 // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const std::string & endpoint); +GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint); GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const std::string & endpoint); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); -GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const std::string & endpoint, size_t * free, size_t * total); +GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); -GGML_API GGML_CALL void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem); +GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index 4b9643bde..7d26966e4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2279,7 +2279,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ #ifdef GGML_USE_RPC std::string endpoint = model.rpc_servers[gpu]; - buft = ggml_backend_rpc_buffer_type(endpoint); + buft = ggml_backend_rpc_buffer_type(endpoint.c_str()); #elif defined(GGML_USE_METAL) buft = ggml_backend_metal_buffer_type(); #elif defined(GGML_USE_CUDA) @@ -2348,7 +2348,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { size_t total; size_t free; std::string endpoint = model.rpc_servers[device]; - ggml_backend_rpc_get_device_memory(endpoint, &free, &total); + ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); return free; #elif defined(GGML_USE_CUDA) size_t total; @@ -15727,7 +15727,7 @@ struct llama_context * llama_new_context_with_model( // initialize backends #if defined(GGML_USE_RPC) for (auto & server : model->rpc_servers) { - ggml_backend_t backend = ggml_backend_rpc_init(server); + ggml_backend_t backend = ggml_backend_rpc_init(server.c_str()); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str()); llama_free(ctx);