Address review comments
This commit is contained in:
parent
df54adabea
commit
b4bf42a9fc
5 changed files with 191 additions and 178 deletions
|
@ -1,6 +1,6 @@
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
The `rpc-server` allows running a `ggml` backend on a remote host.
|
The `rpc-server` allows running `ggml` backend on a remote host.
|
||||||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
||||||
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ flowchart TD
|
||||||
```
|
```
|
||||||
|
|
||||||
Each host can run a different backend, e.g. one with CUDA and another with Metal.
|
Each host can run a different backend, e.g. one with CUDA and another with Metal.
|
||||||
|
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
@ -35,7 +36,7 @@ For example, to build the CUDA backend with RPC support:
|
||||||
mkdir build-rpc-cuda
|
mkdir build-rpc-cuda
|
||||||
cd build-rpc-cuda
|
cd build-rpc-cuda
|
||||||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
||||||
make -j
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
Then, start the `rpc-server` with the backend:
|
Then, start the `rpc-server` with the backend:
|
||||||
|
@ -50,13 +51,20 @@ ggml_cuda_init: found 1 CUDA devices:
|
||||||
Starting RPC server on 0.0.0.0:50052
|
Starting RPC server on 0.0.0.0:50052
|
||||||
```
|
```
|
||||||
|
|
||||||
|
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
|
||||||
|
```bash
|
||||||
|
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
|
||||||
|
```
|
||||||
|
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||||
|
|
||||||
|
|
||||||
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build-rpc
|
mkdir build-rpc
|
||||||
cd build-rpc
|
cd build-rpc
|
||||||
cmake .. -DLLAMA_RPC=ON
|
cmake .. -DLLAMA_RPC=ON
|
||||||
make -j
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||||
|
|
|
@ -7,19 +7,8 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "ggml-rpc.h"
|
#include "ggml-rpc.h"
|
||||||
#include <memory>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#ifndef _WIN32
|
|
||||||
# include <sys/socket.h>
|
|
||||||
# include <sys/types.h>
|
|
||||||
# include <arpa/inet.h>
|
|
||||||
# include <netinet/in.h>
|
|
||||||
# include <netinet/tcp.h>
|
|
||||||
# include <netdb.h>
|
|
||||||
# include <unistd.h>
|
|
||||||
#endif
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
static ggml_backend_t create_backend() {
|
static ggml_backend_t create_backend() {
|
||||||
ggml_backend_t backend = NULL;
|
ggml_backend_t backend = NULL;
|
||||||
|
@ -55,91 +44,27 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
|
|
||||||
#ifdef _WIN32
|
|
||||||
if (fd == INVALID_SOCKET) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if (fd < 0) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
return std::make_shared<socket_t>(fd);
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::shared_ptr<socket_t> create_server_socket(const char * host, int port) {
|
|
||||||
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
|
||||||
auto sock = make_socket(sockfd);
|
|
||||||
if (sock == nullptr) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct sockaddr_in serv_addr;
|
|
||||||
serv_addr.sin_family = AF_INET;
|
|
||||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
|
||||||
serv_addr.sin_port = htons(port);
|
|
||||||
|
|
||||||
if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
if (listen(sockfd, 5) < 0) {
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
return sock;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char * argv[])
|
int main(int argc, char * argv[])
|
||||||
{
|
{
|
||||||
#ifdef _WIN32
|
|
||||||
WSADATA wsaData;
|
|
||||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
|
||||||
if (res != 0) {
|
|
||||||
fprintf(stderr, "WSAStartup failed: %d\n", res);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (argc < 3) {
|
if (argc < 3) {
|
||||||
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
const char * host = argv[1];
|
const char * host = argv[1];
|
||||||
int port = std::stoi(argv[2]);
|
int port = std::stoi(argv[2]);
|
||||||
|
if (port <= 0 || port > 65535) {
|
||||||
|
fprintf(stderr, "Invalid port number: %d\n", port);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
ggml_backend_t backend = create_backend();
|
ggml_backend_t backend = create_backend();
|
||||||
if (!backend) {
|
if (!backend) {
|
||||||
fprintf(stderr, "Failed to create backend\n");
|
fprintf(stderr, "Failed to create backend\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Starting RPC server on %s:%d\n", host, port);
|
printf("Starting RPC server on %s:%d\n", host, port);
|
||||||
auto server_socket = create_server_socket(host, port);
|
size_t free_mem, total_mem;
|
||||||
if (server_socket == nullptr) {
|
get_backend_memory(&free_mem, &total_mem);
|
||||||
fprintf(stderr, "Failed to create server socket\n");
|
std::string endpoint = std::string(host) + ":" + std::to_string(port);
|
||||||
return 1;
|
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||||
}
|
|
||||||
while (true) {
|
|
||||||
auto client_socket_fd = accept(server_socket->fd, NULL, NULL);
|
|
||||||
auto client_socket = make_socket(client_socket_fd);
|
|
||||||
if (client_socket == nullptr) {
|
|
||||||
fprintf(stderr, "Failed to accept client connection\n");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
// set TCP_NODELAY to disable Nagle's algorithm
|
|
||||||
int flag = 1;
|
|
||||||
int ret = setsockopt(client_socket->fd, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
|
|
||||||
if (ret < 0) {
|
|
||||||
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
size_t free_mem, total_mem;
|
|
||||||
get_backend_memory(&free_mem, &total_mem);
|
|
||||||
printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
|
|
||||||
rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
|
|
||||||
printf("Client connection closed\n");
|
|
||||||
}
|
|
||||||
#ifdef _WIN32
|
|
||||||
WSACleanup();
|
|
||||||
#endif
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
193
ggml-rpc.cpp
193
ggml-rpc.cpp
|
@ -4,9 +4,18 @@
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <memory>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#ifndef _WIN32
|
#ifdef _WIN32
|
||||||
|
# define WIN32_LEAN_AND_MEAN
|
||||||
|
# ifndef NOMINMAX
|
||||||
|
# define NOMINMAX
|
||||||
|
# endif
|
||||||
|
# include <windows.h>
|
||||||
|
# include <winsock2.h>
|
||||||
|
#else
|
||||||
|
# include <arpa/inet.h>
|
||||||
# include <sys/socket.h>
|
# include <sys/socket.h>
|
||||||
# include <sys/types.h>
|
# include <sys/types.h>
|
||||||
# include <netinet/in.h>
|
# include <netinet/in.h>
|
||||||
|
@ -26,9 +35,57 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
|
typedef SOCKET sockfd_t;
|
||||||
using ssize_t = __int64;
|
using ssize_t = __int64;
|
||||||
|
#else
|
||||||
|
typedef int sockfd_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// cross-platform socket
|
||||||
|
struct socket_t {
|
||||||
|
sockfd_t fd;
|
||||||
|
socket_t(sockfd_t fd) : fd(fd) {}
|
||||||
|
~socket_t() {
|
||||||
|
#ifdef _WIN32
|
||||||
|
closesocket(this->fd);
|
||||||
|
#else
|
||||||
|
close(this->fd);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ggml_tensor is serialized into rpc_tensor
|
||||||
|
struct rpc_tensor {
|
||||||
|
uint64_t id;
|
||||||
|
uint32_t type;
|
||||||
|
uint64_t buffer;
|
||||||
|
uint32_t ne[GGML_MAX_DIMS];
|
||||||
|
uint32_t nb[GGML_MAX_DIMS];
|
||||||
|
uint32_t op;
|
||||||
|
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
||||||
|
int32_t flags;
|
||||||
|
uint64_t src[GGML_MAX_SRC];
|
||||||
|
uint64_t view_src;
|
||||||
|
uint64_t view_offs;
|
||||||
|
uint64_t data;
|
||||||
|
char name[GGML_MAX_NAME];
|
||||||
|
};
|
||||||
|
|
||||||
|
// RPC commands
|
||||||
|
enum rpc_cmd {
|
||||||
|
ALLOC_BUFFER = 0,
|
||||||
|
GET_ALIGNMENT,
|
||||||
|
GET_MAX_SIZE,
|
||||||
|
BUFFER_GET_BASE,
|
||||||
|
FREE_BUFFER,
|
||||||
|
BUFFER_CLEAR,
|
||||||
|
SET_TENSOR,
|
||||||
|
GET_TENSOR,
|
||||||
|
COPY_TENSOR,
|
||||||
|
GRAPH_COMPUTE,
|
||||||
|
GET_DEVICE_MEMORY,
|
||||||
|
};
|
||||||
|
|
||||||
// RPC data structures
|
// RPC data structures
|
||||||
|
|
||||||
static ggml_guid_t ggml_backend_rpc_guid() {
|
static ggml_guid_t ggml_backend_rpc_guid() {
|
||||||
|
@ -59,14 +116,6 @@ struct ggml_backend_rpc_buffer_context {
|
||||||
|
|
||||||
// RPC helper functions
|
// RPC helper functions
|
||||||
|
|
||||||
socket_t::~socket_t() {
|
|
||||||
#ifdef _WIN32
|
|
||||||
closesocket(this->fd);
|
|
||||||
#else
|
|
||||||
close(this->fd);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
|
static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
if (fd == INVALID_SOCKET) {
|
if (fd == INVALID_SOCKET) {
|
||||||
|
@ -80,6 +129,13 @@ static std::shared_ptr<socket_t> make_socket(sockfd_t fd) {
|
||||||
return std::make_shared<socket_t>(fd);
|
return std::make_shared<socket_t>(fd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool set_no_delay(sockfd_t sockfd) {
|
||||||
|
int flag = 1;
|
||||||
|
// set TCP_NODELAY to disable Nagle's algorithm
|
||||||
|
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
|
||||||
|
return ret >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||||
struct sockaddr_in addr;
|
struct sockaddr_in addr;
|
||||||
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
||||||
|
@ -87,10 +143,8 @@ static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||||
if (sock_ptr == nullptr) {
|
if (sock_ptr == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
// set TCP_NODELAY to disable Nagle's algorithm
|
if (!set_no_delay(sockfd)) {
|
||||||
int flag = 1;
|
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
||||||
int ret = setsockopt(sock_ptr->fd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
|
|
||||||
if (ret < 0) {
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
addr.sin_family = AF_INET;
|
addr.sin_family = AF_INET;
|
||||||
|
@ -107,6 +161,40 @@ static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||||
return sock_ptr;
|
return sock_ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
|
||||||
|
auto client_socket_fd = accept(srv_sockfd, NULL, NULL);
|
||||||
|
auto client_socket = make_socket(client_socket_fd);
|
||||||
|
if (client_socket == nullptr) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
if (!set_no_delay(client_socket_fd)) {
|
||||||
|
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return client_socket;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::shared_ptr<socket_t> create_server_socket(const char * host, int port) {
|
||||||
|
auto sockfd = socket(AF_INET, SOCK_STREAM, 0);
|
||||||
|
auto sock = make_socket(sockfd);
|
||||||
|
if (sock == nullptr) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct sockaddr_in serv_addr;
|
||||||
|
serv_addr.sin_family = AF_INET;
|
||||||
|
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||||
|
serv_addr.sin_port = htons(port);
|
||||||
|
|
||||||
|
if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
if (listen(sockfd, 1) < 0) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return sock;
|
||||||
|
}
|
||||||
|
|
||||||
static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
|
static bool send_data(sockfd_t sockfd, const void * data, size_t size) {
|
||||||
size_t bytes_sent = 0;
|
size_t bytes_sent = 0;
|
||||||
while (bytes_sent < size) {
|
while (bytes_sent < size) {
|
||||||
|
@ -131,6 +219,17 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool parse_endpoint(const char * endpoint, std::string & host, int & port) {
|
||||||
|
std::string str(endpoint);
|
||||||
|
size_t pos = str.find(':');
|
||||||
|
if (pos == std::string::npos) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
host = str.substr(0, pos);
|
||||||
|
port = std::stoi(str.substr(pos + 1));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
|
||||||
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
|
||||||
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
|
||||||
|
@ -244,7 +343,7 @@ static ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_ten
|
||||||
}
|
}
|
||||||
result->flags = tensor->flags;
|
result->flags = tensor->flags;
|
||||||
result->data = reinterpret_cast<void *>(tensor->data);
|
result->data = reinterpret_cast<void *>(tensor->data);
|
||||||
snprintf(result->name, GGML_MAX_NAME, "%s", tensor->name);
|
ggml_set_name(result, tensor->name);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -259,7 +358,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t
|
||||||
GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||||
// input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
|
// input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
|
||||||
int input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
|
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
|
||||||
std::vector<uint8_t> input(input_size, 0);
|
std::vector<uint8_t> input(input_size, 0);
|
||||||
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
rpc_tensor rpc_tensor = serialize_tensor(tensor);
|
||||||
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
|
||||||
|
@ -530,14 +629,15 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||||
|
|
||||||
static std::unordered_map<std::string, ggml_backend_t> instances;
|
static std::unordered_map<std::string, ggml_backend_t> instances;
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const std::string & endpoint) {
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
||||||
return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr;
|
return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const std::string & endpoint) {
|
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
if (instances.find(endpoint) != instances.end()) {
|
std::string endpoint_str(endpoint);
|
||||||
return instances[endpoint];
|
if (instances.find(endpoint_str) != instances.end()) {
|
||||||
|
return instances[endpoint_str];
|
||||||
}
|
}
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
{
|
{
|
||||||
|
@ -548,11 +648,12 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const std::string & endpoint) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
GGML_PRINT_DEBUG("Connecting to %s\n", endpoint.c_str());
|
GGML_PRINT_DEBUG("Connecting to %s\n", endpoint);
|
||||||
// split the endpoint into host and port
|
std::string host;
|
||||||
size_t pos = endpoint.find(":");
|
int port;
|
||||||
std::string host = endpoint.substr(0, pos);
|
if (!parse_endpoint(endpoint, host, port)) {
|
||||||
int port = std::stoi(endpoint.substr(pos + 1));
|
return nullptr;
|
||||||
|
}
|
||||||
auto sock = socket_connect(host.c_str(), port);
|
auto sock = socket_connect(host.c_str(), port);
|
||||||
if (sock == nullptr) {
|
if (sock == nullptr) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -607,7 +708,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
||||||
*total = total_mem;
|
*total = total_mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const std::string & endpoint, size_t * free, size_t * total) {
|
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
*free = 0;
|
*free = 0;
|
||||||
|
@ -781,7 +882,7 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t>
|
||||||
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
||||||
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||||
|
|
||||||
static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead();
|
static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ buf_size,
|
/*.mem_size =*/ buf_size,
|
||||||
/*.mem_buffer =*/ NULL,
|
/*.mem_buffer =*/ NULL,
|
||||||
|
@ -805,7 +906,7 @@ static void rpc_graph_compute(ggml_backend_t backend, const std::vector<uint8_t>
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem) {
|
||||||
while (true) {
|
while (true) {
|
||||||
uint8_t cmd;
|
uint8_t cmd;
|
||||||
if (!recv_data(sockfd, &cmd, 1)) {
|
if (!recv_data(sockfd, &cmd, 1)) {
|
||||||
|
@ -871,7 +972,7 @@ void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem,
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
fprintf(stderr, "Unknown command: %d\n", cmd);
|
fprintf(stderr, "Unknown command: %d\n", cmd);
|
||||||
break;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uint64_t output_size = output.size();
|
uint64_t output_size = output.size();
|
||||||
|
@ -883,3 +984,39 @@ void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
|
||||||
|
std::string host;
|
||||||
|
int port;
|
||||||
|
if (!parse_endpoint(endpoint, host, port)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#ifdef _WIN32
|
||||||
|
{
|
||||||
|
WSADATA wsaData;
|
||||||
|
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
||||||
|
if (res != 0) {
|
||||||
|
fprintf(stderr, "WSAStartup failed: %d\n", res);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
auto server_socket = create_server_socket(host.c_str(), port);
|
||||||
|
if (server_socket == nullptr) {
|
||||||
|
fprintf(stderr, "Failed to create server socket\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
while (true) {
|
||||||
|
auto client_socket = socket_accept(server_socket->fd);
|
||||||
|
if (client_socket == nullptr) {
|
||||||
|
fprintf(stderr, "Failed to accept client connection\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
|
||||||
|
rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
|
||||||
|
printf("Client connection closed\n");
|
||||||
|
}
|
||||||
|
#ifdef _WIN32
|
||||||
|
WSACleanup();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
65
ggml-rpc.h
65
ggml-rpc.h
|
@ -2,79 +2,22 @@
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include <string>
|
|
||||||
#include <memory>
|
|
||||||
#ifdef _WIN32
|
|
||||||
# define WIN32_LEAN_AND_MEAN
|
|
||||||
# ifndef NOMINMAX
|
|
||||||
# define NOMINMAX
|
|
||||||
# endif
|
|
||||||
# include <windows.h>
|
|
||||||
# include <winsock2.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// cross-platform socket fd
|
|
||||||
#ifdef _WIN32
|
|
||||||
typedef SOCKET sockfd_t;
|
|
||||||
#else
|
|
||||||
typedef int sockfd_t;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// cross-platform socket
|
|
||||||
struct socket_t {
|
|
||||||
sockfd_t fd;
|
|
||||||
socket_t(sockfd_t fd) : fd(fd) {}
|
|
||||||
~socket_t();
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// ggml_tensor is serialized into rpc_tensor
|
|
||||||
struct rpc_tensor {
|
|
||||||
uint64_t id;
|
|
||||||
uint32_t type;
|
|
||||||
uint64_t buffer;
|
|
||||||
uint32_t ne[GGML_MAX_DIMS];
|
|
||||||
uint32_t nb[GGML_MAX_DIMS];
|
|
||||||
uint32_t op;
|
|
||||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
|
||||||
int32_t flags;
|
|
||||||
uint64_t src[GGML_MAX_SRC];
|
|
||||||
uint64_t view_src;
|
|
||||||
uint64_t view_offs;
|
|
||||||
uint64_t data;
|
|
||||||
char name[GGML_MAX_NAME];
|
|
||||||
};
|
|
||||||
|
|
||||||
// RPC commands
|
|
||||||
enum rpc_cmd {
|
|
||||||
ALLOC_BUFFER = 0,
|
|
||||||
GET_ALIGNMENT,
|
|
||||||
GET_MAX_SIZE,
|
|
||||||
BUFFER_GET_BASE,
|
|
||||||
FREE_BUFFER,
|
|
||||||
BUFFER_CLEAR,
|
|
||||||
SET_TENSOR,
|
|
||||||
GET_TENSOR,
|
|
||||||
COPY_TENSOR,
|
|
||||||
GRAPH_COMPUTE,
|
|
||||||
GET_DEVICE_MEMORY,
|
|
||||||
};
|
|
||||||
|
|
||||||
#define GGML_RPC_MAX_SERVERS 16
|
#define GGML_RPC_MAX_SERVERS 16
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const std::string & endpoint);
|
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
|
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const std::string & endpoint);
|
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const std::string & endpoint, size_t * free, size_t * total);
|
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API GGML_CALL void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t free_mem, size_t total_mem);
|
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -2279,7 +2279,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
||||||
|
|
||||||
#ifdef GGML_USE_RPC
|
#ifdef GGML_USE_RPC
|
||||||
std::string endpoint = model.rpc_servers[gpu];
|
std::string endpoint = model.rpc_servers[gpu];
|
||||||
buft = ggml_backend_rpc_buffer_type(endpoint);
|
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
||||||
#elif defined(GGML_USE_METAL)
|
#elif defined(GGML_USE_METAL)
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
|
@ -2348,7 +2348,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
std::string endpoint = model.rpc_servers[device];
|
std::string endpoint = model.rpc_servers[device];
|
||||||
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
size_t total;
|
size_t total;
|
||||||
|
@ -15727,7 +15727,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
// initialize backends
|
// initialize backends
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_RPC)
|
||||||
for (auto & server : model->rpc_servers) {
|
for (auto & server : model->rpc_servers) {
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(server);
|
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue