parallel : example for serving multiple users in parallel
parent 1f17ea631c
commit 0161372b9a
9 changed files with 262 additions and 13 deletions
244 examples/parallel/parallel.cpp (new file)
@@ -0,0 +1,244 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server, which are processed in parallel.

#include "build-info.h"

#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
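
// To try it out (hypothetical invocation - the model path is an assumption;
// the -m/-n/-t flags come from the standard gpt_params parser in common.cpp):
//
//   ./parallel -m models/7B/ggml-model.gguf -n 128 -t 8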

// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
    size_t start = 0;
    size_t end   = str.size();

    while (start < end && isspace(str[start])) {
        start += 1;
    }

    while (end > start && isspace(str[end - 1])) {
        end -= 1;
    }

    return str.substr(start, end - start);
}

static std::string k_system = R"(
Transcript of a dialog, where the User interacts with an Assistant.
The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

User: Hello, what is the temperature outside?
Assistant: It is 72 degrees Fahrenheit.
User: What is the definition of a prime number?
Assistant: A prime number is a number that is divisible only by itself and 1.
User: )";

static std::vector<std::string> k_prompts = {
    "What is the meaning of life?",
    "What is the population of Europe?",
    "List all planets in the Solar System.",
    "What is the capital of France?",
    "Tell me an interesting fact about llamas.",
    "What is the best way to cook a steak?",
    "Are you familiar with the Special Theory of Relativity and can you explain it to me?",
    "Recommend some interesting books to read.",
    "What is the best way to learn a new language?",
    "How to get a job at Google?",
    "If you could have any superpower, what would it be?",
    "I want to learn how to play the piano.",
};

struct client {
    int32_t id = 0;

    llama_seq_id seq_id = -1; // -1 indicates the slot is free and a new request can be assigned

    llama_token sampled; // last sampled token for this client

    int32_t n_prompt  = 0;  // number of tokens in the prompt
    int32_t n_decoded = 0;  // total tokens processed so far (prompt + generated)
    int32_t i_batch   = -1; // index of this client's last token in the current batch

    std::string input;    // the user request
    std::string prompt;   // system prompt + input + "\nAssistant:"
    std::string response; // generated text

    std::vector<llama_token> last_tokens; // recent tokens, used for repetition penalties
};
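
// Each client owns one sequence (seq_id) in the shared llama_context. The loop
// in main() below multiplexes all active sequences into a single batch per
// iteration, which is what allows the requests to be served in parallel.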

int main(int argc, char ** argv) {
    gpt_params params;

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    // number of simultaneous "clients" to simulate
    const int n_clients = 16;

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("parallel", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

    // init llama.cpp
    llama_backend_init(params.numa);

    llama_model * model = NULL;

    llama_context * ctx = NULL;

    // load the target model
    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    fprintf(stderr, "\n\n");
    fflush(stderr);

    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(ctx);

    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
        client.last_tokens.resize(n_ctx);
        std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0);
    }

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);

    auto t_main_start = ggml_time_us();

    int64_t n_tokens_total = 0;

    llama_seq_id g_seq_id = 0;

    std::vector<llama_token>  batch_token;
    std::vector<llama_pos>    batch_pos;
    std::vector<llama_seq_id> batch_seq_id;
    std::vector<client *>     batch_clients;
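
    // the four batch_* vectors above describe one flat batch shared by all
    // clients: for each token we track its id, its position within its own
    // sequence, and the seq_id of the client that owns it, so that a single
    // llama_decode call can advance every active sequence at once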

    while (true) {
        uint32_t n_tokens = 0;

        batch_token.clear();
        batch_pos.clear();
        batch_seq_id.clear();

        for (auto & client : clients) {
            if (client.seq_id == -1) {
                // free slot - assign a new sequence and submit a new request
                client.seq_id = g_seq_id;
                client.input = k_prompts[rand() % k_prompts.size()];
                client.prompt = k_system + client.input + "\nAssistant:";
                client.response = "";
                std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0);

                std::vector<llama_token> prompt_tokens;
                prompt_tokens = ::llama_tokenize(ctx, client.prompt, true);

                for (size_t i = 0; i < prompt_tokens.size(); ++i) {
                    batch_token.push_back(prompt_tokens[i]);
                    batch_pos.push_back(i);
                    batch_seq_id.push_back(client.seq_id);
                    batch_clients.push_back(&client);
                }
                client.n_prompt  = prompt_tokens.size();
                client.n_decoded = prompt_tokens.size();
                client.i_batch   = batch_token.size() - 1;

                g_seq_id += 1;
            } else {
                // ongoing request - continue the sequence with the last sampled token
                batch_token.push_back(client.sampled);
                batch_pos.push_back(client.n_decoded);
                batch_seq_id.push_back(client.seq_id);
                batch_clients.push_back(&client);
                client.n_decoded += 1;
                client.i_batch = batch_token.size() - 1;
            }
        }

        // process in chunks of params.n_batch
        for (size_t i = 0; i < batch_token.size(); i += params.n_batch) {
            n_tokens = std::min(params.n_batch, (int32_t) (batch_token.size() - i));

            llama_batch batch = {
                n_tokens,
                batch_token.data()  + i,
                nullptr, // embd - unused, tokens are passed by id
                batch_pos.data()    + i,
                batch_seq_id.data() + i,
                0, 0, 0, // unused
            };

            if (llama_decode(ctx, batch, params.n_threads)) {
                LOG_TEE("%s : failed to decode batch\n", __func__);
                return 1;
            }
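
            // sample the next token for every client whose last token was part
            // of this chunk - only those have fresh logits available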
            for (auto & client : clients) {
                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
                    continue;
                }

                const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i);

                // remember which tokens were sampled - used for repetition penalties during sampling
                client.last_tokens.erase(client.last_tokens.begin());
                client.last_tokens.push_back(id);

                const std::string token_str = llama_token_to_piece(ctx, id);
                client.response += token_str;
                client.sampled = id;

                //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n",
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                // a request finishes on end-of-stream, when the generation limit is
                // reached, or when the model starts a new "User:" turn on its own
                if (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || client.response.find("User:") != std::string::npos) {
                    // basic reverse prompt - truncate everything from "User:" onwards
                    const size_t pos = client.response.find("User:");
                    if (pos != std::string::npos) {
                        client.response = client.response.substr(0, pos);
                    }

                    // remove this sequence from the KV cache so the cells can be reused
                    llama_kv_cache_rm_seq(ctx, client.seq_id, 0, n_ctx);

                    const auto t_main_end = ggml_time_us();

                    n_tokens_total += client.n_decoded - client.n_prompt;

                    printf("\033[1mClient %d, seq %d, prompt %d t, response %d t, speed: %.2f t/s\033[0m: \n\nInput: %s\nResponse: %s\n\n",
                            client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt,
                            (double) n_tokens_total / (t_main_end - t_main_start) * 1e6,
                            client.input.c_str(), ::trim(client.response).c_str());

                    // free the slot for a new request
                    client.seq_id = -1;
                }
            }
        }

        // reset the timing counters after the first pass, so that the reported
        // speed reflects generation only and not the initial prompt processing
        static bool is_first = true;
        if (is_first) {
            t_main_start   = ggml_time_us();
            n_tokens_total = 0;
            is_first = false;
        }
    }

    LOG_TEE("\n\n");

    llama_print_timings(ctx);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    fprintf(stderr, "\n\n");

    return 0;
}