readme

parent 10d5aefed5
commit 83aabb3fb7

3 changed files with 83 additions and 52 deletions

.gitignore (vendored), 1 addition:
@@ -48,6 +48,7 @@ models-mnt
 /beam-search
 /benchmark-matmult
 /convert-llama2c-to-ggml
+/duo
 /embd-input-test
 /embedding
 /eval-callback

@@ -1,7 +1,64 @@
 ## duo
 
-Minimal example. What's not implemented, but can be implemented separately in pieces:
-* tree-based speculation
-* correct sampling
-* support more than 2 instances
-* just one instance speculates
+This is a demo of an approach to distributed evaluation and speculation using RPC.
+
+It is a fairly minimal app, and many more improvements could be made.
+
+### Idea
+
+The idea comes from the discussion here: https://github.com/ggerganov/llama.cpp/discussions/6853#discussioncomment-9473494.
+
+When we run a large model and distribute its evaluation across multiple devices, the devices still evaluate the model sequentially.
+In the case of two identical devices and an equal model split, we would leave half of the compute on the table, assuming an individual use case (e.g. a personal chat).
+
+We can utilize this idle compute to speculate, and then evaluate a larger sequence of tokens at once.
+
+This demo is fairly limited:
+1. It expects two instances running the main model.
+2. Only one of these instances speculates.
+3. Speculation is linear.
+4. Sampling is greedy.
+
+So, in the case of two identical devices and an equal model split, we are still not utilizing 25% of the compute.
+Improving the above points is probably easier to do in separate changes, to keep reviews manageable.
+
+### Setup
+
+Devices:
+* Apple M1 16GB
+* Apple M2 24GB
+* Connected with a Thunderbolt 4 cable, using TCP/IP over Thunderbolt.
+
+Models:
+* Meta-Llama-3-8B-Instruct-fp16 as the main model
+* Meta-Llama-3-8B-Instruct-v2.Q2_K as the speculation (draft) model
+
+We could use different models as well.
+
+On M1:
+```
+bin/rpc-server -p 10001 -m 10000
+```
+
+On M2:
+```
+bin/rpc-server -p 10001 -m 10000
+bin/rpc-server -p 20002 -m 4000
+```
+
+Also on M2:
+```
+./bin/duo -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf -md ../../llms/gguf/Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1 --rpcd "localhost:20002"
+...
+decoded 256 tokens in 32.03 s, speed: 7.99 t/s
+```
+
+Compare that with running main with the same two RPC servers:
+```
+./bin/main -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1
+...
+```
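
Before the source diff, here is a minimal, self-contained sketch of the coordination pattern the README describes: the target thread and the speculation thread share a candidate token sequence behind a mutex, and the draft only grows while the speculating instance is the vacant one. This is only an illustration of the idea, not the example's code: plain `int` tokens, the sleep intervals, and the hard-coded values are stand-ins, and no llama.cpp calls are made; only the field names mirror the `speculation_context` struct changed in the diff below.

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Shared state between the target thread (main model) and the speculation thread.
// Field names mirror speculation_context in the diff below; the types are simplified.
struct speculation_context {
    std::vector<int> candidate; // accepted prefix + speculated tail
    int32_t          vacant_id; // which instance is currently NOT running the main model
    std::mutex       mtx;
    bool             done;
};

int main() {
    speculation_context spec_ctx;
    spec_ctx.candidate = {1, 2, 3}; // pretend prompt tokens
    spec_ctx.vacant_id = 0;         // instance 0 starts out vacant
    spec_ctx.done      = false;

    // Speculation thread: extend the candidate only while our instance is the vacant one.
    // In duo, split_done_cb updates vacant_id as the pipelined evaluation of the main
    // model moves from one device to the other; here it simply stays at 0.
    std::thread spec([&spec_ctx] {
        int next = 4;
        for (;;) {
            {
                std::lock_guard<std::mutex> g(spec_ctx.mtx);
                if (spec_ctx.done) break;
                if (spec_ctx.vacant_id == 0) {
                    spec_ctx.candidate.push_back(next++); // draft one more token
                }
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
    });

    // Target thread (here: main): pretend to evaluate for a while, then stop speculation.
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
    {
        std::lock_guard<std::mutex> g(spec_ctx.mtx);
        spec_ctx.done = true;
        std::printf("candidate grew to %zu tokens while the target was busy\n",
                    spec_ctx.candidate.size());
    }
    spec.join();
    return 0;
}
```

In duo itself the target thread consumes and verifies the candidate instead of just printing its size, and `split_done_cb` (second hunk below) is what sets `vacant_id`.
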
@@ -48,7 +48,7 @@ using llama_tokens = std::vector<llama_token>;
 struct speculation_context
 {
     llama_tokens candidate;
-    int32_t active_id;
+    int32_t vacant_id; // not running main model
     std::mutex mtx;
     bool done;
 };

@@ -60,8 +60,7 @@ static void split_done_cb(int split)
     if (split == 1 || split == 2)
     {
         std::lock_guard<std::mutex> guard(spec_ctx.mtx);
-        fprintf(stderr, "split_done = %d\n", split);
-        spec_ctx.active_id = split - 1;
+        spec_ctx.vacant_id = split - 1;
     }
 }

@@ -97,13 +96,11 @@ static std::vector<llama_token> greedy_tokens(
 }
 
 static int speculation(
-    std::vector<llama_model *> model,
+    llama_model * model,
     speculation_context * spec_ctx,
-    std::vector<llama_context *> ctx,
+    llama_context * ctx,
     llama_tokens input /* copy here */) {
 
-    int32_t active = 1;
-
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     for (size_t i = 0; i < input.size(); i++)

@@ -113,7 +110,7 @@ static int speculation(
 
     batch.logits[batch.n_tokens - 1] = true;
 
-    if (llama_decode(ctx[active], batch) != 0) {
+    if (llama_decode(ctx, batch) != 0) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
         return 1;
     }

@@ -129,7 +126,11 @@ static int speculation(
         bool wait = false;
         {
             std::lock_guard<std::mutex> g(spec_ctx->mtx);
-            if (spec_ctx->active_id != 0)
+            if (spec_ctx->done)
+            {
+                break;
+            }
+            if (spec_ctx->vacant_id != 0)
             {
                 wait = true;
             }

@@ -141,7 +142,7 @@ static int speculation(
         }
 
 
-        auto next_tokens = greedy_tokens(model[active], ctx[active], logit_idx, logit_idx + 1);
+        auto next_tokens = greedy_tokens(model, ctx, logit_idx, logit_idx + 1);
         if (next_tokens.size() != 1) {
             fprintf(stderr, "invalid next tokens\n");
             return 1;

@@ -151,10 +152,6 @@ static int speculation(
 
         {
             std::lock_guard<std::mutex> _lock(spec_ctx->mtx);
-            if (spec_ctx->done)
-            {
-                break;
-            }
             auto& shared = spec_ctx->candidate;
             bool match = true;
             match_len = local.size() - 1;

@@ -164,9 +161,7 @@ static int speculation(
                 {
                     match = false;
                     match_len = i;
-                    // here we need to clear both contexts
-                    llama_kv_cache_seq_rm(ctx[0], 0, i, -1);
-                    //llama_kv_cache_seq_rm(ctx[1], 0, i, -1);
+                    llama_kv_cache_seq_rm(ctx, 0, i, -1);
                     break;
                 }
             }
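
The hunk above is the core of the linear, greedy speculation noted in the README (points 3 and 4): the locally drafted tokens are compared against the sequence the target actually accepted, the matching prefix is kept, and the draft's KV cache is truncated at the first mismatch. As a rough standalone sketch of that step, with plain `int` tokens and a hypothetical `rollback_kv_cache` callback standing in for `llama_kv_cache_seq_rm`:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Compare the locally drafted sequence against the sequence the target accepted.
// Keeps the matching prefix, truncates the (simulated) KV cache at the first
// mismatch, and resumes drafting from the accepted sequence.
static size_t match_and_rollback(std::vector<int> & local,
                                 const std::vector<int> & shared,
                                 const std::function<void(size_t)> & rollback_kv_cache) {
    size_t match_len = std::min(local.size(), shared.size());
    for (size_t i = 0; i < std::min(local.size(), shared.size()); i++) {
        if (local[i] != shared[i]) {
            match_len = i;
            rollback_kv_cache(i); // drop cached entries from the first mismatch onward
            break;
        }
    }
    local = shared; // continue drafting from what the target accepted
    return match_len;
}

int main() {
    std::vector<int> drafted  = {1, 2, 3, 7, 8}; // draft model's guesses
    std::vector<int> accepted = {1, 2, 3, 4};    // tokens the target actually produced

    size_t n = match_and_rollback(drafted, accepted, [](size_t from) {
        std::printf("rollback KV cache from position %zu\n", from);
    });
    std::printf("matching prefix length: %zu\n", n); // prints 3
    return 0;
}
```

In the example itself this runs under the `spec_ctx` mutex, and after this change only the single remaining draft context needs its cache cleared, hence the removed comment about clearing both contexts.
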
@@ -178,11 +173,6 @@ static int speculation(
             {
                 local = shared;
             }
-            if (active != spec_ctx->active_id)
-            {
-                active = spec_ctx->active_id;
-                fprintf(stderr, "updating active_id = %d\n", active);
-            }
         }
 
         llama_batch_clear(batch);

@@ -194,7 +184,7 @@ static int speculation(
 
         logit_idx = batch.n_tokens - 1;
 
-        if (llama_decode(ctx[active], batch) != 0)
+        if (llama_decode(ctx, batch) != 0)
         {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;

@@ -317,20 +307,15 @@ static int target(
             break;
         }
 
-        fprintf(stderr, "\ntgt: input_seq.size() = %zu\n", input_seq.size());
-
         llama_batch_clear(batch);
         for (size_t i = 0; i < input_seq.size(); i++)
         {
             llama_batch_add(batch, input_seq[i], n_cur - 1 + i, { 0 }, true);
         }
-        auto s_us = ggml_time_us();
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
-        auto eval_us = ggml_time_us() - s_us;
-        fprintf(stderr, "eval_time: %lld", eval_us);
         logits_from = 0;
         logits_to = input_seq.size();
     }

@@ -362,14 +347,6 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    std::string draft_rpcs = params.rpc_servers_draft;
-    size_t i = draft_rpcs.find(',');
-    if (i == std::string::npos || draft_rpcs.find(',', i + 1) != std::string::npos)
-    {
-        fprintf(stderr, "drpc must contain exactly two servers\n");
-        return 1;
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -383,8 +360,8 @@ int main(int argc, char ** argv) {
     spec_ctx.candidate = input;
 
     // prepare draft model and contexts. No need for two model instances?
-    std::vector<llama_model *> draft_models = {nullptr, nullptr};
-    std::vector<llama_context *> draft_ctx = {nullptr, nullptr};
+    llama_model * draft_model = nullptr;
+    llama_context * draft_ctx = nullptr;
 
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;

@@ -395,23 +372,19 @@ int main(int argc, char ** argv) {
     params.n_threads_batch = params.n_threads_batch_draft;
 
     params.cb_split_done = nullptr;
-    params.rpc_servers = draft_rpcs.substr(0, i);
-    std::tie(draft_models[0], draft_ctx[0]) = llama_init_from_gpt_params(params);
-    params.rpc_servers = draft_rpcs.substr(i + 1);
-    std::tie(draft_models[1], draft_ctx[1]) = llama_init_from_gpt_params(params);
-    std::thread spec_thread = std::thread(speculation, draft_models, &spec_ctx, draft_ctx, input);
+    params.rpc_servers = params.rpc_servers_draft;
+    std::tie(draft_model, draft_ctx) = llama_init_from_gpt_params(params);
+    std::thread spec_thread = std::thread(speculation, draft_model, &spec_ctx, draft_ctx, input);
 
     target(model, ctx, input, params.n_predict);
 
     spec_thread.join();
 
     llama_free(ctx);
-    llama_free(draft_ctx[0]);
-    llama_free(draft_ctx[1]);
+    llama_free(draft_ctx);
 
     llama_free_model(model);
-    llama_free_model(draft_models[0]);
-    llama_free_model(draft_models[1]);
+    llama_free_model(draft_model);
 
     llama_backend_free();