From 83aabb3fb74f9c2a4ed70e687a633bcbdecc5414 Mon Sep 17 00:00:00 2001
From: Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
Date: Fri, 24 May 2024 23:56:48 -0400
Subject: [PATCH] readme

---
 .gitignore             |  1 +
 examples/duo/README.md | 67 ++++++++++++++++++++++++++++++++++++++----
 examples/duo/duo.cpp   | 67 +++++++++++++-----------------------------
 3 files changed, 83 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 50ae0973a..f0169d1db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,7 @@ models-mnt
 /beam-search
 /benchmark-matmult
 /convert-llama2c-to-ggml
+/duo
 /embd-input-test
 /embedding
 /eval-callback
diff --git a/examples/duo/README.md b/examples/duo/README.md
index dfcbb1023..7cb8e6e69 100644
--- a/examples/duo/README.md
+++ b/examples/duo/README.md
@@ -1,7 +1,64 @@
 ## duo
 
-Minimal example. What's not implemented, but can be implemented separately in pieces:
-* tree-based speculation
-* correct sampling
-* support more than 2 instances
-* just one instance speculates
+This is a demo of an approach to distributed evaluation/speculation using the rpc backend.
+
+It is a fairly minimal app, and many more improvements could be made.
+
+### Idea
+
+The idea comes from the discussion here: https://github.com/ggerganov/llama.cpp/discussions/6853#discussioncomment-9473494.
+When we run a large model and distribute the evaluation across multiple devices, they still evaluate the model sequentially.
+In the case of two identical devices and an equal model split, we leave half of the compute on the table, assuming an individual use case (e.g. a personal chat).
+
+We can use this idle compute to speculate and then evaluate a larger sequence of tokens at once.
+
+This demo is fairly limited:
+1. It expects exactly two instances running the main model
+2. Only one of these instances speculates
+3. Speculation is linear
+4. Sampling is greedy
+
+So, in the case of two identical devices and an equal model split, we still leave about 25% of the compute unused: only one of the two instances speculates during its idle half.
+Improving the above points is probably better done in separate changes, to keep each review small.
+
+### Setup
+
+Devices:
+* Apple M1 16GB
+* Apple M2 24GB
+* Connected with a Thunderbolt 4 cable, using TCP/IP over Thunderbolt.
+
+Models:
+* Meta-Llama-3-8B-Instruct-fp16 as the main model
+* Meta-Llama-3-8B-Instruct-v2.Q2_K as the speculation model
+
+Different models could be used as well.
+
+On the M1:
+```
+bin/rpc-server -p 10001 -m 10000
+```
+
+On the M2:
+```
+bin/rpc-server -p 10001 -m 10000
+bin/rpc-server -p 20002 -m 4000
+```
+
+Also on the M2, run `duo` itself:
+```
+./bin/duo -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf -md ../../llms/gguf/Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1 --rpcd "localhost:20002"
+
+...
+decoded 256 tokens in 32.03 s, speed: 7.99 t/s
+
+```
+
+Compare that with running `main` with the same two rpc servers:
+```
+./bin/main -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1
+...
+
+```
+
+
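The acceptance rule behind the "greedy sampling" limitation above can be illustrated with a standalone toy sketch (editorial illustration, not code from this patch; `accept_tokens` and the integer "tokens" are made up): the main model scores the whole speculated sequence in one batch, and we keep the longest prefix on which its greedy choices agree with the speculation, plus the main model's own token at the first disagreement.

```
#include <cstdio>
#include <vector>

using token = int; // stand-in for llama_token

// candidate:   tokens proposed by the speculating instance
// main_greedy: the main model's greedy pick for each candidate position,
//              i.e. main_greedy[i] is the token it emits after the prompt
//              plus candidate[0..i-1]; it may contain one extra entry past
//              the end of the candidate.
static std::vector<token> accept_tokens(const std::vector<token> & candidate,
                                        const std::vector<token> & main_greedy) {
    std::vector<token> accepted;
    size_t i = 0;
    for (; i < candidate.size() && i < main_greedy.size(); ++i) {
        if (candidate[i] != main_greedy[i]) {
            break; // first disagreement: stop accepting speculated tokens
        }
        accepted.push_back(candidate[i]);
    }
    if (i < main_greedy.size()) {
        accepted.push_back(main_greedy[i]); // the main model's own token is always kept
    }
    return accepted;
}

int main() {
    // Five speculated tokens; the main model disagrees at position 3.
    std::vector<token> candidate   = {11, 12, 13, 14, 15};
    std::vector<token> main_greedy = {11, 12, 13, 99};
    for (token t : accept_tokens(candidate, main_greedy)) {
        std::printf("%d ", t); // prints: 11 12 13 99
    }
    std::printf("\n");
    return 0;
}
```

Because verification is greedy, the output matches what the main model alone would produce; the gain is that several accepted tokens cost only one batched decode of the main model. The same accept-up-to-first-mismatch bookkeeping appears as `match_len` in `duo.cpp` below.
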
diff --git a/examples/duo/duo.cpp b/examples/duo/duo.cpp
index 4c65769de..1b02b4ad3 100644
--- a/examples/duo/duo.cpp
+++ b/examples/duo/duo.cpp
@@ -48,7 +48,7 @@ using llama_tokens = std::vector<llama_token>;
 struct speculation_context
 {
     llama_tokens candidate;
-    int32_t      active_id;
+    int32_t      vacant_id; // the split that is not currently running the main model
     std::mutex   mtx;
     bool         done;
 };
@@ -60,8 +60,7 @@ static void split_done_cb(int split)
 {
     if (split == 1 || split == 2)
     {
         std::lock_guard<std::mutex> guard(spec_ctx.mtx);
-        fprintf(stderr, "split_done = %d\n", split);
-        spec_ctx.active_id = split - 1;
+        spec_ctx.vacant_id = split - 1;
     }
 }
@@ -97,13 +96,11 @@ static std::vector<llama_token> greedy_tokens(
 }
 
 static int speculation(
-    std::vector<llama_model *> model,
+    llama_model * model,
     speculation_context * spec_ctx,
-    std::vector<llama_context *> ctx,
+    llama_context * ctx,
     llama_tokens input /* copy here */)
 {
-    int32_t active = 1;
-
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     for (size_t i = 0; i < input.size(); i++)
@@ -113,7 +110,7 @@ static int speculation(
 
     batch.logits[batch.n_tokens - 1] = true;
 
-    if (llama_decode(ctx[active], batch) != 0) {
+    if (llama_decode(ctx, batch) != 0) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
         return 1;
     }
@@ -129,7 +126,11 @@ static int speculation(
         bool wait = false;
        {
             std::lock_guard<std::mutex> g(spec_ctx->mtx);
-            if (spec_ctx->active_id != 0)
+            if (spec_ctx->done)
+            {
+                break;
+            }
+            if (spec_ctx->vacant_id != 0)
             {
                 wait = true;
             }
@@ -141,7 +142,7 @@ static int speculation(
         }
 
-        auto next_tokens = greedy_tokens(model[active], ctx[active], logit_idx, logit_idx + 1);
+        auto next_tokens = greedy_tokens(model, ctx, logit_idx, logit_idx + 1);
         if (next_tokens.size() != 1) {
             fprintf(stderr, "invalid next tokens\n");
             return 1;
         }
@@ -151,10 +152,6 @@ static int speculation(
 
         {
             std::lock_guard<std::mutex> _lock(spec_ctx->mtx);
-            if (spec_ctx->done)
-            {
-                break;
-            }
             auto& shared = spec_ctx->candidate;
             bool match = true;
             match_len = local.size() - 1;
@@ -164,9 +161,7 @@ static int speculation(
                 {
                     match = false;
                     match_len = i;
-                    // here we need to clear both contexts
-                    llama_kv_cache_seq_rm(ctx[0], 0, i, -1);
-                    //llama_kv_cache_seq_rm(ctx[1], 0, i, -1);
+                    llama_kv_cache_seq_rm(ctx, 0, i, -1);
                     break;
                 }
             }
@@ -178,11 +173,6 @@ static int speculation(
             {
                 local = shared;
             }
-            if (active != spec_ctx->active_id)
-            {
-                active = spec_ctx->active_id;
-                fprintf(stderr, "updating active_id = %d\n", active);
-            }
         }
 
         llama_batch_clear(batch);
@@ -194,7 +184,7 @@ static int speculation(
         logit_idx = batch.n_tokens - 1;
 
-        if (llama_decode(ctx[active], batch) != 0)
+        if (llama_decode(ctx, batch) != 0)
         {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
@@ -317,20 +307,15 @@ static int target(
             break;
         }
 
-        fprintf(stderr, "\ntgt: input_seq.size() = %zu\n", input_seq.size());
-
         llama_batch_clear(batch);
         for (size_t i = 0; i < input_seq.size(); i++)
         {
             llama_batch_add(batch, input_seq[i], n_cur - 1 + i, { 0 }, true);
         }
-        auto s_us = ggml_time_us();
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
-        auto eval_us = ggml_time_us() - s_us;
-        fprintf(stderr, "eval_time: %lld", eval_us);
         logits_from = 0;
         logits_to = input_seq.size();
     }
@@ -362,14 +347,6 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    std::string draft_rpcs = params.rpc_servers_draft;
-    size_t i = draft_rpcs.find(',');
-    if (i == std::string::npos || draft_rpcs.find(',', i + 1) != std::string::npos)
-    {
-        fprintf(stderr, "drpc must contain exactly two servers\n");
-        return 1;
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
@@ -383,8 +360,8 @@ int main(int argc, char ** argv) {
     spec_ctx.candidate = input;
 
     // prepare draft model and contexts. No need for two model instances?
-    std::vector<llama_model *>   draft_models = {nullptr, nullptr};
-    std::vector<llama_context *> draft_ctx    = {nullptr, nullptr};
+    llama_model   * draft_model = nullptr;
+    llama_context * draft_ctx   = nullptr;
 
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
@@ -395,23 +372,19 @@ int main(int argc, char ** argv) {
     params.n_threads_batch = params.n_threads_batch_draft;
     params.cb_split_done = nullptr;
 
-    params.rpc_servers = draft_rpcs.substr(0, i);
-    std::tie(draft_models[0], draft_ctx[0]) = llama_init_from_gpt_params(params);
-    params.rpc_servers = draft_rpcs.substr(i + 1);
-    std::tie(draft_models[1], draft_ctx[1]) = llama_init_from_gpt_params(params);
-    std::thread spec_thread = std::thread(speculation, draft_models, &spec_ctx, draft_ctx, input);
+    params.rpc_servers = params.rpc_servers_draft;
+    std::tie(draft_model, draft_ctx) = llama_init_from_gpt_params(params);
+    std::thread spec_thread = std::thread(speculation, draft_model, &spec_ctx, draft_ctx, input);
 
     target(model, ctx, input, params.n_predict);
 
     spec_thread.join();
 
     llama_free(ctx);
-    llama_free(draft_ctx[0]);
-    llama_free(draft_ctx[1]);
+    llama_free(draft_ctx);
     llama_free_model(model);
-    llama_free_model(draft_models[0]);
-    llama_free_model(draft_models[1]);
+    llama_free_model(draft_model);
 
     llama_backend_free();
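For readers skimming the patch, the thread coordination it relies on (a mutex-protected `speculation_context` holding the shared candidate plus a `done` flag, with `split_done_cb` recording which split is free to speculate) boils down to the pattern below. This is a standalone editorial sketch under those assumptions, using plain standard C++ with fake integer tokens rather than the llama.cpp API; the names here are made up for illustration.

```
#include <chrono>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct shared_state {
    std::vector<int> candidate; // speculated continuation, shared between threads
    bool             done = false;
    std::mutex       mtx;
};

// Speculation side: keep extending the shared candidate with cheap guesses
// until the target side signals completion.
static void speculator(shared_state * st) {
    int next = 100;
    for (;;) {
        {
            std::lock_guard<std::mutex> g(st->mtx);
            if (st->done) {
                return;
            }
            st->candidate.push_back(next++); // pretend this token came from a draft model
        }
        // pretend a draft evaluation takes some time per token
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
}

int main() {
    shared_state st;
    std::thread spec(speculator, &st);

    // Target side: periodically take whatever has been speculated so far,
    // verify it against the main model (here: just consume it), repeat.
    for (int step = 0; step < 5; ++step) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        std::lock_guard<std::mutex> g(st.mtx);
        std::printf("step %d: %zu speculated tokens pending\n", step, st.candidate.size());
        st.candidate.clear(); // accepted/rejected tokens would be reconciled here
    }

    {
        std::lock_guard<std::mutex> g(st.mtx);
        st.done = true; // tell the speculator to stop
    }
    spec.join();
    return 0;
}
```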