Oleksandr Kuvshynov 2024-05-24 23:56:48 -04:00
parent 10d5aefed5
commit 83aabb3fb7
3 changed files with 83 additions and 52 deletions

.gitignore
View file

@@ -48,6 +48,7 @@ models-mnt
 /beam-search
 /benchmark-matmult
 /convert-llama2c-to-ggml
+/duo
 /embd-input-test
 /embedding
 /eval-callback

View file

@@ -1,7 +1,64 @@
## duo
-Minimal example. What's not implemented, but can be implemented separately in pieces:
-* tree-based speculation
-* correct sampling
-* support more than 2 instances
-* just one instance speculates
This is a demo of distributed evaluation/speculation over RPC.
It is a fairly minimal app; many further improvements are possible.
### Idea
The idea comes from this discussion: https://github.com/ggerganov/llama.cpp/discussions/6853#discussioncomment-9473494.
When we run a large model with the evaluation distributed across multiple devices, the devices still evaluate the model sequentially.
In the case of two identical devices and an equal model split, we would leave half of the compute on the table, assuming a single-user use case (e.g. personal chat).
We can use that idle compute to speculate, and then evaluate a larger sequence of tokens on the main model at once.
This demo is fairly limited:
1. It expects exactly two instances running the main model.
2. Only one of these instances speculates.
3. Speculation is linear (a single candidate sequence, no tree).
4. Sampling is greedy.
So, in the case of two identical devices and an equal model split, we still leave about 25% of the compute unused: speculation fills the idle time of one instance, but the other instance still sits idle half the time.
Improving on the points above is probably easier to do in separate changes, to keep reviews manageable. The sketch below illustrates the loop this demo implements.
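
Here is a self-contained illustration of linear greedy speculation, using toy stand-in models rather than the llama.cpp API (an assumption-laden sketch: real verification happens in a single batched decode, while the token-by-token calls here are only for clarity):
```
#include <cstdio>
#include <functional>
#include <vector>

using Token = int;
// Greedy next-token oracle for a given prefix; a stand-in for model + context.
using Model = std::function<Token(const std::vector<Token> &)>;

int main() {
    // Toy models: the draft occasionally disagrees, like a quantized draft would.
    Model target = [](const std::vector<Token> & p) { return Token((p.back() + 1) % 10); };
    Model draft  = [](const std::vector<Token> & p) {
        return Token((p.back() + (p.back() % 7 == 3 ? 2 : 1)) % 10);
    };

    std::vector<Token> out = {0}; // the "prompt"
    const size_t n_draft = 4;
    while (out.size() < 24) {
        // the draft model proposes a short linear candidate
        std::vector<Token> prefix = out, cand;
        for (size_t i = 0; i < n_draft; i++) {
            prefix.push_back(draft(prefix));
            cand.push_back(prefix.back());
        }
        // the main model verifies: accept the longest matching prefix, then
        // append one token of its own (the real code batches this verification
        // into a single decode over all candidate tokens)
        size_t n_ok = 0;
        std::vector<Token> v = out;
        while (n_ok < cand.size() && target(v) == cand[n_ok]) {
            v.push_back(cand[n_ok++]);
        }
        v.push_back(target(v));
        out = v;
        printf("accepted %zu/%zu drafted tokens, %zu total\n", n_ok, n_draft, out.size());
    }
    return 0;
}
```
Every iteration makes progress (at least one token from the main model), and accepted draft tokens amortize the main model's per-token latency.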
### Setup
Devices:
* Apple M1 16GB
* Apple M2 24GB
* Connected with a Thunderbolt 4 cable, using TCP/IP over Thunderbolt (hence the link-local 169.254.x.x address below).
Models:
* Meta-Llama-3-8B-Instruct-fp16 as the main model
* Meta-Llama-3-8B-Instruct-v2.Q2_K as the draft model
Other model pairs could be used as well.
On the M1:
```
bin/rpc-server -p 10001 -m 10000
```
On the M2 (the first server joins the main-model split; the second, smaller instance will serve the draft model):
```
bin/rpc-server -p 10001 -m 10000
bin/rpc-server -p 20002 -m 4000
```
Also on the M2, run `duo` itself:
```
./bin/duo -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf -md ../../llms/gguf/Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1 --rpcd "localhost:20002"
...
decoded 256 tokens in 32.03 s, speed: 7.99 t/s
```
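For reference: `-m`/`-md` select the main and draft models, `--rpc` lists the servers sharing the main model, and `--rpcd` (read into `rpc_servers_draft` by this patch) selects the server hosting the draft model.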
Compare that with running `main` with the same two RPC servers:
```
./bin/main -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1
...
```

View file

@@ -48,7 +48,7 @@ using llama_tokens = std::vector<llama_token>;
 struct speculation_context
 {
     llama_tokens candidate;
-    int32_t active_id;
+    int32_t vacant_id; // not running main model
     std::mutex mtx;
     bool done;
 };
@@ -60,8 +60,7 @@ static void split_done_cb(int split)
     if (split == 1 || split == 2)
     {
         std::lock_guard<std::mutex> guard(spec_ctx.mtx);
-        fprintf(stderr, "split_done = %d\n", split);
-        spec_ctx.active_id = split - 1;
+        spec_ctx.vacant_id = split - 1;
     }
 }
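
The mechanism this hunk sets up: the main model's split-done callback records which instance just became vacant, and the speculation thread polls `vacant_id` and `done` under the mutex. A condensed, self-contained sketch of that handoff, with illustrative names and a toy stand-in for drafting:
```
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Condensed version of the shared state in this file (names illustrative).
struct spec_state {
    std::vector<int> candidate;      // tokens drafted / corrected so far
    int32_t          vacant_id = -1; // instance not currently running the main model
    bool             done      = false;
    std::mutex       mtx;
};

int main() {
    spec_state st;
    st.candidate = {1, 2, 3};

    // Speculation thread: drafts only while instance 0 is vacant; exits on done.
    std::thread spec([&st] {
        while (true) {
            bool wait = false;
            {
                std::lock_guard<std::mutex> g(st.mtx);
                if (st.done) break;
                if (st.vacant_id != 0) wait = true;  // device busy with the main model
                else st.candidate.push_back(st.candidate.back() + 1); // "draft" a token
            }
            if (wait) std::this_thread::sleep_for(std::chrono::microseconds(100));
        }
    });

    { // what split_done_cb does: mark instance 0 vacant after its split finished
        std::lock_guard<std::mutex> g(st.mtx);
        st.vacant_id = 0;
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(5));

    { // what the target does when generation ends
        std::lock_guard<std::mutex> g(st.mtx);
        st.done = true;
    }
    spec.join();
    printf("candidate grew to %zu tokens\n", st.candidate.size());
    return 0;
}
```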
@@ -97,13 +96,11 @@ static std::vector<llama_token> greedy_tokens(
 }

 static int speculation(
-    std::vector<llama_model *> model,
+    llama_model * model,
     speculation_context * spec_ctx,
-    std::vector<llama_context *> ctx,
+    llama_context * ctx,
     llama_tokens input /* copy here */) {
-    int32_t active = 1;

     llama_batch batch = llama_batch_init(512, 0, 1);
     for (size_t i = 0; i < input.size(); i++)
@@ -113,7 +110,7 @@ static int speculation(
     batch.logits[batch.n_tokens - 1] = true;

-    if (llama_decode(ctx[active], batch) != 0) {
+    if (llama_decode(ctx, batch) != 0) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
         return 1;
     }
@@ -129,7 +126,11 @@ static int speculation(
         bool wait = false;
         {
             std::lock_guard<std::mutex> g(spec_ctx->mtx);
-            if (spec_ctx->active_id != 0)
+            if (spec_ctx->done)
+            {
+                break;
+            }
+            if (spec_ctx->vacant_id != 0)
             {
                 wait = true;
             }
@@ -141,7 +142,7 @@ static int speculation(
         }

-        auto next_tokens = greedy_tokens(model[active], ctx[active], logit_idx, logit_idx + 1);
+        auto next_tokens = greedy_tokens(model, ctx, logit_idx, logit_idx + 1);
         if (next_tokens.size() != 1) {
             fprintf(stderr, "invalid next tokens\n");
             return 1;
@@ -151,10 +152,6 @@ static int speculation(
         {
             std::lock_guard<std::mutex> _lock(spec_ctx->mtx);
-            if (spec_ctx->done)
-            {
-                break;
-            }
             auto& shared = spec_ctx->candidate;
             bool match = true;
             match_len = local.size() - 1;
@@ -164,9 +161,7 @@ static int speculation(
                 {
                     match = false;
                     match_len = i;
-                    // here we need to clear both contexts
-                    llama_kv_cache_seq_rm(ctx[0], 0, i, -1);
-                    //llama_kv_cache_seq_rm(ctx[1], 0, i, -1);
+                    llama_kv_cache_seq_rm(ctx, 0, i, -1);
                     break;
                 }
             }
@@ -178,11 +173,6 @@ static int speculation(
             {
                 local = shared;
             }
-            if (active != spec_ctx->active_id)
-            {
-                active = spec_ctx->active_id;
-                fprintf(stderr, "updating active_id = %d\n", active);
-            }
         }

         llama_batch_clear(batch);
@@ -194,7 +184,7 @@ static int speculation(
         logit_idx = batch.n_tokens - 1;

-        if (llama_decode(ctx[active], batch) != 0)
+        if (llama_decode(ctx, batch) != 0)
         {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
@@ -317,20 +307,15 @@ static int target(
             break;
         }
-        fprintf(stderr, "\ntgt: input_seq.size() = %zu\n", input_seq.size());
         llama_batch_clear(batch);
         for (size_t i = 0; i < input_seq.size(); i++)
         {
             llama_batch_add(batch, input_seq[i], n_cur - 1 + i, { 0 }, true);
         }
-        auto s_us = ggml_time_us();
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
-        auto eval_us = ggml_time_us() - s_us;
-        fprintf(stderr, "eval_time: %lld", eval_us);
         logits_from = 0;
         logits_to = input_seq.size();
     }
@@ -362,14 +347,6 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }

-    std::string draft_rpcs = params.rpc_servers_draft;
-    size_t i = draft_rpcs.find(',');
-    if (i == std::string::npos || draft_rpcs.find(',', i + 1) != std::string::npos)
-    {
-        fprintf(stderr, "drpc must contain exactly two servers\n");
-        return 1;
-    }

     llama_backend_init();
     llama_numa_init(params.numa);
@@ -383,8 +360,8 @@ int main(int argc, char ** argv) {
     spec_ctx.candidate = input;

     // prepare draft model and contexts. No need for two model instances?
-    std::vector<llama_model *> draft_models = {nullptr, nullptr};
-    std::vector<llama_context *> draft_ctx = {nullptr, nullptr};
+    llama_model * draft_model = nullptr;
+    llama_context * draft_ctx = nullptr;

     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
@@ -395,23 +372,19 @@ int main(int argc, char ** argv) {
     params.n_threads_batch = params.n_threads_batch_draft;
     params.cb_split_done = nullptr;

-    params.rpc_servers = draft_rpcs.substr(0, i);
-    std::tie(draft_models[0], draft_ctx[0]) = llama_init_from_gpt_params(params);
-    params.rpc_servers = draft_rpcs.substr(i + 1);
-    std::tie(draft_models[1], draft_ctx[1]) = llama_init_from_gpt_params(params);
-
-    std::thread spec_thread = std::thread(speculation, draft_models, &spec_ctx, draft_ctx, input);
+    params.rpc_servers = params.rpc_servers_draft;
+    std::tie(draft_model, draft_ctx) = llama_init_from_gpt_params(params);
+
+    std::thread spec_thread = std::thread(speculation, draft_model, &spec_ctx, draft_ctx, input);

     target(model, ctx, input, params.n_predict);

     spec_thread.join();

     llama_free(ctx);
-    llama_free(draft_ctx[0]);
-    llama_free(draft_ctx[1]);
+    llama_free(draft_ctx);
     llama_free_model(model);
-    llama_free_model(draft_models[0]);
-    llama_free_model(draft_models[1]);
+    llama_free_model(draft_model);

     llama_backend_free();