readme

parent 10d5aefed5
commit 83aabb3fb7

3 changed files with 83 additions and 52 deletions

.gitignore (vendored), 1 addition:
@@ -48,6 +48,7 @@ models-mnt
 /beam-search
 /benchmark-matmult
 /convert-llama2c-to-ggml
+/duo
 /embd-input-test
 /embedding
 /eval-callback

@@ -1,7 +1,64 @@
 ## duo
 
-Minimal example. What's not implemented, but can be implemented separately in pieces:
-* tree-based speculation
-* correct sampling
-* support more than 2 instances
-* just one instance speculates
+This is a demo of an approach to distributed evaluation and speculation using RPC.
+
+It is a fairly minimal app, and many more improvements could be made.
+
+### Idea
+
+The idea comes from the discussion here: https://github.com/ggerganov/llama.cpp/discussions/6853#discussioncomment-9473494.
+
+When we run a large model and distribute its evaluation across multiple devices, the devices still evaluate the model sequentially.
+In the case of two identical devices and an equal model split, we would leave half of the compute on the table, assuming an individual use case (e.g. a personal chat).
+
+We can utilize this idle compute to speculate, and then evaluate a larger sequence of tokens at once.
+
+This demo is fairly limited:
+1. It expects two instances running the main model.
+2. Only one of these instances speculates.
+3. Speculation is linear.
+4. Sampling is greedy.
+
+So, in the case of two identical devices and an equal model split, we are still not utilizing 25% of the compute.
+Improving the above points is probably easier to do in separate changes, to keep reviews manageable.
+
+### Setup
+
+Devices:
+* Apple M1 16GB
+* Apple M2 24GB
+* Connected with a Thunderbolt 4 cable, using TCP/IP over Thunderbolt.
+
+Models:
+* Meta-Llama-3-8B-Instruct-fp16 as the main model
+* Meta-Llama-3-8B-Instruct-v2.Q2_K as the speculation (draft) model
+
+We could use different models as well.
+
+On M1:
+```
+bin/rpc-server -p 10001 -m 10000
+```
+
+On M2:
+```
+bin/rpc-server -p 10001 -m 10000
+bin/rpc-server -p 20002 -m 4000
+```
+
+Also on M2:
+```
+./bin/duo -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf -md ../../llms/gguf/Meta-Llama-3-8B-Instruct-v2.Q2_K.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1 --rpcd "localhost:20002"
+...
+decoded 256 tokens in 32.03 s, speed: 7.99 t/s
+```
+
+Compare that with running main with the same two RPC servers:
+```
+./bin/main -m ../../llms/gguf/Meta-Llama-3-8B-Instruct-fp16.gguf --rpc "localhost:10001,169.254.77.16:10001" -p "Please illustrate the difference between concurrency and parallelism in python." -n 256 -ngl 99 -t 1
+...
+```
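
Before the source diff, here is a minimal, self-contained sketch of the coordination pattern the README describes: the target thread and the speculation thread share a candidate token sequence behind a mutex, and the draft only grows while the speculating instance is the vacant one. This is only an illustration of the idea, not the example's code: plain `int` tokens, the sleep intervals, and the hard-coded values are stand-ins, and no llama.cpp calls are made; only the field names mirror the `speculation_context` struct changed in the diff below.

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Shared state between the target thread (main model) and the speculation thread.
// Field names mirror speculation_context in the diff below; the types are simplified.
struct speculation_context {
    std::vector<int> candidate; // accepted prefix + speculated tail
    int32_t          vacant_id; // which instance is currently NOT running the main model
    std::mutex       mtx;
    bool             done;
};

int main() {
    speculation_context spec_ctx;
    spec_ctx.candidate = {1, 2, 3}; // pretend prompt tokens
    spec_ctx.vacant_id = 0;         // instance 0 starts out vacant
    spec_ctx.done      = false;

    // Speculation thread: extend the candidate only while our instance is the vacant one.
    // In duo, split_done_cb updates vacant_id as the pipelined evaluation of the main
    // model moves from one device to the other; here it simply stays at 0.
    std::thread spec([&spec_ctx] {
        int next = 4;
        for (;;) {
            {
                std::lock_guard<std::mutex> g(spec_ctx.mtx);
                if (spec_ctx.done) break;
                if (spec_ctx.vacant_id == 0) {
                    spec_ctx.candidate.push_back(next++); // draft one more token
                }
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
        }
    });

    // Target thread (here: main): pretend to evaluate for a while, then stop speculation.
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
    {
        std::lock_guard<std::mutex> g(spec_ctx.mtx);
        spec_ctx.done = true;
        std::printf("candidate grew to %zu tokens while the target was busy\n",
                    spec_ctx.candidate.size());
    }
    spec.join();
    return 0;
}
```

In duo itself the target thread consumes and verifies the candidate instead of just printing its size, and `split_done_cb` (second hunk below) is what sets `vacant_id`.
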
@@ -48,7 +48,7 @@ using llama_tokens = std::vector<llama_token>;
 struct speculation_context
 {
     llama_tokens candidate;
-    int32_t active_id;
+    int32_t vacant_id; // not running main model
     std::mutex mtx;
     bool done;
 };

@@ -60,8 +60,7 @@ static void split_done_cb(int split)
     if (split == 1 || split == 2)
     {
         std::lock_guard<std::mutex> guard(spec_ctx.mtx);
-        fprintf(stderr, "split_done = %d\n", split);
-        spec_ctx.active_id = split - 1;
+        spec_ctx.vacant_id = split - 1;
     }
 }

@@ -97,13 +96,11 @@ static std::vector<llama_token> greedy_tokens(
 }
 
 static int speculation(
-    std::vector<llama_model *> model,
+    llama_model * model,
     speculation_context * spec_ctx,
-    std::vector<llama_context *> ctx,
+    llama_context * ctx,
     llama_tokens input /* copy here */) {
 
-    int32_t active = 1;
-
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     for (size_t i = 0; i < input.size(); i++)

@@ -113,7 +110,7 @@ static int speculation(
 
     batch.logits[batch.n_tokens - 1] = true;
 
-    if (llama_decode(ctx[active], batch) != 0) {
+    if (llama_decode(ctx, batch) != 0) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
         return 1;
     }

@@ -129,7 +126,11 @@ static int speculation(
         bool wait = false;
         {
             std::lock_guard<std::mutex> g(spec_ctx->mtx);
-            if (spec_ctx->active_id != 0)
+            if (spec_ctx->done)
+            {
+                break;
+            }
+            if (spec_ctx->vacant_id != 0)
             {
                 wait = true;
             }

@@ -141,7 +142,7 @@ static int speculation(
         }
 
 
-        auto next_tokens = greedy_tokens(model[active], ctx[active], logit_idx, logit_idx + 1);
+        auto next_tokens = greedy_tokens(model, ctx, logit_idx, logit_idx + 1);
         if (next_tokens.size() != 1) {
             fprintf(stderr, "invalid next tokens\n");
             return 1;

@@ -151,10 +152,6 @@ static int speculation(
 
         {
             std::lock_guard<std::mutex> _lock(spec_ctx->mtx);
-            if (spec_ctx->done)
-            {
-                break;
-            }
             auto& shared = spec_ctx->candidate;
             bool match = true;
             match_len = local.size() - 1;

@@ -164,9 +161,7 @@ static int speculation(
                 {
                     match = false;
                     match_len = i;
-                    // here we need to clear both contexts
-                    llama_kv_cache_seq_rm(ctx[0], 0, i, -1);
-                    //llama_kv_cache_seq_rm(ctx[1], 0, i, -1);
+                    llama_kv_cache_seq_rm(ctx, 0, i, -1);
                     break;
                 }
             }
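
The hunk above is the core of the linear, greedy speculation noted in the README (points 3 and 4): the locally drafted tokens are compared against the sequence the target actually accepted, the matching prefix is kept, and the draft's KV cache is truncated at the first mismatch. As a rough standalone sketch of that step, with plain `int` tokens and a hypothetical `rollback_kv_cache` callback standing in for `llama_kv_cache_seq_rm`:

```cpp
#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Compare the locally drafted sequence against the sequence the target accepted.
// Keeps the matching prefix, truncates the (simulated) KV cache at the first
// mismatch, and resumes drafting from the accepted sequence.
static size_t match_and_rollback(std::vector<int> & local,
                                 const std::vector<int> & shared,
                                 const std::function<void(size_t)> & rollback_kv_cache) {
    size_t match_len = std::min(local.size(), shared.size());
    for (size_t i = 0; i < std::min(local.size(), shared.size()); i++) {
        if (local[i] != shared[i]) {
            match_len = i;
            rollback_kv_cache(i); // drop cached entries from the first mismatch onward
            break;
        }
    }
    local = shared; // continue drafting from what the target accepted
    return match_len;
}

int main() {
    std::vector<int> drafted  = {1, 2, 3, 7, 8}; // draft model's guesses
    std::vector<int> accepted = {1, 2, 3, 4};    // tokens the target actually produced

    size_t n = match_and_rollback(drafted, accepted, [](size_t from) {
        std::printf("rollback KV cache from position %zu\n", from);
    });
    std::printf("matching prefix length: %zu\n", n); // prints 3
    return 0;
}
```

In the example itself this runs under the `spec_ctx` mutex, and after this change only the single remaining draft context needs its cache cleared, hence the removed comment about clearing both contexts.
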
@@ -178,11 +173,6 @@ static int speculation(
             {
                 local = shared;
             }
-            if (active != spec_ctx->active_id)
-            {
-                active = spec_ctx->active_id;
-                fprintf(stderr, "updating active_id = %d\n", active);
-            }
         }
 
         llama_batch_clear(batch);

@@ -194,7 +184,7 @@ static int speculation(
 
         logit_idx = batch.n_tokens - 1;
 
-        if (llama_decode(ctx[active], batch) != 0)
+        if (llama_decode(ctx, batch) != 0)
         {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;

@@ -317,20 +307,15 @@ static int target(
             break;
         }
 
-        fprintf(stderr, "\ntgt: input_seq.size() = %zu\n", input_seq.size());
-
         llama_batch_clear(batch);
         for (size_t i = 0; i < input_seq.size(); i++)
         {
             llama_batch_add(batch, input_seq[i], n_cur - 1 + i, { 0 }, true);
         }
-        auto s_us = ggml_time_us();
         if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
-        auto eval_us = ggml_time_us() - s_us;
-        fprintf(stderr, "eval_time: %lld", eval_us);
         logits_from = 0;
         logits_to = input_seq.size();
     }

@@ -362,14 +347,6 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    std::string draft_rpcs = params.rpc_servers_draft;
-    size_t i = draft_rpcs.find(',');
-    if (i == std::string::npos || draft_rpcs.find(',', i + 1) != std::string::npos)
-    {
-        fprintf(stderr, "drpc must contain exactly two servers\n");
-        return 1;
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -383,8 +360,8 @@ int main(int argc, char ** argv) {
     spec_ctx.candidate = input;
 
     // prepare draft model and contexts. No need for two model instances?
-    std::vector<llama_model *> draft_models = {nullptr, nullptr};
-    std::vector<llama_context *> draft_ctx = {nullptr, nullptr};
+    llama_model * draft_model = nullptr;
+    llama_context * draft_ctx = nullptr;
 
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;

@@ -395,23 +372,19 @@ int main(int argc, char ** argv) {
     params.n_threads_batch = params.n_threads_batch_draft;
 
     params.cb_split_done = nullptr;
-    params.rpc_servers = draft_rpcs.substr(0, i);
-    std::tie(draft_models[0], draft_ctx[0]) = llama_init_from_gpt_params(params);
-    params.rpc_servers = draft_rpcs.substr(i + 1);
-    std::tie(draft_models[1], draft_ctx[1]) = llama_init_from_gpt_params(params);
-    std::thread spec_thread = std::thread(speculation, draft_models, &spec_ctx, draft_ctx, input);
+    params.rpc_servers = params.rpc_servers_draft;
+    std::tie(draft_model, draft_ctx) = llama_init_from_gpt_params(params);
+    std::thread spec_thread = std::thread(speculation, draft_model, &spec_ctx, draft_ctx, input);
 
     target(model, ctx, input, params.n_predict);
 
     spec_thread.join();
 
     llama_free(ctx);
-    llama_free(draft_ctx[0]);
-    llama_free(draft_ctx[1]);
+    llama_free(draft_ctx);
 
     llama_free_model(model);
-    llama_free_model(draft_models[0]);
-    llama_free_model(draft_models[1]);
+    llama_free_model(draft_model);
 
     llama_backend_free();