diff --git a/examples/lookup/README.md b/examples/lookup/README.md
index e69de29bb..03a772c45 100644
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@@ -0,0 +1,13 @@
+# llama.cpp/examples/lookup
+
+Demonstration of Prompt Lookup Decoding
+
+https://github.com/apoorvumang/prompt-lookup-decoding
+
+The two key parameters for lookup decoding are `max_ngram_size` and `n_draft`. The first determines the maximum n-gram size to search for in the prompt, and the second specifies how many subsequent tokens to draft if a match is found.
+
+More info:
+
+https://github.com/ggerganov/llama.cpp/pull/4484
+https://github.com/ggerganov/llama.cpp/issues/4226
+
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 28b9c2c95..db97d241c 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -88,19 +88,16 @@ int main(int argc, char ** argv){
 
         int i_dft = 0;
         while (true) {
-            //LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-
             // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
 
             llama_sampling_accept(ctx_sampling, ctx, id, true);
 
-            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
-
             const std::string token_str = llama_token_to_piece(ctx, id);
-            printf("%s", token_str.c_str());
-            fflush(stdout);
+
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }
 
             if (id == llama_token_eos(model)) {
                 has_eos = true;
@@ -114,9 +111,21 @@ int main(int argc, char ** argv){
                 ++n_accept;
                 ++n_past;
                 ++i_dft;
-
+                inp.push_back(id);
+
+                if (params.use_color) {
+                    // color accepted draft token
+                    printf("\033[34m%s\033[0m", token_str.c_str());
+                    fflush(stdout);
+                }
                 continue;
+            }
+
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
             }
+            fflush(stdout);
+
             LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
@@ -176,9 +185,6 @@ int main(int argc, char ** argv){
         ++n_past;
 
         draft.erase(draft.begin());
-
-        // we have our draft!
-
     }
 
     auto t_dec_end = ggml_time_us();
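
As a companion to the README text above, here is a minimal, self-contained sketch of the drafting step that `max_ngram_size` and `n_draft` control: find the longest n-gram (up to `max_ngram_size` tokens) that ends at the current position and also occurs earlier in the context, then copy up to `n_draft` of the tokens that followed the earlier occurrence as the draft. This illustrates the idea only and is not the code from `lookup.cpp`; the helper name `draft_from_prompt`, the plain `std::vector<llama_token>` interface, and the toy token values are assumptions made for the example.

```cpp
// Sketch of prompt lookup drafting (illustration, not the lookup.cpp implementation).
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token = std::int32_t; // stand-in for the real llama_token type

// Search `inp` (prompt + accepted tokens so far) for an earlier occurrence of the
// n-gram that ends at the current position, trying sizes from max_ngram_size down
// to 1, and return up to n_draft tokens that followed the earlier occurrence.
static std::vector<llama_token> draft_from_prompt(
        const std::vector<llama_token> & inp,
        int max_ngram_size,
        int n_draft) {
    const int n = (int) inp.size();

    for (int ngram_size = max_ngram_size; ngram_size > 0; --ngram_size) {
        if (n < ngram_size) {
            continue;
        }

        const int tail = n - ngram_size; // start of the n-gram made of the most recent tokens

        // scan earlier positions; i + ngram_size < n excludes the trivial self-match
        for (int i = 0; i + ngram_size < n; ++i) {
            bool match = true;
            for (int j = 0; j < ngram_size; ++j) {
                if (inp[i + j] != inp[tail + j]) {
                    match = false;
                    break;
                }
            }

            if (match) {
                // copy up to n_draft tokens that followed the matching n-gram
                std::vector<llama_token> draft;
                for (int j = i + ngram_size; j < n && (int) draft.size() < n_draft; ++j) {
                    draft.push_back(inp[j]);
                }
                if (!draft.empty()) {
                    return draft;
                }
            }
        }
    }

    return {}; // no match: fall back to normal one-token-at-a-time decoding
}

int main() {
    // toy "token" sequence: the tail (7, 8) also appears at the start, followed by 9, 1, 2
    const std::vector<llama_token> inp = {7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8};

    const std::vector<llama_token> draft = draft_from_prompt(inp, /*max_ngram_size=*/2, /*n_draft=*/3);

    for (const llama_token t : draft) {
        printf("%d ", t); // expected: 9 1 2
    }
    printf("\n");
    return 0;
}
```

In the example itself, the drafted tokens are then evaluated by the target model in a single batch and checked one by one; the accept/reject loop in the `lookup.cpp` hunks above is the part that prints accepted draft tokens (in blue when `--color` is enabled) and falls back to the sampled token when the draft diverges.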