diff --git a/common/speculative.cpp b/common/speculative.cpp index 3adb9d67a..4222234de 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -27,7 +27,7 @@ struct common_speculative * common_speculative_init( }; // TODO: optimize or pass from outside? -#if 1 +#if 0 { common_sampler_params sparams; sparams.no_perf = false; @@ -156,13 +156,27 @@ llama_tokens common_speculative_gen_draft( } } - LOG_DBG("%s: reuse_i = %d, reuse_n = %d\n", __func__, reuse_i, reuse_n); + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + + llama_tokens result; + result.reserve(params.n_draft); if (reuse_n == 0) { llama_kv_cache_clear(ctx); prompt.clear(); } else { + if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { + result.push_back(prompt[i]); + + if (result.size() >= params.n_draft) { + break; + } + } + return result; + } + llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); llama_kv_cache_seq_rm (ctx, 0, reuse_i + reuse_n, -1); llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); @@ -201,9 +215,6 @@ llama_tokens common_speculative_gen_draft( common_sampler_reset(smpl); - llama_tokens result; - result.reserve(params.n_draft); - // sample n_draft tokens from the draft model for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index fb63435ab..98a9b35d4 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -134,6 +134,8 @@ int main(int argc, char ** argv) { // llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last); + //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str()); + // always have a token to evaluate from before - id_last common_batch_clear(batch_tgt); common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);