speculative : refactor and add a simpler example (#10362)

* speculative : refactor and add a simpler example ggml-ci
* speculative : clean-up and add comments and TODOs [no ci]
* speculative : manage context in common_speculative ggml-ci
* speculative : simplify ggml-ci
* speculative : simplify (cont) ggml-ci
* speculative : add --draft-min CLI arg
* speculative : minor fixup
* make : build fixes
* speculative : do not redraft previous drafts ggml-ci
* speculative : fix the draft sampling ggml-ci
* speculative : fix compile warning
* common : refactor args ggml-ci
* common : change defaults [no ci]
* common : final touches ggml-ci
2024-11-25 09:58:41 +02:00 · 2024-11-25 09:58:41 +02:00 · d9d54e498d
commit d9d54e498d
parent cce5a90075
28 changed files with 1028 additions and 326 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -175,7 +175,7 @@ struct server_slot {
    // sampling
    json json_schema;

-    struct common_sampler_params sparams;
+    struct common_params_sampling sparams;
    struct common_sampler * smpl = nullptr;

    llama_token sampled;
@@ -687,7 +687,7 @@ struct server_context {

            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

-            slot.sparams = params.sparams;
+            slot.sparams = params.sampling;

            slot.callback_on_release = [this](int) {
                queue_tasks.pop_deferred_task();
@@ -743,7 +743,7 @@ struct server_context {
                }

                // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-                int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+                int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);

                // fraction of the common subsequence length compared to the current slot's prompt length
                float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
@@ -788,7 +788,7 @@ struct server_context {
    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
        slot_params default_params;
        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
+        auto default_sparams = params.sampling;
        const auto & data = task.data;

        if (data.count("__oaicompat") != 0) {
@@ -1960,7 +1960,7 @@ struct server_context {

                            if (slot.params.cache_prompt) {
                                // reuse any previously computed tokens that are common with the new prompt
-                                slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                                slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);

                                // reuse chunks from the cached prompt by shifting their KV cache in the new position
                                if (params.n_cache_reuse > 0) {