llama : refactor sampling v2 (#9294)

- Add `struct llama_sampler` and `struct llama_sampler_i`
- Add `llama_sampler_` API
- Add `llama_sampler_chain_` API for chaining multiple samplers
- Remove `LLAMA_API_INTERNAL`
- Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings`
2024-09-07 15:16:19 +03:00 · 2024-09-07 15:16:19 +03:00 · df270ef745
commit df270ef745
parent 947538acb8
48 changed files with 3497 additions and 2914 deletions
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@ -163,11 +163,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
 }

-static const char * sample(struct llama_sampling_context * ctx_sampling,
+static const char * sample(struct gpt_sampler * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
+    gpt_sampler_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@ -214,7 +214,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
    return ctx_llava;
 }

-static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
    if (!is_first) {
@ -238,13 +238,13 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla

    LOG_TEE("\n");

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-    return ctx_sampling;
+    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    return smpl;
 }

-static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){

-    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
    return tmp;
 }

@ -278,12 +278,12 @@ int main(int argc, char ** argv) {
        if (!params.prompt.empty()) {
            LOG_TEE("<user>%s\n", params.prompt.c_str());
            LOG_TEE("<assistant>");
-            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
-                auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                auto tmp = llama_loop(ctx_llava, smpl, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
                    if(!have_tmp)continue;
@ -296,18 +296,18 @@ int main(int argc, char ** argv) {

                fflush(stdout);
            }
-            llama_sampling_free(ctx_sampling);
+            gpt_sampler_free(smpl);
        }else {
            while (true) {
                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
                LOG_TEE("<assistant>");
-                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
+                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
-                    auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
@ -315,11 +315,11 @@ int main(int argc, char ** argv) {
                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                    fflush(stdout);
                }
-                llama_sampling_free(ctx_sampling);
+                gpt_sampler_free(smpl);
            }
        }
        printf("\n");
-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);