llama : refactor sampling v2 (#9294)

- Add `struct llama_sampler` and `struct llama_sampler_i` - Add `llama_sampler_` API - Add `llama_sampler_chain_` API for chaining multiple samplers - Remove `LLAMA_API_INTERNAL` - Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings`
2024-09-07 15:16:19 +03:00 · 2024-09-07 15:16:19 +03:00 · df270ef745
commit df270ef745
parent 947538acb8
48 changed files with 3497 additions and 2914 deletions
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -3,13 +3,11 @@
 #include "common.h"
 #include "ngram-cache.h"

-#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;
@ -106,7 +104,7 @@ int main(int argc, char ** argv){

    bool has_eos = false;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    std::vector<llama_token> draft;

@ -130,9 +128,9 @@ int main(int argc, char ** argv){
        int i_dft = 0;
        while (true) {
            // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+            llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);

-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            gpt_sampler_accept(smpl, id, true);

            const std::string token_str = llama_token_to_piece(ctx, id);

@ -240,10 +238,12 @@ int main(int argc, char ** argv){
    LOG_TEE("n_accept     = %d\n", n_accept);
    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx);
+    LOG_TEE("\ntarget:\n\n");
+    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
+    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);
+
+    gpt_sampler_free(smpl);

-    llama_sampling_free(ctx_sampling);
    llama_batch_free(batch_tgt);

    llama_free(ctx);