moving to using refl-cpp for llama as well

commit ef4c0f572b (parent 6fd690fae7)
Author: mike dupont
Date: 2023-11-22 11:40:25 -05:00
9 changed files with 1914 additions and 1840 deletions
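
For context: refl-cpp is a header-only compile-time reflection library for C++; the llama.h hunks below tag the public llama structs with refl::attr::usage::type so they can take part in its attribute machinery. The snippet below is not part of this commit; it is a minimal sketch of the usual refl-cpp workflow (REFL_AUTO registration plus iteration over members), written against a standalone stand-in for llama_token_data and assuming the single-header refl.hpp is on the include path.

// Hedged sketch, not from this commit: typical refl-cpp usage against a
// stand-in struct that mirrors llama_token_data from the diff below.
#include <cstdio>
#include "refl.hpp"   // refl-cpp single header (assumed available)

struct token_data {   // stand-in for llama_token_data
    int   id;         // token id
    float logit;      // log-odds of the token
    float p;          // probability of the token
};

// Register the type and its fields with refl-cpp.
REFL_AUTO(
    type(token_data),
    field(id),
    field(logit),
    field(p)
)

int main() {
    token_data d{42, -1.5f, 0.0f};
    // Iterate over the reflected members and print name/value pairs.
    refl::util::for_each(refl::reflect<token_data>().members, [&](auto member) {
        std::printf("%s = %g\n", member.name.c_str(), (double) member(d));
    });
    return 0;
}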


@@ -122,14 +122,16 @@ int main(int argc, char ** argv) {
     const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
     llama_batch batch_view = {
-        n_tokens,
-        batch.token + i,
-        nullptr,
-        batch.pos + i,
-        batch.n_seq_id + i,
-        batch.seq_id + i,
-        batch.logits + i,
-        0, 0, 0, // unused
+        .n_tokens=n_tokens,
+        .token=batch.token + i,
+        .embd=nullptr,
+        .pos=batch.pos + i,
+        .n_seq_id=batch.n_seq_id + i,
+        .seq_id=batch.seq_id + i,
+        .logits=batch.logits + i,
+        .all_pos_0=0,
+        .all_pos_1=0,
+        .all_seq_id=0, // unused
     };
     const int ret = llama_decode(ctx, batch_view);
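
The pattern repeated throughout this commit is plain C++20 designated initialization. As a quick standalone illustration (hypothetical batch_like struct, not the real llama_batch): designators must appear in declaration order, and any member left out is value-initialized, which is why the spelled-out .all_pos_0 / .all_pos_1 / .all_seq_id fields replace the old positional "0, 0, 0" tail.

// Minimal sketch of C++20 designated initializers (hypothetical struct).
#include <cstdint>

struct batch_like {
    int32_t   n_tokens;
    int32_t * token;
    float   * embd;
    int32_t   all_pos_0;
    int32_t   all_pos_1;
};

int main() {
    batch_like b = {
        .n_tokens  = 4,
        .token     = nullptr,
        // .embd omitted: value-initialized to nullptr
        .all_pos_0 = 0,
        .all_pos_1 = 1,
    };
    return (b.n_tokens == 4 && b.embd == nullptr) ? 0 : 1;
}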


@@ -169,10 +169,13 @@ int main(int argc, char ** argv) {
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+        candidates.emplace_back(llama_token_data{
+            .id=token_id,
+            .logit=logits[token_id],
+            .p=0.0f });
     }
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p = { .data=candidates.data(), .size=candidates.size(), .sorted=false };
     const int top_k = 40;
     const float top_p = 0.9f;


@@ -75,7 +75,18 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
     if (n_eval > n_batch) {
         n_eval = n_batch;
     }
-    llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+    llama_batch batch = {
+        .n_tokens=int32_t(n_eval),
+        .token=nullptr,
+        .embd=(image_embed->embed+i*n_embd),
+        .pos=nullptr,
+        .n_seq_id=nullptr,
+        .seq_id=nullptr,
+        .logits=nullptr,
+        .all_pos_0=*n_past,
+        .all_pos_1=1,
+        .all_seq_id=0
+    };
     if (llama_decode(ctx_llama, batch)) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;


@@ -67,9 +67,12 @@ int main(int argc, char ** argv) {
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        candidates.emplace_back(llama_token_data{
+            .id=token_id,
+            .logit=logits[token_id],
+            .p=0.0f});
     }
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p(candidates.data(), candidates.size(), false);
     auto next_token = llama_sample_token(ctx, &candidates_p);
     auto next_token_str = llama_token_to_piece(ctx, next_token);


@@ -1667,14 +1667,16 @@ struct llama_server_context
     const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
     llama_batch batch_view =
     {
-        n_tokens,
-        batch.token + i,
-        nullptr,
-        batch.pos + i,
-        batch.n_seq_id + i,
-        batch.seq_id + i,
-        batch.logits + i,
-        0, 0, 0, // unused
+        .n_tokens=n_tokens,
+        .token=batch.token + i,
+        .embd=nullptr,
+        .pos=batch.pos + i,
+        .n_seq_id=batch.n_seq_id + i,
+        .seq_id=batch.seq_id + i,
+        .logits=batch.logits + i,
+        .all_pos_0=0,
+        .all_pos_1=0,
+        .all_seq_id=0, // unused
     };
     const int ret = llama_decode(ctx, batch_view);
@@ -1722,7 +1724,10 @@ struct llama_server_context
         slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
     }
-    llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
+    llama_token_data_array cur_p = {
+        .data=slot.ctx_sampling->cur.data(),
+        .size=slot.ctx_sampling->cur.size(),
+        .sorted=false };
     result.tok = id;
     const int32_t n_probs = slot.sparams.n_probs;


@@ -124,10 +124,15 @@ int main(int argc, char ** argv) {
     candidates.reserve(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+        candidates.emplace_back(llama_token_data{ .id=token_id,
+                                                  .logit=logits[token_id],
+                                                  .p=0.0f });
     }
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_token_data_array candidates_p = {
+        .data=candidates.data(),
+        .size=candidates.size(),
+        .sorted=false };
     // sample the most likely token
     const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

llama.cpp

@@ -6745,7 +6745,8 @@ struct llama_grammar * llama_grammar_init(
         for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
             vec_rules[i].push_back(*pos);
         }
-        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
+        llama_grammar_element ge = {.type=LLAMA_GRETYPE_END, .value=0};
+        vec_rules[i].push_back(ge);
     }

     // loop over alternates of start rule to build initial stacks
@@ -7368,7 +7369,15 @@ struct llama_beam {
             tokens.resize(tokens.size() - n);
         }
     }
-    llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
+    llama_beam_view view() const {
+        llama_beam_view bv = {
+            .tokens = tokens.data(),
+            .n_tokens = tokens.size(),
+            .p = p,
+            .eob = eob
+        };
+        return bv;
+    }
 };

 // A struct for calculating logit-related info.
@@ -7389,7 +7398,12 @@ struct llama_logit_info {
     { }
     llama_token_data get_token_data(const llama_token token_id) const {
         constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
-        return {token_id, logits[token_id], p};
+        llama_token_data dd {
+            .id = token_id,
+            .logit = logits[token_id],
+            .p = p
+        };
+        return dd;
     }
     // Return top k token_data by logit.
     std::vector<llama_token_data> top_k(size_t k) {
@@ -7529,7 +7543,13 @@ struct llama_beam_search_data {
             beam_views[i] = beams[i].view();
         }
         common_prefix_length = find_common_prefix_length();
-        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
+        llama_beams_state a = {
+            .beam_views = beam_views.data(),
+            .n_beams = beams.size(),
+            .common_prefix_length = common_prefix_length,
+            .last_call = last_call
+        };
+        return a;
     }

     // Loop:
@@ -8356,14 +8376,14 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        /*.n_gpu_layers                =*/ 0,
-        /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ nullptr,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-        /*.vocab_only                  =*/ false,
-        /*.use_mmap                    =*/ true,
-        /*.use_mlock                   =*/ false,
+        .n_gpu_layers = 0,
+        .main_gpu = 0,
+        .tensor_split = nullptr,
+        .progress_callback = nullptr,
+        .progress_callback_user_data = nullptr,
+        .vocab_only = false,
+        .use_mmap = true,
+        .use_mlock = false,
     };

 #ifdef GGML_USE_METAL
@@ -8375,23 +8395,23 @@ struct llama_model_params llama_model_default_params() {
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed              =*/ LLAMA_DEFAULT_SEED,
-        /*.n_ctx             =*/ 512,
-        /*.n_batch           =*/ 512,
-        /*.n_threads         =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
-        /*.n_threads_batch   =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
-        /*.rope_freq_base    =*/ 0.0f,
-        /*.rope_freq_scale   =*/ 0.0f,
-        /*.yarn_ext_factor   =*/ -1.0f,
-        /*.yarn_attn_factor  =*/ 1.0f,
-        /*.yarn_beta_fast    =*/ 32.0f,
-        /*.yarn_beta_slow    =*/ 1.0f,
-        /*.yarn_orig_ctx     =*/ 0,
-        /*.mul_mat_q         =*/ true,
-        /*.f16_kv            =*/ true,
-        /*.logits_all        =*/ false,
-        /*.embedding         =*/ false,
+        .seed              = LLAMA_DEFAULT_SEED,
+        .n_ctx             = 512,
+        .n_batch           = 512,
+        .n_threads         = GGML_DEFAULT_N_THREADS, // TODO: better default
+        .n_threads_batch   = GGML_DEFAULT_N_THREADS,
+        .rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED,
+        .rope_freq_base    = 0.0f,
+        .rope_freq_scale   = 0.0f,
+        .yarn_ext_factor   = -1.0f,
+        .yarn_attn_factor  = 1.0f,
+        .yarn_beta_fast    = 32.0f,
+        .yarn_beta_slow    = 1.0f,
+        .yarn_orig_ctx     = 0,
+        .mul_mat_q         = true,
+        .f16_kv            = true,
+        .logits_all        = false,
+        .embedding         = false,
     };

     return result;
@@ -8399,12 +8419,12 @@ struct llama_context_params llama_context_default_params() {
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
-        /*.nthread                =*/ 0,
-        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
-        /*.allow_requantize       =*/ false,
-        /*.quantize_output_tensor =*/ true,
-        /*.only_copy              =*/ false,
-        /*.pure                   =*/ false,
+        .nthread                = 0,
+        .ftype                  = LLAMA_FTYPE_MOSTLY_Q5_1,
+        .allow_requantize       = false,
+        .quantize_output_tensor = true,
+        .only_copy              = false,
+        .pure                   = false,
     };

     return result;
@@ -9301,7 +9321,18 @@ int llama_eval_embd(
         int n_past) {
     llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = {
+        .n_tokens=n_tokens,
+        .token=nullptr,
+        .embd=embd,
+        .pos=nullptr,
+        .n_seq_id=nullptr,
+        .seq_id=nullptr,
+        .logits=nullptr,
+        .all_pos_0=n_past,
+        .all_pos_1=1,
+        .all_seq_id=0
+    };
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
@@ -9321,22 +9352,34 @@ struct llama_batch llama_batch_get_one(
         int32_t n_tokens,
         llama_pos pos_0,
         llama_seq_id seq_id) {
-    return {
-        /*n_tokens   =*/ n_tokens,
-        /*tokens     =*/ tokens,
-        /*embd       =*/ nullptr,
-        /*pos        =*/ nullptr,
-        /*n_seq_id   =*/ nullptr,
-        /*seq_id     =*/ nullptr,
-        /*logits     =*/ nullptr,
-        /*all_pos_0  =*/ pos_0,
-        /*all_pos_1  =*/ 1,
-        /*all_seq_id =*/ seq_id,
+    llama_batch b = {
+        .n_tokens   = n_tokens,
+        .token      = tokens,
+        .embd       = nullptr,
+        .pos        = nullptr,
+        .n_seq_id   = nullptr,
+        .seq_id     = nullptr,
+        .logits     = nullptr,
+        .all_pos_0  = pos_0,
+        .all_pos_1  = 1,
+        .all_seq_id = seq_id,
     };
+    return b;
 }

 struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = {
+        .n_tokens = 0,
+        .token=nullptr,
+        .embd=nullptr,
+        .pos=nullptr,
+        .n_seq_id=nullptr,
+        .seq_id=nullptr,
+        .logits=nullptr,
+        .all_pos_0=0,
+        .all_pos_1=0,
+        .all_seq_id=0
+    };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -9533,16 +9576,15 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
-        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
-        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
-        /*.n_eval   =*/ std::max(1, ctx->n_eval),
+        .t_start_ms  = 1e-3 * ctx->t_start_us,
+        .t_end_ms    = 1.00 * ggml_time_ms(),
+        .t_load_ms   = 1e-3 * ctx->t_load_us,
+        .t_sample_ms = 1e-3 * ctx->t_sample_us,
+        .t_p_eval_ms = 1e-3 * ctx->t_p_eval_us,
+        .t_eval_ms   = 1e-3 * ctx->t_eval_us,
+        .n_sample    = std::max(1, ctx->n_sample),
+        .n_p_eval    = std::max(1, ctx->n_p_eval),
+        .n_eval      = std::max(1, ctx->n_eval),
     };

     return result;

llama.h

@@ -114,13 +114,19 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };

-    typedef struct llama_token_data {
+    typedef struct llama_token_data : refl::attr::usage::type {
         llama_token id; // token id
         float logit;    // log-odds of the token
         float p;        // probability of the token
     } llama_token_data;

-    typedef struct llama_token_data_array {
+    typedef struct llama_token_data_array : refl::attr::usage::type {
+        llama_token_data_array(llama_token_data * data,
+                               size_t size,
+                               bool sorted) :
+            data(data),
+            size(size),
+            sorted(sorted) {}
         llama_token_data * data;
         size_t size;
         bool sorted;
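
A side effect worth noting: giving llama_token_data_array a user-provided constructor means it is no longer an aggregate, so designated-initializer syntax no longer applies to that type; the simple example earlier in this diff accordingly builds candidates_p with a parenthesized constructor call. A minimal sketch of that language rule (hypothetical tda struct standing in, C++20 assumed):

#include <cstddef>

struct tda {                          // stand-in for llama_token_data_array
    tda(float * data, size_t size, bool sorted)
        : data(data), size(size), sorted(sorted) {}
    float * data;
    size_t  size;
    bool    sorted;
};

int main() {
    float logits[3] = {0.1f, 0.2f, 0.3f};
    // tda a = { .data = logits, .size = 3, .sorted = false }; // ill-formed: not an aggregate
    tda a(logits, 3, false);          // constructor form still works
    return a.sorted ? 1 : 0;
}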
@@ -138,7 +144,7 @@ extern "C" {
     // - seq_id : the sequence to which the respective token belongs
     // - logits : if zero, the logits for the respective token will not be output
     //
-    typedef struct llama_batch {
+    typedef struct llama_batch : refl::attr::usage::type {
         int32_t n_tokens;

         llama_token * token;
@@ -158,7 +164,7 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;

-    struct llama_model_params {
+    struct llama_model_params : refl::attr::usage::type {
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
@@ -174,7 +180,7 @@ extern "C" {
        bool use_mlock; // force system to keep model in RAM
     };

-    struct llama_context_params {
+    struct llama_context_params : refl::attr::usage::type {
        uint32_t seed;    // RNG seed, -1 for random
        uint32_t n_ctx;   // text context, 0 = from model
        uint32_t n_batch; // prompt processing maximum batch size
@@ -199,7 +205,7 @@ extern "C" {
     };

     // model quantization parameters
-    typedef struct llama_model_quantize_params {
+    typedef struct llama_model_quantize_params : refl::attr::usage::type {
        int nthread;            // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype; // quantize to this llama_ftype
        bool allow_requantize;  // allow quantizing non-f32/f16 tensors
@@ -237,13 +243,13 @@ extern "C" {
        LLAMA_GRETYPE_CHAR_ALT = 6,
     };

-    typedef struct llama_grammar_element {
+    typedef struct llama_grammar_element : refl::attr::usage::type {
        enum llama_gretype type;
        uint32_t value; // Unicode code point or rule ID
     } llama_grammar_element;

     // performance timing information
-    struct llama_timings {
+    struct llama_timings : refl::attr::usage::type {
        double t_start_ms;
        double t_end_ms;
        double t_load_ms;
@@ -720,7 +726,7 @@ extern "C" {
     // Beam search
     //

-    struct llama_beam_view {
+    struct llama_beam_view : refl::attr::usage::type {
        const llama_token * tokens;
        size_t n_tokens;
@@ -732,7 +738,7 @@ extern "C" {
     // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
     // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
     // These pointers are valid only during the synchronous callback, so should not be saved.
-    struct llama_beams_state {
+    struct llama_beams_state : refl::attr::usage::type {
        struct llama_beam_view * beam_views;
        size_t n_beams; // Number of elements in beam_views[].


@@ -1,3 +0,0 @@
-#include "llama.h"
-
-int main(void) {}