Now starting to refactor the code: move shared struct definitions into headers and turn in-struct member function bodies into out-of-line definitions.
This commit is contained in:
parent 9722cfd0bb
commit bc3b93b942
6 changed files with 895 additions and 722 deletions
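
The refactor follows one mechanical pattern throughout: struct definitions and member-function declarations move into a shared header, while the function bodies become out-of-line, class-qualified definitions in the .cpp files. A minimal sketch of that pattern, using a hypothetical `widget` type rather than anything from this commit:

    // header: declaration only
    struct widget {
        int n = 0;
        int next(); // body moved out of the struct
    };

    // implementation file: qualified out-of-line definition
    int widget::next() {
        return ++n; // behavior unchanged; only the definition site moves
    }
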
@@ -90,3 +90,169 @@ struct ggml_allocr {
    ggml_tallocr_t talloc;
    ggml_gallocr_t galloc;
};

#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;
};

struct ggml_numa_nodes {
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
    uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
};

struct ggml_state {
    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
    struct ggml_numa_nodes numa;

    ggml_state() : contexts(), numa() {}
};

struct gguf_str {
    uint64_t n; // GGUFv2
    char * data;
};

struct ggml_map_custom1_op_params {
    ggml_custom1_op_t fun;
    int n_tasks;
    void * userdata;
};

struct ggml_map_custom2_op_params {
    ggml_custom2_op_t fun;
    int n_tasks;
    void * userdata;
};

struct ggml_map_custom3_op_params {
    ggml_custom3_op_t fun;
    int n_tasks;
    void * userdata;
};

struct hash_map {
    struct ggml_hash_set set;
    struct ggml_tensor ** vals;
};

#if defined(_WIN32)
typedef volatile LONG atomic_int;
typedef atomic_int atomic_bool;
#else
#include <atomic>
using namespace std;
#endif

struct ggml_compute_state_shared {
    const struct ggml_cgraph * cgraph;
    const struct ggml_cplan  * cplan;

    int64_t perf_node_start_cycles;
    int64_t perf_node_start_time_us;

    const int n_threads;

    // synchronization primitives
    atomic_int n_active; // num active threads
    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
};

typedef pthread_t ggml_thread_t;

struct ggml_compute_state {
    ggml_thread_t thrd;
    int ith;
    struct ggml_compute_state_shared * shared;
};

union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
    uint16_t uint16;
    int16_t  int16;
    uint32_t uint32;
    int32_t  int32;
    float    float32;
    uint64_t uint64;
    int64_t  int64;
    double   float64;
    bool     bool_;

    struct gguf_str str;

    struct gguf_array_T {
        enum gguf_type type;

        uint64_t n; // GGUFv2
        void * data;
    } arr;
};

struct ggml_lbfgs_iteration_data {
    float alpha;
    float ys;
    float * s;
    float * y;
};

struct gguf_kv {
    struct gguf_str key;

    enum  gguf_type  type;
    union gguf_value value;
};

struct gguf_header {
    char magic[4];
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};

struct gguf_tensor_info {
    struct gguf_str name;

    uint32_t n_dims;
    uint64_t ne[GGML_MAX_DIMS];

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

struct gguf_context {
    struct gguf_header header;

    struct gguf_kv          * kv;
    struct gguf_tensor_info * infos;

    size_t alignment;
    size_t offset; // offset of `data` from beginning of file
    size_t size;   // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};

struct gguf_buf {
    void * data;
    size_t size;
    size_t offset;
};

#include "ggml-backend-impl.h"
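
The `_WIN32` branch above aliases `atomic_int` to `volatile LONG` (typically paired with the Win32 Interlocked functions); elsewhere `<atomic>` plus `using namespace std;` makes `std::atomic_int` visible under the same `atomic_int` name. A standalone sketch of the non-Windows branch, with a hypothetical counter that is not code from this commit:

    #include <atomic>
    using namespace std; // brings std::atomic_int into scope as atomic_int

    static atomic_int n_active{0}; // e.g. number of active worker threads

    void worker_enter() { n_active.fetch_add(1, memory_order_relaxed); }
    void worker_leave() { n_active.fetch_sub(1, memory_order_relaxed); }
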
143 ggml.cpp

@@ -1625,33 +1625,12 @@ static void ggml_setup_op_has_task_pass(void) {
// NUMA support
//

#define GGML_NUMA_MAX_NODES 8
#define GGML_NUMA_MAX_CPUS 512

struct ggml_numa_node {
    uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
    uint32_t n_cpus;
};

struct ggml_numa_nodes {
    struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
    uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
};

//
// ggml state
//

struct ggml_state {
    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
    struct ggml_numa_nodes numa;

    ggml_state() : contexts(), numa() {}
};

// global state
static struct ggml_state g_state;

@@ -1986,10 +1965,6 @@ static inline int ggml_up(int n, int m) {
////////////////////////////////////////////////////////////////////////////////

static size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {};
struct gguf_str {
    uint64_t n; // GGUFv2
    char * data;
};

static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {};

@@ -6084,11 +6059,6 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
}

// ggml_map_custom1
struct ggml_map_custom1_op_params {
    ggml_custom1_op_t fun;
    int n_tasks;
    void * userdata;
};

static struct ggml_tensor * ggml_map_custom1_impl(
        struct ggml_context * ctx,

@@ -6141,11 +6111,6 @@ struct ggml_tensor * ggml_map_custom1_inplace(

// ggml_map_custom2

struct ggml_map_custom2_op_params {
    ggml_custom2_op_t fun;
    int n_tasks;
    void * userdata;
};

static struct ggml_tensor * ggml_map_custom2_impl(
        struct ggml_context * ctx,

@@ -6202,11 +6167,6 @@ struct ggml_tensor * ggml_map_custom2_inplace(

// ggml_map_custom3

struct ggml_map_custom3_op_params {
    ggml_custom3_op_t fun;
    int n_tasks;
    void * userdata;
};

static struct ggml_tensor * ggml_map_custom3_impl(
        struct ggml_context * ctx,

@@ -14475,10 +14435,6 @@ static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
    free(hash_set.keys);
}

struct hash_map {
    struct ggml_hash_set set;
    struct ggml_tensor ** vals;
};

static struct hash_map * ggml_new_hash_map(size_t size) {
    struct hash_map * result = (hash_map *)malloc(sizeof(struct hash_map));

@@ -15734,7 +15690,7 @@ typedef int ggml_lock_t;

#define GGML_LOCK_INITIALIZER 0

typedef pthread_t ggml_thread_t;

#define ggml_thread_create pthread_create
#define ggml_thread_join   pthread_join

@@ -15824,28 +15780,7 @@ static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(threa
static void clear_numa_thread_affinity(void) {}
#endif

struct ggml_compute_state_shared {
    const struct ggml_cgraph * cgraph;
    const struct ggml_cplan  * cplan;

    int64_t perf_node_start_cycles;
    int64_t perf_node_start_time_us;

    const int n_threads;

    // synchronization primitives
    atomic_int n_active; // num active threads
    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
};

struct ggml_compute_state {
    ggml_thread_t thrd;
    int ith;
    struct ggml_compute_state_shared * shared;
};

static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
    int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;

@@ -17456,12 +17391,6 @@ static enum ggml_opt_result ggml_opt_adam(
// https://github.com/chokkan/liblbfgs
//

struct ggml_lbfgs_iteration_data {
    float alpha;
    float ys;
    float * s;
    float * y;
};

static enum ggml_opt_result linesearch_backtracking(
    const struct ggml_opt_params * params,

@@ -18328,71 +18257,6 @@ static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
//};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
    uint16_t uint16;
    int16_t  int16;
    uint32_t uint32;
    int32_t  int32;
    float    float32;
    uint64_t uint64;
    int64_t  int64;
    double   float64;
    bool     bool_;

    struct gguf_str str;

    struct {
        enum gguf_type type;

        uint64_t n; // GGUFv2
        void * data;
    } arr;
};

struct gguf_kv {
    struct gguf_str key;

    enum  gguf_type  type;
    union gguf_value value;
};

struct gguf_header {
    char magic[4];
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};

struct gguf_tensor_info {
    struct gguf_str name;

    uint32_t n_dims;
    uint64_t ne[GGML_MAX_DIMS];

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

struct gguf_context {
    struct gguf_header header;

    struct gguf_kv          * kv;
    struct gguf_tensor_info * infos;

    size_t alignment;
    size_t offset; // offset of `data` from beginning of file
    size_t size;   // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};

static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
    const size_t n = fread(dst, 1, size, file);

@@ -19185,11 +19049,6 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
// fwrite(val, sizeof(char), size, file);
//}

struct gguf_buf {
    void * data;
    size_t size;
    size_t offset;
};

static struct gguf_buf gguf_buf_init(size_t size) {
    struct gguf_buf buf = {
@@ -1,5 +1,5 @@
#include <set>

#include <queue>
enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_FALCON,

@@ -90,7 +90,7 @@ enum llama_fver {

struct LLM_KV {
    LLM_KV(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    std::string operator()(llm_kv kv) const; // moved to llama.cpp file

@@ -196,7 +196,7 @@ struct llama_buffer {
    // useful in cases where CUDA can try to allocate PINNED memory
    bool fallback = false;

    void resize(size_t n);

    ~llama_buffer();

@@ -293,9 +293,9 @@ struct llama_vocab {
struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
    ~llama_mmap();

@@ -371,8 +371,8 @@ struct llama_mlock {
#undef MLOCK_SUGGESTION
    static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;
    static size_t lock_granularity();
    bool raw_lock(void * ptr, size_t len) const;
    static void raw_unlock(void * ptr, size_t len);
#else
@@ -516,3 +516,381 @@ struct LLM_TN {
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const;
};

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode);
    size_t tell() const;
    void seek(size_t offset, int whence) const;
    void read_raw(void * ptr, size_t len) const;
    uint32_t read_u32() const;
    void write_raw(const void * ptr, size_t len) const;
    void write_u32(std::uint32_t val) const;
    ~llama_file();
};

struct llama_state {
    llama_state();
    // We save the log callback globally
    ggml_log_callback log_callback;
    void * log_callback_user_data = nullptr;
};

struct llama_model_loader {
    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    int64_t n_elements = 0;
    size_t  n_bytes    = 0;

    bool use_mmap = false;

    llama_file  file;
    llama_ftype ftype;
    llama_fver  fver;

    std::unique_ptr<llama_mmap> mapping;

    struct gguf_context * ctx_gguf = NULL;
    struct ggml_context * ctx_meta = NULL;

    llama_model_loader(const std::string & fname, bool use_mmap);

    ~llama_model_loader();

    std::string get_arch_name() const;

    enum llm_arch get_arch() const;
    const char * get_tensor_name(int i) const;

    struct ggml_tensor * get_tensor_meta(int i) const;

    void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const;

    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend);

    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend);

    void done_getting_tensors() const;

    size_t file_offset(const char * name) const;

    void load_data_for(struct ggml_tensor * cur) const;
    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock);
};

struct llama_data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_context() = default;
};

struct llama_data_buffer_context : llama_data_context {
    uint8_t * ptr;
    size_t size_written = 0;
    llama_data_buffer_context(uint8_t * p);
    void write(const void * src, size_t size) override;
    size_t get_size_written() override;
};

struct llama_data_file_context : llama_data_context {
    llama_file * file;
    size_t size_written = 0;
    llama_data_file_context(llama_file * f);
    size_t get_size_written() override;
    void write(const void * src, size_t size);
};

struct llama_beam {
    std::vector<llama_token> tokens;
    float p;  // Cumulative beam probability (renormalized relative to all beams)
    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
    // Sort beams by probability. In case of ties, prefer beams at eob.
    bool operator<(const llama_beam & rhs) const;
    void shift_tokens(const size_t n);
    llama_beam_view view() const;
};

// A struct for calculating logit-related info.
struct llama_logit_info {
    const float * const logits;
    const int n_vocab;
    const float max_l;
    const float normalizer;
    struct sum_exp {
        float max_l;
        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
    };
    llama_logit_info(llama_context * ctx);
    llama_token_data get_token_data(const llama_token token_id) const;
    std::vector<llama_token_data> top_k(size_t k);
    float probability_from_logit(float logit) const;
};

struct llama_beam_search_data {
    llama_context * ctx;
    size_t n_beams;
    int n_past;
    int n_predict;
    std::vector<llama_beam> beams;
    std::vector<llama_beam> next_beams;
    size_t common_prefix_length;
    std::vector<llama_beam_view> beam_views;
    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict);
    void collapse_beams(const size_t beam_idx);
    void fill_next_beams_by_top_probabilities(llama_beam & beam);
    size_t find_common_prefix_length();
    llama_beams_state get_beams_state(const bool last_call);
    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data);
    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams);
    size_t top_beam_index();
    void update_beams_from_beam_views();
};

using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

enum llm_rope_type {
    LLM_ROPE,
    LLM_ROPE_NEOX,
    LLM_ROPE_GLM,
};

enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
};

enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};

enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
};

struct llm_build_context {
    const llama_model    & model;
    const llama_hparams  & hparams;
    const llama_cparams  & cparams;
    const llama_batch    & batch;
    const llama_kv_cache & kv_self;

    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head;
    const int64_t n_embd_gqa;

    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int32_t n_tokens;
    const int32_t n_kv;    // size of KV cache to consider (n_kv <= n_ctx)
    const int32_t kv_head; // index of where we store new KV data in the cache
    const int32_t n_orig_ctx;

    const bool do_rope_shift;

    const llm_build_cb & cb;

    llama_buffer & buf_compute;

    struct ggml_context * ctx0 = nullptr;

    // TODO: consider making the entire interface noexcept
    llm_build_context(
        llama_context      & lctx,
        const llama_batch  & batch,
        const llm_build_cb & cb,
        bool worst_case);

    void init();
    void free();
    struct ggml_cgraph * build_llama();
    struct ggml_cgraph * build_baichuan();
    struct ggml_cgraph * build_falcon();
    struct ggml_cgraph * build_starcoder();
    struct ggml_cgraph * build_persimmon();
    struct ggml_cgraph * build_refact();
    struct ggml_cgraph * build_bloom();
    struct ggml_cgraph * build_mpt();
    struct ggml_cgraph * build_stablelm();
};

enum llm_offload_func_e {
    OFFLOAD_FUNC_NOP,
    OFFLOAD_FUNC,
    OFFLOAD_FUNC_KQ,
    OFFLOAD_FUNC_V,
    OFFLOAD_FUNC_NR,
    OFFLOAD_FUNC_EMB,
    OFFLOAD_FUNC_OUT,
};

struct llm_offload_trie {
    struct node {
        ~node();
        node * children[256] = { nullptr };
        llm_offload_func_e func = OFFLOAD_FUNC_NOP;
    };
    node * root = nullptr;
    llm_offload_trie();
    llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map);
    ~llm_offload_trie();
    void add(const char * name, llm_offload_func_e func);
    llm_offload_func_e find(const char * name) const;
};

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r);
    };
    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};

struct llm_tokenizer_spm {
    llm_tokenizer_spm(const llama_vocab & vocab);
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);

private:
    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output);
    void try_add_bigram(int left, int right);
    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;

    std::map<std::string, std::pair<int, int>> rev_merge;
};

// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
// tried to simplify unicode stuff, so most likely does not work 100% correctly!

// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const;
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
    int rank;
    size_t size;
};

struct llm_tokenizer_bpe {
    llm_tokenizer_bpe(const llama_vocab & vocab);

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output);

private:
    void add_new_bigram(int left, int right);

    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text);

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;

    llm_bigram_bpe::queue work_queue;
};

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

struct fragment_buffer_variant {
    fragment_buffer_variant(llama_vocab::id _token);
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length);
    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_vocab::id token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
};

struct llama_partial_utf8 {
    uint32_t value;    // bit value so far (unshifted)
    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
};

struct llama_grammar {
    const std::vector<std::vector<llama_grammar_element>>   rules;
    std::vector<std::vector<const llama_grammar_element *>> stacks;

    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8 partial_utf8;
};

struct llama_grammar_candidate {
    size_t           index;
    const uint32_t * code_points;
    llama_partial_utf8 partial_utf8;
};

struct quantize_state_internal {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    int i_attention_wv    = 0;
    int i_feed_forward_w2 = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;

    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
    {}
};
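
`llama_logit_info` above caches `max_l` and `normalizer` so that `probability_from_logit` computes a numerically stable softmax: p(l) = exp(l - max_l) / sum_i exp(l_i - max_l), where subtracting the max keeps the exponentials from overflowing. A standalone sketch of the same computation (names hypothetical, not the llama.cpp API):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // stable softmax: shift by the max logit before exponentiating
    // (assumes logits is non-empty)
    std::vector<float> softmax(const std::vector<float> & logits) {
        const float max_l = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (float l : logits) sum += std::exp(l - max_l);
        const float normalizer = 1.0f / sum; // mirrors llama_logit_info::normalizer
        std::vector<float> p;
        p.reserve(logits.size());
        for (float l : logits) p.push_back(normalizer * std::exp(l - max_l));
        return p;
    }
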
473 llama.cpp

@@ -639,12 +639,8 @@ llama_buffer::~llama_buffer() {
}

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
llama_file::llama_file(const char * fname, const char * mode) {
    fp = std::fopen(fname, mode);
    if (fp == NULL) {
        throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));

@@ -654,7 +650,7 @@ struct llama_file {
    seek(0, SEEK_SET);
}

    size_t tell() const {
size_t llama_file::tell() const {
#ifdef _WIN32
    __int64 ret = _ftelli64(fp);
#else

@@ -664,7 +660,8 @@ struct llama_file {
    return (size_t) ret;
}

    void seek(size_t offset, int whence) const {
void llama_file::seek(size_t offset, int whence) const {

#ifdef _WIN32
    int ret = _fseeki64(fp, (__int64) offset, whence);
#else

@@ -673,7 +670,7 @@ struct llama_file {
    GGML_ASSERT(ret == 0); // same
}

    void read_raw(void * ptr, size_t len) const {
void llama_file::read_raw(void * ptr, size_t len) const {
    if (len == 0) {
        return;
    }

@@ -687,13 +684,13 @@ struct llama_file {
    }
}

    uint32_t read_u32() const {
uint32_t llama_file::read_u32() const {
    uint32_t ret;
    read_raw(&ret, sizeof(ret));
    return ret;
}

    void write_raw(const void * ptr, size_t len) const {
void llama_file::write_raw(const void * ptr, size_t len) const {
    if (len == 0) {
        return;
    }

@@ -704,16 +701,16 @@ struct llama_file {
    }
}

    void write_u32(std::uint32_t val) const {
void llama_file::write_u32(std::uint32_t val) const {
    write_raw(&val, sizeof(val));
}

    ~llama_file() {
llama_file::~llama_file() {
    if (fp) {
        std::fclose(fp);
    }
}
};

//

@@ -985,12 +982,6 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
// globals
//

struct llama_state {
    // We save the log callback globally
    ggml_log_callback log_callback = llama_log_callback_default;
    void * log_callback_user_data = nullptr;
};

static llama_state g_state;

@@ -1276,26 +1267,8 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
    return buf;
}

struct llama_model_loader {
    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    int64_t n_elements = 0;
    size_t  n_bytes    = 0;

    bool use_mmap = false;

    llama_file  file;
    llama_ftype ftype;
    llama_fver  fver;

    std::unique_ptr<llama_mmap> mapping;

    struct gguf_context * ctx_gguf = NULL;
    struct ggml_context * ctx_meta = NULL;

    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
    struct gguf_init_params params(
        /*.no_alloc =*/ true,
        /*.ctx      =*/ &ctx_meta

@@ -1409,7 +1382,7 @@ struct llama_model_loader {
    this->use_mmap = use_mmap;
}

    ~llama_model_loader() {
llama_model_loader::~llama_model_loader() {
    if (ctx_gguf) {
        gguf_free(ctx_gguf);
    }

@@ -1418,7 +1391,7 @@ struct llama_model_loader {
    }
}

    std::string get_arch_name() const {
std::string llama_model_loader::get_arch_name() const {
    const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);

    std::string arch_name;

@@ -1427,21 +1400,21 @@ struct llama_model_loader {
    return arch_name;
}

    enum llm_arch get_arch() const {
enum llm_arch llama_model_loader::get_arch() const {
    const std::string arch_name = get_arch_name();

    return llm_arch_from_string(arch_name);
}

    const char * get_tensor_name(int i) const {
const char * llama_model_loader::get_tensor_name(int i) const {
    return gguf_get_tensor_name(ctx_gguf, i);
}

    struct ggml_tensor * get_tensor_meta(int i) const {
struct ggml_tensor * llama_model_loader::get_tensor_meta(int i) const {
    return ggml_get_tensor(ctx_meta, get_tensor_name(i));
}

    void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
void llama_model_loader::calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
    ctx_size_p     = 0;
    mmapped_size_p = 0;

@@ -1452,7 +1425,7 @@ struct llama_model_loader {
    }
}

    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
struct ggml_tensor * llama_model_loader::create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
    if (backend != GGML_BACKEND_CPU) {
        ggml_set_no_alloc(ctx, true);
    }

@@ -1470,7 +1443,7 @@ struct llama_model_loader {
    return tensor;
}

    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
    struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

    if (cur == NULL) {

@@ -1503,13 +1476,13 @@ struct llama_model_loader {
    return create_tensor_for(ctx, cur, backend);
}

    void done_getting_tensors() const {
void llama_model_loader::done_getting_tensors() const {
    if (n_created != n_tensors) {
        throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
    }
}

    size_t file_offset(const char * name) const {
size_t llama_model_loader::file_offset(const char * name) const {
    const int idx = gguf_find_tensor(ctx_gguf, name);

    if (idx < 0) {

@@ -1519,7 +1492,7 @@ struct llama_model_loader {
    return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
}

    void load_data_for(struct ggml_tensor * cur) const {
void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
    const size_t offs = file_offset(ggml_get_name(cur));

    if (use_mmap) {

@@ -1530,7 +1503,7 @@ struct llama_model_loader {
    }
}

    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
void llama_model_loader::load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
    size_t size_data = 0;
    size_t size_lock = 0;
    size_t size_pref = 0; // prefetch

@@ -1606,7 +1579,7 @@ struct llama_model_loader {
        done_size += ggml_nbytes(cur);
    }
}
};
//};

//
// load LLaMA models

@@ -2940,30 +2913,6 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
// llm_build
//

using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

enum llm_rope_type {
    LLM_ROPE,
    LLM_ROPE_NEOX,
    LLM_ROPE_GLM,
};

enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
};

enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};

enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
};

static struct ggml_tensor * llm_build_inp_embd(
        struct ggml_context * ctx,

@@ -3278,45 +3227,10 @@ static struct ggml_tensor * llm_build_kqv(
    return cur;
}

struct llm_build_context {
    const llama_model    & model;
    const llama_hparams  & hparams;
    const llama_cparams  & cparams;
    const llama_batch    & batch;
    const llama_kv_cache & kv_self;

    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head;
    const int64_t n_embd_gqa;

    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int32_t n_tokens;
    const int32_t n_kv;    // size of KV cache to consider (n_kv <= n_ctx)
    const int32_t kv_head; // index of where we store new KV data in the cache
    const int32_t n_orig_ctx;

    const bool do_rope_shift;

    const llm_build_cb & cb;

    llama_buffer & buf_compute;

    struct ggml_context * ctx0 = nullptr;
// struct llm_build_context {

    // TODO: consider making the entire interface noexcept
    llm_build_context(
llm_build_context::llm_build_context(
    llama_context & lctx,
    const llama_batch & batch,
    const llm_build_cb & cb,

@@ -3353,7 +3267,7 @@ struct llm_build_context {
    // all initializations should be done in init()
}

    void init() {
void llm_build_context::init() {
    struct ggml_init_params params(
        //.mem_size =
        buf_compute.size,

@@ -3366,14 +3280,14 @@ struct llm_build_context {
    ctx0 = ggml_init(params);
}

    void free() {
void llm_build_context::free() {
    if (ctx0) {
        ggml_free(ctx0);
        ctx0 = nullptr;
    }
}

    struct ggml_cgraph * build_llama() {
struct ggml_cgraph * llm_build_context::build_llama() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    GGML_ASSERT(n_embd_head == hparams.n_rot);

@@ -3485,7 +3399,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * llm_build_context::build_baichuan() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    struct ggml_tensor * cur;

@@ -3605,7 +3519,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * llm_build_context::build_falcon() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    struct ggml_tensor * cur;

@@ -3727,7 +3641,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * llm_build_context::build_starcoder() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    struct ggml_tensor * cur;

@@ -3826,7 +3740,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_persimmon() {
struct ggml_cgraph * llm_build_context::build_persimmon() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    const int64_t n_rot = n_embd_head / 2;

@@ -4036,7 +3950,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_refact() {
struct ggml_cgraph * llm_build_context::build_refact() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    struct ggml_tensor * cur;

@@ -4127,7 +4041,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * llm_build_context::build_bloom() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    struct ggml_tensor * cur;

@@ -4221,7 +4135,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * llm_build_context::build_mpt() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

    struct ggml_tensor * cur;

@@ -4320,7 +4234,7 @@ struct llm_build_context {
    return gf;
}

    struct ggml_cgraph * build_stablelm() {
struct ggml_cgraph * llm_build_context::build_stablelm() {
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * cur;

@@ -4432,27 +4346,18 @@ struct llm_build_context {

    return gf;
}
};

//
// tensor offloading helpers
//
// TODO: will be removed with backend v2

enum llm_offload_func_e {
    OFFLOAD_FUNC_NOP,
    OFFLOAD_FUNC,
    OFFLOAD_FUNC_KQ,
    OFFLOAD_FUNC_V,
    OFFLOAD_FUNC_NR,
    OFFLOAD_FUNC_EMB,
    OFFLOAD_FUNC_OUT,
};

// TODO: will be removed with backend v2
struct llm_offload_trie {
    struct node {
        ~node() {
//struct llm_offload_trie {
//    struct node {
llm_offload_trie::node::~node() {
    for (int i = 0; i < 256; ++i) {
        if (children[i]) {
            delete children[i];

@@ -4460,28 +4365,28 @@ struct llm_offload_trie {
        }
    }

        node * children[256] = { nullptr };
        llm_offload_func_e func = OFFLOAD_FUNC_NOP;
    };
//    node * children[256] = { nullptr };
//    llm_offload_func_e func = OFFLOAD_FUNC_NOP;
//    };

    llm_offload_trie() {
llm_offload_trie::llm_offload_trie() {
    root = new node;
}

    llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
        root = new node;
llm_offload_trie::llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
    root = new node;

        for (const auto & kv : map) {
            add(kv.first, kv.second);
        }
    }

    for (const auto & kv : map) {
        add(kv.first, kv.second);
    }
}
llm_offload_trie::~llm_offload_trie() {
    delete root;
}

    ~llm_offload_trie() {
        delete root;
    }

    void add(const char * name, llm_offload_func_e func) {
        node * cur = root;
void llm_offload_trie::add(const char * name, llm_offload_func_e func) {
    node * cur = root;

    for (int i = 0; ; ++i) {
        const uint8_t c = name[i];

@@ -4500,7 +4405,7 @@ struct llm_offload_trie {
    cur->func = func;
}

    llm_offload_func_e find(const char * name) const {
llm_offload_func_e llm_offload_trie::find(const char * name) const {
    const node * cur = root;

    for (int i = 0; ; ++i) {

@@ -4520,8 +4425,8 @@ struct llm_offload_trie {
    return cur->func;
}

    node * root = nullptr;
};
//    node * root = nullptr;
//};

// TODO: will be removed with backend v2
static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {

@@ -5255,13 +5160,6 @@ static void llama_unescape_whitespace(std::string & word) {
    replace_all(word, "\xe2\x96\x81", " ");
}

struct llm_symbol {
    using index = int;
    index prev;
    index next;
    const char * text;
    size_t n;
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");

@@ -5269,24 +5167,16 @@ static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4

struct llm_bigram_spm {
    struct comparator {
        bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
        }
    };
    using queue_storage = std::vector<llm_bigram_spm>;
    using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    float score;
    size_t size;
};

struct llm_tokenizer_spm {
    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
bool llm_bigram_spm::comparator::operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
    return (l.score < r.score) || (l.score == r.score && l.left > r.left);
}

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {

// struct llm_tokenizer_spm {
llm_tokenizer_spm::llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}

void llm_tokenizer_spm::tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
    // split string into utf8 chars
    int index = 0;
    size_t offs = 0;

@@ -5344,8 +5234,8 @@ struct llm_tokenizer_spm {
    }
}

//private:
    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
void llm_tokenizer_spm::resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
    auto text = std::string(symbol.text, symbol.n);
    auto token = vocab.token_to_id.find(text);

@@ -5370,7 +5260,7 @@ private:
    resegment(symbols[p->second.second], output);
}

    void try_add_bigram(int left, int right) {
void llm_tokenizer_spm::try_add_bigram(int left, int right) {
    if (left == -1 || right == -1) {
        return;
    }

@@ -5400,13 +5290,6 @@ private:
    rev_merge[text] = std::make_pair(left, right);
}

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    llm_bigram_spm::queue work_queue;

    std::map<std::string, std::pair<int, int>> rev_merge;
};

// BPE tokenizer
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]

@@ -5414,26 +5297,15 @@ private:

// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused

struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;
    llm_symbol::index right;
    std::string text;
    int rank;
    size_t size;
};
bool llm_bigram_bpe::comparator::operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
    return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
}

struct llm_tokenizer_bpe {
    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
//struct llm_tokenizer_bpe {
llm_tokenizer_bpe::llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
void llm_tokenizer_bpe::tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
    int final_prev_index = -1;
    auto word_collection = bpe_gpt2_preprocess(text);

@@ -5534,8 +5406,8 @@ struct llm_tokenizer_bpe {
    }
}

//private:
    void add_new_bigram(int left, int right) {
void llm_tokenizer_bpe::add_new_bigram(int left, int right) {
    if (left == -1 || right == -1) {
        return;
    }

@@ -5562,7 +5434,7 @@ private:
    work_queue.push(bigram);
}

    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
std::vector<std::string> llm_tokenizer_bpe::bpe_gpt2_preprocess(const std::string & text) {
    std::vector<std::string> bpe_words;
    std::vector<std::string> bpe_encoded_words;

@@ -5701,28 +5573,17 @@ private:
    return bpe_encoded_words;
}

    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;

    llm_bigram_bpe::queue work_queue;
};

typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;

struct fragment_buffer_variant {
    fragment_buffer_variant(llama_vocab::id _token)
//struct fragment_buffer_variant{
fragment_buffer_variant::fragment_buffer_variant(llama_vocab::id _token)
    :
    type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
    token(_token),
    raw_text(_dummy),
    offset(0),
    length(0) {}
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
fragment_buffer_variant::fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
    type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
    token((llama_vocab::id)-1),

@@ -5734,13 +5595,6 @@ struct fragment_buffer_variant{
    GGML_ASSERT( offset + length <= raw_text.length() );
}

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_vocab::id token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
};

// #define PRETOKENIZERDEBUG

@@ -5946,24 +5800,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// grammar - internal
//

struct llama_partial_utf8 {
    uint32_t value;    // bit value so far (unshifted)
    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
};

struct llama_grammar {
    const std::vector<std::vector<llama_grammar_element>>   rules;
    std::vector<std::vector<const llama_grammar_element *>> stacks;

    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8 partial_utf8;
};

struct llama_grammar_candidate {
    size_t           index;
    const uint32_t * code_points;
    llama_partial_utf8 partial_utf8;
};

// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.

@@ -6895,22 +6731,19 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
// Beam search
//

struct llama_beam {
    std::vector<llama_token> tokens;
    float p;  // Cumulative beam probability (renormalized relative to all beams)
    bool eob; // Initialize end-of-beam to false. Callback sets this to true.
    // Sort beams by probability. In case of ties, prefer beams at eob.
    bool operator<(const llama_beam & rhs) const {
// llama_beam {

bool llama_beam::operator<(const llama_beam & rhs) const {
    return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
}
// Shift off first n tokens and discard them.
    void shift_tokens(const size_t n) {
void llama_beam::shift_tokens(const size_t n) {
    if (n) {
        std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
        tokens.resize(tokens.size() - n);
    }
}
    llama_beam_view view() const {
llama_beam_view llama_beam::view() const {
    llama_beam_view bv = {
        .tokens   = tokens.data(),
        .n_tokens = tokens.size(),

@@ -6919,25 +6752,25 @@ struct llama_beam {
    };
    return bv;
}
};

// A struct for calculating logit-related info.
struct llama_logit_info {
    const float * const logits;
    const int n_vocab;
    const float max_l;
    const float normalizer;
    struct sum_exp {
        float max_l;
        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
    };
    llama_logit_info(llama_context * ctx)
//struct llama_logit_info {
//    const float * const logits;
//    const int n_vocab;
//    const float max_l;
//    const float normalizer;
//    struct sum_exp {
//        float max_l;
//        float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
//    };
llama_logit_info::llama_logit_info(llama_context * ctx)
    : logits(llama_get_logits(ctx))
    , n_vocab(llama_n_vocab(llama_get_model(ctx)))
    , max_l(*std::max_element(logits, logits + n_vocab))
    , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
    { }
    llama_token_data get_token_data(const llama_token token_id) const {
llama_token_data llama_logit_info::get_token_data(const llama_token token_id) const {
    constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
    llama_token_data dd(
        token_id,

@@ -6947,7 +6780,7 @@ struct llama_logit_info {
    return dd;
}
// Return top k token_data by logit.
    std::vector<llama_token_data> top_k(size_t k) {
std::vector<llama_token_data> llama_logit_info::top_k(size_t k) {
    std::vector<llama_token_data> min_heap; // min-heap by logit
    const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
    min_heap.reserve(k_min);

@@ -6966,26 +6799,15 @@ struct llama_logit_info {
    }
    return min_heap;
}
    float probability_from_logit(float logit) const {
float llama_logit_info::probability_from_logit(float logit) const {
    return normalizer * std::exp(logit - max_l);
}
};

struct llama_beam_search_data {
    llama_context * ctx;
    size_t n_beams;
    int n_past;
    int n_predict;
    std::vector<llama_beam> beams;
    std::vector<llama_beam> next_beams;

    // Re-calculated on each loop iteration
    size_t common_prefix_length;
//struct llama_beam_search_data {

    // Used to communicate to/from callback on beams state.
    std::vector<llama_beam_view> beam_views;

    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
llama_beam_search_data::llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
    : ctx(ctx)
    , n_beams(n_beams)
    , n_past(n_past)

@@ -6996,7 +6818,7 @@ struct llama_beam_search_data {
}

// Collapse beams to a single beam given by index.
    void collapse_beams(const size_t beam_idx) {
void llama_beam_search_data::collapse_beams(const size_t beam_idx) {
    if (0u < beam_idx) {
        std::swap(beams[0], beams[beam_idx]);
    }

@@ -7008,7 +6830,7 @@ struct llama_beam_search_data {
// * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the
//   least element to the back(), replace it with the new, then push it into the heap.
    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
void llama_beam_search_data::fill_next_beams_by_top_probabilities(llama_beam & beam) {
    // Min-heaps use a greater-than comparator.
    const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
    if (beam.eob) {

@@ -7063,7 +6885,7 @@ struct llama_beam_search_data {

// Find common_prefix_length based on beams.
// Requires beams is not empty.
    size_t find_common_prefix_length() {
size_t llama_beam_search_data::find_common_prefix_length() {
    size_t common_prefix_length = beams[0].tokens.size();
    for (size_t i = 1 ; i < beams.size() ; ++i) {
        common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());

@@ -7079,7 +6901,7 @@ struct llama_beam_search_data {

// Construct beams_state to send back to caller via the callback function.
// Side effect: set common_prefix_length = find_common_prefix_length();
    llama_beams_state get_beams_state(const bool last_call) {
llama_beams_state llama_beam_search_data::get_beams_state(const bool last_call) {
    for (size_t i = 0 ; i < beams.size() ; ++i) {
        beam_views[i] = beams[i].view();
    }

@@ -7098,7 +6920,7 @@ struct llama_beam_search_data {
// * any of the beams have not yet reached end-of-beam (eob), AND
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
//   (since all other beam probabilities can only decrease)
    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
void llama_beam_search_data::loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
    beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
    const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
    for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&

@@ -7125,25 +6947,25 @@ struct llama_beam_search_data {

// As beams grow, the cumulative probabilities decrease.
// Renormalize them to avoid floating point underflow.
    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
void llama_beam_search_data::renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
    const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
    const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
    std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
}

// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
    size_t top_beam_index() {
size_t llama_beam_search_data::top_beam_index() {
    return std::max_element(beams.begin(), beams.end()) - beams.begin();
}

// Copy (p,eob) for each beam which may have been changed by the callback.
    void update_beams_from_beam_views() {
void llama_beam_search_data::update_beams_from_beam_views() {
    for (size_t i = 0 ; i < beams.size() ; ++i) {
        beams[i].p   = beam_views[i].p;
        beams[i].eob = beam_views[i].eob;
    }
}
};

void llama_beam_search(llama_context * ctx,
    llama_beam_search_callback_fn_t callback, void * callback_data,

@@ -7169,23 +6991,6 @@ struct no_init {
    no_init() { /* do nothing */ }
};

struct quantize_state_internal {
    const llama_model & model;
    const llama_model_quantize_params * params;

    int n_attention_wv    = 0;
    int n_feed_forward_w2 = 0;
    int i_attention_wv    = 0;
    int i_feed_forward_w2 = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;

    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
    {}
};

static void llama_convert_tensor_internal(
    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
@@ -8442,45 +8247,32 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 return s_total;
 }

 // llama_context_data
-struct llama_data_context {
-virtual void write(const void * src, size_t size) = 0;
-virtual size_t get_size_written() = 0;
-virtual ~llama_data_context() = default;
-};

-struct llama_data_buffer_context : llama_data_context {
-uint8_t * ptr;
-size_t size_written = 0;

-llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+llama_data_buffer_context::llama_data_buffer_context(uint8_t * p) : ptr(p) {}

-void write(const void * src, size_t size) override {
-memcpy(ptr, src, size);
-ptr += size;
-size_written += size;
-}
+void llama_data_buffer_context::write(const void * src, size_t size) {
+memcpy(ptr, src, size);
+ptr += size;
+size_written += size;
+}

-size_t get_size_written() override {
-return size_written;
-}
-};
+size_t llama_data_buffer_context::get_size_written() {
+return size_written;
+}

-struct llama_data_file_context : llama_data_context {
-llama_file * file;
-size_t size_written = 0;

-llama_data_file_context(llama_file * f) : file(f) {}

+llama_data_file_context::llama_data_file_context(llama_file * f) : file(f) {}

-void write(const void * src, size_t size) override {
-file->write_raw(src, size);
-size_written += size;
-}
+void llama_data_file_context::write(const void * src, size_t size) {
+file->write_raw(src, size);
+size_written += size;
+}

-size_t get_size_written() override {
-return size_written;
-}
-};
+size_t llama_data_file_context::get_size_written() {
+return size_written;
+}

 /** copy state data into either a buffer or file depending on the passed in context
 *
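llama_data_context is a small strategy interface: the state serializer calls write() without caring whether bytes land in a memory buffer or a file. A reduced sketch of the same shape (simplified, hypothetical names; buffer capacity is assumed to be managed by the caller):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct data_sink {
        virtual void write(const void * src, size_t size) = 0;
        virtual size_t written() const = 0;
        virtual ~data_sink() = default;
    };

    struct buffer_sink : data_sink {
        uint8_t * ptr;
        size_t n = 0;
        explicit buffer_sink(uint8_t * p) : ptr(p) {}
        void write(const void * src, size_t size) override {
            std::memcpy(ptr, src, size); // caller guarantees capacity
            ptr += size;
            n += size;
        }
        size_t written() const override { return n; }
    };

A file-backed sink implements the same two virtuals against a file handle, which is what llama_data_file_context does with write_raw().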
@@ -9287,3 +9079,6 @@ llama_context::~llama_context() {
 ggml_allocr_free(alloc);
 }
 }
+llama_state::llama_state(){
+log_callback= llama_log_callback_default;
+}
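The new llama_state constructor guarantees the log callback is never null. A minimal analogue (the callback signature here is an assumption modeled on typical ggml/llama logging callbacks, not copied from the headers):

    #include <cstdio>

    typedef void (*log_callback_t)(int level, const char * text, void * user_data);

    static void log_default(int /*level*/, const char * text, void * /*user*/) {
        fputs(text, stderr); // default sink: stderr
    }

    struct state {
        log_callback_t log_callback = log_default; // non-null by construction
        void * log_callback_user_data = nullptr;
    };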
20 llama.h
@@ -114,7 +114,7 @@
 LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
 };

-typedef struct llama_token_data : refl::attr::usage::type{
+typedef struct llama_token_data {
 llama_token_data( llama_token id, float logit, float p):
 id( id),logit(logit),p(p){ }
 llama_token id; // token id
@@ -122,7 +122,7 @@
 float p; // probability of the token
 } llama_token_data;

-typedef struct llama_token_data_array : refl::attr::usage::type{
+typedef struct llama_token_data_array {
 llama_token_data_array(llama_token_data * data,
 size_t size,
 bool sorted):
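With the constructors added above, candidate arrays can be built without aggregate braces. A hedged usage sketch (assumes only the two constructors shown in this diff):

    #include <vector>

    std::vector<llama_token_data> cand;
    cand.emplace_back(/*id=*/42, /*logit=*/1.5f, /*p=*/0.0f);
    llama_token_data_array arr(cand.data(), cand.size(), /*sorted=*/false);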
@@ -146,7 +146,7 @@
 // - seq_id : the sequence to which the respective token belongs
 // - logits : if zero, the logits for the respective token will not be output
 //
-typedef struct llama_batch : refl::attr::usage::type{
+typedef struct llama_batch {

 llama_batch(int32_t n_tokens,
 llama_token * token,
@@ -205,7 +205,7 @@
 bool use_mlock; // force system to keep model in RAM
 };

-struct llama_context_params : refl::attr::usage::type{
+struct llama_context_params{
 uint32_t seed; // RNG seed, -1 for random
 uint32_t n_ctx; // text context, 0 = from model
 uint32_t n_batch; // prompt processing maximum batch size
@@ -230,7 +230,7 @@
 };

 // model quantization parameters
-typedef struct llama_model_quantize_params : refl::attr::usage::type{
+typedef struct llama_model_quantize_params {
 int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
 enum llama_ftype ftype; // quantize to this llama_ftype
 bool allow_requantize; // allow quantizing non-f32/f16 tensors
@@ -268,7 +268,7 @@
 LLAMA_GRETYPE_CHAR_ALT = 6,
 };

-typedef struct llama_grammar_element : refl::attr::usage::type {
+typedef struct llama_grammar_element {
 llama_grammar_element( enum llama_gretype type,
 uint32_t value // Unicode code point or rule ID
 ):type(type), value(value){}
@@ -278,7 +278,7 @@
 } llama_grammar_element;

 // performance timing information
-struct llama_timings : refl::attr::usage::type{
+struct llama_timings {
 double t_start_ms;
 double t_end_ms;
 double t_load_ms;
@@ -755,7 +755,7 @@
 // Beam search
 //

-struct llama_beam_view : refl::attr::usage::type{
+struct llama_beam_view {
 const llama_token * tokens;

 size_t n_tokens;
@@ -767,7 +767,7 @@
 // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
 // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
 // These pointers are valid only during the synchronous callback, so should not be saved.
-struct llama_beams_state : refl::attr::usage::type{
+struct llama_beams_state {
 struct llama_beam_view * beam_views;

 size_t n_beams; // Number of elements in beam_views[].
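Since beam_views is only valid for the duration of the synchronous callback, a conforming callback has to copy what it needs before returning. A sketch under that contract (beam_capture and on_beams are hypothetical user-side names, and the callback signature is assumed from the llama_beam_search declaration in the llama.cpp hunk above):

    #include <vector>

    struct beam_capture {
        std::vector<llama_token> best_tokens;
    };

    static void on_beams(void * data, llama_beams_state state) {
        beam_capture * cap = static_cast<beam_capture *>(data);
        // Copy the shared prefix now; the pointers are dead after we return.
        if (state.common_prefix_length > 0) {
            const llama_beam_view & v = state.beam_views[0];
            cap->best_tokens.insert(cap->best_tokens.end(),
                    v.tokens, v.tokens + state.common_prefix_length);
        }
    }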
@@ -831,3 +831,5 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal


 #endif // LLAMA_H
+
+
423 print.hpp
@@ -1,7 +1,4 @@
-//template<typename T> void print_fields(const T& obj);
-
 #include <iostream>
-//#include <refl.hpp>
 #include "llama.h"
 #include "ggml-internal.hpp"
 #include "llama-internal.hpp"
@@ -56,9 +53,9 @@ REFL_FIELD(prompt_file )
 REFL_FIELD(path_prompt_cache )
 REFL_FIELD(input_prefix )
 REFL_FIELD(input_suffix )
-//REFL_FIELD( antiprompt)
+REFL_FIELD( antiprompt)
 REFL_FIELD(logdir )
-//REFL_FIELD( lora_adapter)
+REFL_FIELD( lora_adapter)
 REFL_FIELD(lora_base )
 REFL_FIELD( ppl_stride )
 REFL_FIELD( ppl_output_type )
@@ -95,9 +92,6 @@ REFL_END
 REFL_TYPE(llama_sampling_params)
 REFL_END

-REFL_TYPE(llama_buffer)
-REFL_END
-
 REFL_TYPE(llm_arch)
 REFL_END

@@ -106,8 +100,8 @@ REFL_FIELD( params)
 REFL_FIELD( mirostat_mu)
 REFL_FIELD( grammar)
 REFL_FIELD( parsed_grammar)
-//REFL_FIELD( prev) vector of ints
-//REFL_FIELD( cur)
+REFL_FIELD( prev)
+REFL_FIELD( cur)
 REFL_END

 REFL_TYPE(llama_token_data )
@@ -183,87 +177,82 @@ REFL_TYPE(ggml_context_container)
 REFL_FIELD(context)
 REFL_END

-// REFL_TYPE(ggml_numa_node)
-// REFL_FIELD(cpus)
-// REFL_FIELD(n_cpus)
-// REFL_END
+REFL_TYPE(ggml_numa_node)
+REFL_FIELD(cpus)
+REFL_FIELD(n_cpus)
+REFL_END

-// REFL_TYPE(ggml_numa_nodes)
-// REFL_FIELD(nodes)
-// REFL_FIELD(n_nodes)
-// REFL_END
+REFL_TYPE(ggml_numa_nodes)
+REFL_FIELD(nodes)
+REFL_FIELD(n_nodes)
+REFL_END

-// REFL_TYPE(ggml_state)
-// REFL_FIELD(contexts)
-// REFL_FIELD(numa)
-// REFL_END
+REFL_TYPE(ggml_state)
+REFL_FIELD(contexts)
+REFL_FIELD(numa)
+REFL_END

-// REFL_TYPE(gguf_str)
-// REFL_FIELD(n)
-// REFL_FIELD(data)
-// REFL_END
+REFL_TYPE(gguf_str)
+REFL_FIELD(n)
+REFL_FIELD(data)
+REFL_END

-// REFL_TYPE(ggml_map_custom1_op_params)
-// REFL_FIELD(fun)
-// REFL_FIELD(n_tasks)
-// REFL_END
+REFL_TYPE(ggml_map_custom1_op_params)
+REFL_FIELD(fun)
+REFL_FIELD(n_tasks)
+REFL_END

-// REFL_TYPE(ggml_map_custom2_op_params)
-// REFL_FIELD(fun)
-// REFL_FIELD(n_tasks)
-// REFL_END
-
-// REFL_TYPE(ggml_map_custom3_op_params)
-// REFL_FIELD(fun)
-// REFL_FIELD(n_tasks)
-// REFL_END
-
-// REFL_TYPE(hash_map)
-// REFL_FIELD(set)
-// REFL_FIELD(vals)
-// REFL_END
-// REFL_TYPE(ggml_compute_state_shared)
-// REFL_FIELD(cgraph)
-// REFL_FIELD(cplan)
-// REFL_END
-// REFL_TYPE(ggml_compute_state)
-// REFL_FIELD(thrd)
-// REFL_FIELD(ith)
-// REFL_END
-// REFL_TYPE(ggml_lbfgs_iteration_data)
-// REFL_FIELD(alpha)
-// REFL_FIELD(ys)
-// REFL_END
-//REFL_TYPE()
-// REFL_FIELD(type)
-//REFL_END
-// REFL_TYPE(gguf_kv)
-// REFL_FIELD(key)
-// REFL_FIELD(type)
-// REFL_END
-
-// REFL_TYPE(gguf_header)
-// REFL_FIELD(magic)
-// REFL_FIELD(version)
-// REFL_END
-
-// REFL_TYPE(gguf_tensor_info)
-// REFL_FIELD(name)
-// REFL_FIELD(n_dims)
-// REFL_END
-
-REFL_TYPE(gguf_context)
-// REFL_FIELD(header)
-// REFL_FIELD(kv)
+REFL_TYPE(ggml_map_custom2_op_params)
+REFL_FIELD(fun)
+REFL_FIELD(n_tasks)
+REFL_END

-// REFL_TYPE(gguf_buf)
-// REFL_FIELD(data)
-// REFL_FIELD(size)
-// REFL_END
+REFL_TYPE(ggml_map_custom3_op_params)
+REFL_FIELD(fun)
+REFL_FIELD(n_tasks)
+REFL_END

-//REFL_TYPE(llama_token_data)
-//REFL_END
+REFL_TYPE(hash_map)
+REFL_FIELD(set)
+REFL_FIELD(vals)
+REFL_END
+REFL_TYPE(ggml_compute_state_shared)
+REFL_FIELD(cgraph)
+REFL_FIELD(cplan)
+REFL_END
+REFL_TYPE(ggml_compute_state)
+REFL_FIELD(thrd)
+REFL_FIELD(ith)
+REFL_END
+REFL_TYPE(ggml_lbfgs_iteration_data)
+REFL_FIELD(alpha)
+REFL_FIELD(ys)
+REFL_END
+
+REFL_TYPE(gguf_kv)
+REFL_FIELD(key)
+REFL_FIELD(type)
+REFL_END
+
+REFL_TYPE(gguf_header)
+REFL_FIELD(magic)
+REFL_FIELD(version)
+REFL_END
+
+REFL_TYPE(gguf_tensor_info)
+REFL_FIELD(name)
+REFL_FIELD(n_dims)
+REFL_END
+
+REFL_TYPE(gguf_context)
+REFL_FIELD(header)
+REFL_FIELD(kv)
+REFL_END
+
+REFL_TYPE(gguf_buf)
+REFL_FIELD(data)
+REFL_FIELD(size)
+REFL_END


 REFL_TYPE(llama_model_params)
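For context: each REFL_TYPE/REFL_FIELD/REFL_END group registers compile-time metadata with refl-cpp, which the print_fields() template at the bottom of this file iterates. A standalone registration-plus-iteration sketch (assumes refl.hpp on the include path; point is not a type from this codebase):

    #include <iostream>
    #include <refl.hpp>

    struct point { int x; int y; };

    REFL_TYPE(point)
    REFL_FIELD(x)
    REFL_FIELD(y)
    REFL_END

    void dump(const point & p) {
        // for_each visits every registered member descriptor
        refl::util::for_each(refl::reflect<point>().members, [&](auto member) {
            std::cout << member.name << " = " << member(p) << "\n";
        });
    }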
@@ -290,55 +279,55 @@ REFL_TYPE(llama_beams_state)
 REFL_FIELD(beam_views)
 REFL_END

-//REFL_TYPE(ggml_backend)
-//REFL_END
+REFL_TYPE(ggml_backend)
+REFL_END

 REFL_TYPE(ggml_backend_buffer)
 REFL_END

-//REFL_TYPE(ggml_allocr)
-//REFL_END
+REFL_TYPE(ggml_allocr)
+REFL_END

-//REFL_TYPE(ggml_tallocr)
-//REFL_END
+REFL_TYPE(ggml_tallocr)
+REFL_END

-//REFL_TYPE(ggml_gallocr)
-//REFL_END
+REFL_TYPE(ggml_gallocr)
+REFL_END


-//REFL_TYPE(llama_buffer)
-//REFL_FIELD(data)
-//REFL_FIELD(size)
-//REFL_END
+REFL_TYPE(llama_buffer)
+REFL_FIELD(data)
+REFL_FIELD(size)
+REFL_END


-// REFL_TYPE(llama_file)
-// REFL_FIELD(fp)
-// REFL_FIELD(size)
-// REFL_END
+REFL_TYPE(llama_file)
+REFL_FIELD(fp)
+REFL_FIELD(size)
+REFL_END


-// REFL_TYPE(llama_mmap)
-// REFL_FIELD(addr)
-// REFL_FIELD(size)
-// REFL_END
+REFL_TYPE(llama_mmap)
+REFL_FIELD(addr)
+REFL_FIELD(size)
+REFL_END


-// REFL_TYPE(llama_mlock)
-// REFL_FIELD(addr)
-// REFL_FIELD(size)
-// REFL_END
+REFL_TYPE(llama_mlock)
+REFL_FIELD(addr)
+REFL_FIELD(size)
+REFL_END

-//REFL_TYPE(llama_state)
-// REFL_FIELD(log_callback)
-// REFL_FIELD(log_callback_user_data)
-// REFL_END
+REFL_TYPE(llama_state)
+REFL_FIELD(log_callback)
+REFL_FIELD(log_callback_user_data)
+REFL_END


-// REFL_TYPE(llama_hparams)
-// REFL_FIELD(vocab_only)
-// REFL_FIELD(n_vocab)
-// REFL_END
+REFL_TYPE(llama_hparams)
+REFL_FIELD(vocab_only)
+REFL_FIELD(n_vocab)
+REFL_END


 REFL_TYPE(llama_cparams)
@@ -346,24 +335,21 @@ REFL_TYPE(llama_cparams)
 REFL_FIELD(n_batch)
 REFL_END

-//REFL_TYPE(llama_layer)
-// REFL_FIELD(attn_norm)
-// REFL_FIELD(attn_norm_b)
-//REFL_END
+REFL_TYPE(llama_layer)
+REFL_FIELD(attn_norm)
+REFL_FIELD(attn_norm_b)
+REFL_END

-// REFL_TYPE(llama_kv_cell)
-// REFL_FIELD(pos)
-// REFL_FIELD(delta)
-// REFL_END
+REFL_TYPE(llama_kv_cell)
+REFL_FIELD(pos)
+REFL_FIELD(delta)
+REFL_END

 REFL_TYPE(llama_kv_cache)
 REFL_FIELD(has_shift)
 REFL_FIELD(head)
 REFL_END

-// REFL_TYPE(llama_vocab)
-// REFL_END
-
 REFL_TYPE(e_model)
 REFL_END

@@ -389,29 +375,22 @@ REFL_FIELD( output_norm)
 REFL_FIELD( output_norm_b)
 REFL_FIELD( output)

-//REFL_FIELD( layers)
+REFL_FIELD( layers)

 REFL_FIELD( n_gpu_layers)

-//REFL_FIELD( gguf_kv) unordered map
+REFL_FIELD( gguf_kv) //unordered map
 REFL_FIELD( ctx)
 REFL_FIELD( buf)
-//REFL_FIELD( mapping) std::unique_ptr
-//REFL_FIELD( mlock_buf)
-//REFL_FIELD( mlock_mmap)
-//REFL_FIELD( tensors_by_name)
+REFL_FIELD( mapping) //std::unique_ptr
+REFL_FIELD( mlock_buf)
+REFL_FIELD( mlock_mmap)
+REFL_FIELD( tensors_by_name)
 REFL_FIELD( t_load_us)
 REFL_FIELD( t_start_us)

 REFL_END


-REFL_TYPE(llama_hparams)
-REFL_END
-
-//REFL_TYPE(std::vector<int> >)
-//REFL_END
-
 REFL_TYPE(llama_vocab)
 REFL_END

@@ -422,7 +401,7 @@ REFL_TYPE(llama_context)
 REFL_FIELD( cparams)
 //REFL_FIELD(model)
 REFL_FIELD(kv_self)
-//REFL_FIELD(rng) random numbers
+REFL_FIELD(rng) //random numbers
 REFL_FIELD(has_evaluated_once )
 REFL_FIELD( t_start_us)
 REFL_FIELD( t_load_us)
@@ -432,13 +411,13 @@ REFL_FIELD( t_p_eval_us )
 REFL_FIELD( n_sample )
 REFL_FIELD( n_p_eval )
 REFL_FIELD( n_eval )
-//REFL_FIELD( logits)
+REFL_FIELD( logits)
 REFL_FIELD( logits_all )
-//REFL_FIELD( embedding)
-//REFL_FIELD( work_buffer)
+REFL_FIELD( embedding)
+REFL_FIELD( work_buffer)
 REFL_FIELD( buf_compute)
 REFL_FIELD( buf_alloc)
-//REFL_FIELD( alloc )
+REFL_FIELD( alloc )

 #ifdef GGML_USE_METAL
 REFL_FIELD( ctx_metal )
@@ -450,108 +429,102 @@ REFL_FIELD( ctx_mpi )
 #endif
 REFL_END

-// REFL_TYPE(llama_model_loader)
-// REFL_FIELD(n_kv)
-// REFL_FIELD(n_tensors)
-// REFL_END
+REFL_TYPE(llama_model_loader)
+REFL_FIELD(n_kv)
+REFL_FIELD(n_tensors)
+REFL_END

-// REFL_TYPE(llm_build_context)
-// REFL_FIELD(model)
-// REFL_FIELD(hparams)
-// REFL_END
+REFL_TYPE(llm_build_context)
+// REFL_FIELD(model) cannot create pointer to reference member ‘llm_build_context::model’
+// REFL_FIELD(hparams) cannot create pointer to reference member ‘llm_build_context::hparams’
+REFL_END

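The two comments kept inside REFL_TYPE(llm_build_context) record a real limitation: refl-cpp field reflection is built on pointers to members, and C++ forbids forming a pointer to member that refers to a reference member. Illustrated in isolation:

    struct S {
        int & r;
    };
    // int & S::* pm = &S::r; // ill-formed: no pointer to a reference member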
-// REFL_TYPE(llm_offload_trie)
-// REFL_END
+REFL_TYPE(llm_offload_trie)
+REFL_END

-// REFL_TYPE(llm_symbol)
-// REFL_FIELD(prev)
-// REFL_END
+REFL_TYPE(llm_symbol)
+REFL_FIELD(prev)
+REFL_END

-// REFL_TYPE(llm_bigram_spm)
-// REFL_END
+REFL_TYPE(llm_bigram_spm)
+REFL_END

-// REFL_TYPE(llm_tokenizer_spm)
-// REFL_END
+REFL_TYPE(llm_tokenizer_spm)
+REFL_END

-// REFL_TYPE(llm_bigram_bpe)
-// REFL_END
+REFL_TYPE(llm_bigram_bpe)
+REFL_END

-// REFL_TYPE(llm_tokenizer_bpe)
-// REFL_END


-// REFL_TYPE(fragment_buffer_variant)
-// REFL_END


-// REFL_TYPE(llama_partial_utf8)
-// REFL_FIELD(value)
-// REFL_FIELD(n_remain)
-// REFL_END


-REFL_TYPE(llama_grammar)
-// REFL_FIELD(rules)
-// REFL_FIELD(stacks)
+REFL_TYPE(llm_tokenizer_bpe)
+REFL_END


-//REFL_TYPE(llama_grammar_candidate)
-// REFL_FIELD(index)
-// REFL_FIELD(code_points)
-//REFL_END
+REFL_TYPE(fragment_buffer_variant)
+REFL_END


-// REFL_TYPE(llama_beam)
-// REFL_FIELD(tokens)
-// REFL_FIELD(p)
-// REFL_END
+REFL_TYPE(llama_partial_utf8)
+REFL_FIELD(value)
+REFL_FIELD(n_remain)
+REFL_END


-// REFL_TYPE(llama_logit_info)
-// REFL_FIELD(logits)
-// REFL_FIELD(n_vocab)
-// REFL_END

-// REFL_TYPE(llama_beam_search_data)
-// REFL_FIELD(ctx)
-// REFL_FIELD(n_beams)
-// REFL_END


-// REFL_TYPE(quantize_state_internal)
-// REFL_FIELD(model)
-// REFL_FIELD(params)
-// REFL_END

-// REFL_TYPE(llama_data_context)
-// REFL_END
+REFL_TYPE(llama_grammar)
+REFL_FIELD(rules)
+REFL_FIELD(stacks)
+REFL_END

-// REFL_TYPE(llama_data_buffer_context)
-// REFL_FIELD(ptr)
-// REFL_END

-// REFL_TYPE(llama_data_file_context)
-// REFL_FIELD(file)
-// REFL_END
+REFL_TYPE(llama_grammar_candidate)
+REFL_FIELD(index)
+REFL_FIELD(code_points)
+REFL_END

 // // A simple struct with some fields and a function
 // // A custom attribute to mark some fields as hidden
 struct hidden : refl::attr::usage::field {};

 // // Another struct with some fields and a function, using the custom attribute
 // struct Person {
 // std::string name;
 // int age;
 // [[hidden]] std::string password;
 // void say_hello() const {
 // std::cout << "Hello, I'm " << name << " and I'm " << age << " years old.\n";
 // }
 // };
+REFL_TYPE(llama_beam)
+REFL_FIELD(tokens)
+REFL_FIELD(p)
+REFL_END


+REFL_TYPE(llama_logit_info)
+REFL_FIELD(logits)
+REFL_FIELD(n_vocab)
+REFL_END

+REFL_TYPE(llama_beam_search_data)
+REFL_FIELD(ctx)
+REFL_FIELD(n_beams)
+REFL_END


+REFL_TYPE(quantize_state_internal)
+// REFL_FIELD(model)
+REFL_FIELD(params)
+REFL_FIELD( n_attention_wv )
+REFL_FIELD( n_feed_forward_w2 )
+REFL_FIELD( i_attention_wv )
+REFL_FIELD( i_feed_forward_w2 )
+REFL_FIELD( n_k_quantized )
+REFL_FIELD( n_fallback )
+
+REFL_END

+REFL_TYPE(llama_data_context)
+REFL_END

+REFL_TYPE(llama_data_buffer_context)
+REFL_FIELD(ptr)
+REFL_END

+REFL_TYPE(llama_data_file_context)
+REFL_FIELD(file)
+REFL_END


 // // A generic function to print out the fields of any object
 template<typename T>
-void print_fields(const T& t) {
+void print_fields(const T& ) {
 //return;
 // // Get the type descriptor of the object
 constexpr auto type = refl::reflect<T>();
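Dropping the parameter name in print_fields(const T& ) keeps the signature intact and silences unused-parameter warnings while the body is still partly stubbed out. The same effect in isolation:

    template <typename T>
    void print_fields(const T & /*obj*/) { // unnamed: no -Wunused-parameter
        constexpr auto type = refl::reflect<T>(); // metadata is still reachable
        (void)type;
    }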