adding new header for llama internal

commit ebea708561
parent e34fffc77b

4 changed files with 590 additions and 47 deletions
.gitignore (vendored): 2 changes
@@ -100,3 +100,5 @@ tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
/#llama.cpp#
#*
\\#*
@@ -108,10 +108,7 @@ int main(int argc, char ** argv) {
    g_params = &params;

    //using Td = type_descriptor<gpt_params>;
    print_fields(params);

    //constexpr auto tbl = descriptor::get_attribute<gpt_params>(Td{});
    //constexpr auto tbl_name = REFL_MAKE_CONST_STRING(tbl.name);

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;

@@ -126,6 +123,7 @@ int main(int argc, char ** argv) {

    // TODO: Dump params ?
    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
    print_fields(params);

    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)

@@ -184,10 +182,6 @@ int main(int argc, char ** argv) {
    g_model = &model;
    g_ctx = &ctx;

    print_fields(*model);
    print_fields(*ctx);
    print_fields(*ctx_guidance);

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

@@ -247,7 +241,7 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd_inp;


    print_fields(*model);

    if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
        LOG("tokenize the prompt\n");

@@ -268,7 +262,7 @@ int main(int argc, char ** argv) {
        embd_inp.push_back(llama_token_bos(model));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }
    //print_fields(embd_inp);

    // Tokenize negative prompt
    std::vector<llama_token> guidance_inp;
    int guidance_offset = 0;

@@ -292,7 +286,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

    print_fields(*ctx);
    print_fields(*ctx_guidance);
    //print_fields(session_tokens);
    // debug message about similarity of saved session, if applicable
    size_t n_matching_session_tokens = 0;

@@ -643,7 +638,7 @@ int main(int argc, char ** argv) {
    }

    const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

    print_fields(id);
    llama_sampling_accept(ctx_sampling, ctx, id, true);

    LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
llama-internal.hpp (new file): 518 lines
@@ -0,0 +1,518 @@
#include <set>

enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_PERSIMMON,
    LLM_ARCH_REFACT,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_UNKNOWN,
};

enum llm_kv {
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
};

// available llama models
enum e_model {
    MODEL_UNKNOWN,
    MODEL_1B,
    MODEL_3B,
    MODEL_7B,
    MODEL_8B,
    MODEL_13B,
    MODEL_15B,
    MODEL_30B,
    MODEL_34B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
};

enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};

struct LLM_KV {
    LLM_KV(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    std::string operator()(llm_kv kv) const; // moved to llama.cpp file
};
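// Illustrative usage sketch, not part of this commit (assumes upstream
// llama.cpp's key naming, where the architecture name prefixes the key):
//
//     LLM_KV kv(LLM_ARCH_LLAMA);
//     std::string key = kv(LLM_KV_CONTEXT_LENGTH); // "llama.context_length"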
enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
};


struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;
    uint32_t n_threads;       // number of threads to use for generation
    uint32_t n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;
    float rope_freq_scale;

    uint32_t n_yarn_orig_ctx;
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;

    bool mul_mat_q;
};

struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm;
    struct ggml_tensor * attn_norm_b;
    struct ggml_tensor * attn_norm_2;
    struct ggml_tensor * attn_norm_2_b;
    struct ggml_tensor * attn_q_norm;
    struct ggml_tensor * attn_q_norm_b;
    struct ggml_tensor * attn_k_norm;
    struct ggml_tensor * attn_k_norm_b;

    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;
    struct ggml_tensor * wqkv;

    // attention bias
    struct ggml_tensor * bo;
    struct ggml_tensor * bqkv;

    // normalization
    struct ggml_tensor * ffn_norm;
    struct ggml_tensor * ffn_norm_b;

    // ff
    struct ggml_tensor * ffn_gate; // w1
    struct ggml_tensor * ffn_down; // w2
    struct ggml_tensor * ffn_up;   // w3

    // ff bias
    struct ggml_tensor * ffn_down_b; // b2
    struct ggml_tensor * ffn_up_b;   // b3
};

struct llama_kv_cell {
    llama_pos pos   = -1;
    llama_pos delta = 0;

    std::set<llama_seq_id> seq_id;

    bool has_seq_id(const llama_seq_id & id) const {
        return seq_id.find(id) != seq_id.end();
    }
};

struct llama_buffer {
    void * data = NULL;
    size_t size = 0;

    // fallback to malloc / free
    // useful in cases where CUDA can try to allocate PINNED memory
    bool fallback = false;

    void resize(size_t n);

    ~llama_buffer();
};

// ring-buffer of cached KV data
struct llama_kv_cache {
    bool has_shift = false;

    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_internal also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;

    // computed before each graph build
    uint32_t n = 0;

    std::vector<llama_kv_cell> cells;

    struct ggml_tensor * k = NULL;
    struct ggml_tensor * v = NULL;

    struct ggml_context * ctx = NULL;

    llama_buffer buf;

    ~llama_kv_cache() {
        if (ctx) {
            ggml_free(ctx);
        }

#ifdef GGML_USE_CUBLAS
        if (ggml_cublas_loaded()) {
            ggml_cuda_free_data(k);
            ggml_cuda_free_data(v);
        }
#endif
    }
};
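// Illustrative sketch of how `head`, `size` and `cells` cooperate, loosely
// following upstream llama.cpp's llama_kv_cache_find_slot. Not part of this
// commit, and simplified: the real function also assigns positions and
// seq_ids. A cell is free when pos < 0; scan for n_tokens consecutive free
// cells starting at `head`, wrapping at most once around the ring buffer.
//
//     static bool kv_find_slot(llama_kv_cache & cache, uint32_t n_tokens) {
//         uint32_t n_tested = 0;
//         while (true) {
//             if (cache.head + n_tokens > cache.size) { // would overrun: wrap
//                 n_tested += cache.size - cache.head;
//                 cache.head = 0;
//                 if (n_tested >= cache.size) return false; // no room anywhere
//                 continue;
//             }
//             bool found = true;
//             for (uint32_t i = 0; i < n_tokens; i++) {
//                 if (cache.cells[cache.head + i].pos >= 0) { // occupied
//                     found = false;
//                     cache.head += i + 1;
//                     n_tested   += i + 1;
//                     break;
//                 }
//             }
//             if (found) return true;
//             if (n_tested >= cache.size) return false;
//         }
//     }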

struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;

    struct token_data {
        token text;
        float score;
        ttype type;
    };

    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;

    std::unordered_map<token, id> token_to_id;
    std::vector<token_data>       id_to_token;

    std::unordered_map<token, id> special_tokens_cache;

    std::map<std::pair<std::string, std::string>, int> bpe_ranks;

    // default LLaMA special tokens
    id special_bos_id = 1;
    id special_eos_id = 2;
    id special_unk_id = 0;
    id special_sep_id = -1;
    id special_pad_id = -1;

    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.

    id linefeed_id       = 13;
    id special_prefix_id = 32007;
    id special_middle_id = 32009;
    id special_suffix_id = 32008;
    id special_eot_id    = 32010;

    int find_bpe_rank(std::string token_left, std::string token_right) const {
        GGML_ASSERT(token_left.find(" ") == std::string::npos);
        GGML_ASSERT(token_left.find("\n") == std::string::npos);
        GGML_ASSERT(token_right.find(" ") == std::string::npos);
        GGML_ASSERT(token_right.find("\n") == std::string::npos);

        auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
        if (it == bpe_ranks.end()) {
            return -1;
        }

        return it->second;
    }
};
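// Illustrative sketch of how find_bpe_rank drives BPE merging (hypothetical
// helper, not part of this header): BPE repeatedly merges the adjacent pair
// with the lowest rank, and -1 marks a pair as not mergeable.
//
//     static int best_merge_rank(const llama_vocab & vocab,
//                                const std::vector<std::string> & symbols) {
//         int best = -1;
//         for (size_t i = 0; i + 1 < symbols.size(); ++i) {
//             const int rank = vocab.find_bpe_rank(symbols[i], symbols[i + 1]);
//             if (rank != -1 && (best == -1 || rank < best)) {
//                 best = rank; // lower rank = earlier merge in the BPE table
//             }
//         }
//         return best; // -1 when no adjacent pair is mergeable
//     }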

struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false);
    ~llama_mmap();

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;
#else
    static constexpr bool SUPPORTED = false;
#endif
};


struct llama_hparams {
    bool     vocab_only;
    uint32_t n_vocab;
    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
    uint32_t n_head;
    uint32_t n_head_kv;
    uint32_t n_layer;
    uint32_t n_rot;
    uint32_t n_ff;

    float f_norm_eps;
    float f_norm_rms_eps;

    float    rope_freq_base_train;
    float    rope_freq_scale_train;
    uint32_t n_yarn_orig_ctx;
    int8_t   rope_scaling_type_train : 3;
    bool     rope_finetuned : 1;

    float f_clamp_kqv;
    float f_max_alibi_bias;

    bool operator!=(const llama_hparams & other) const;

    uint32_t n_gqa() const {
        return n_head/n_head_kv;
    }

    uint32_t n_embd_head() const {
        return n_embd/n_head;
    }

    uint32_t n_embd_gqa() const {
        return n_embd/n_gqa();
    }
};
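// Worked example of the helpers above, using LLaMA-2 70B's grouped-query
// attention shapes: n_embd = 8192, n_head = 64, n_head_kv = 8, so
// n_gqa() = 64/8 = 8, n_embd_head() = 8192/64 = 128, and
// n_embd_gqa() = 8192/8 = 1024, the per-layer width of the K/V projections.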

struct llama_mlock {
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;
    llama_mlock();

    llama_mlock(const llama_mlock &) = delete;
    ~llama_mlock();
    void init(void * ptr);
    void grow_to(size_t target_size);
#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;
    static size_t lock_granularity();
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
    bool raw_lock(const void * addr, size_t size) const;
#undef MLOCK_SUGGESTION
    static void raw_unlock(void * addr, size_t size);
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;
    static size_t lock_granularity();
    bool raw_lock(void * ptr, size_t len) const;
    static void raw_unlock(void * ptr, size_t len);
#else
    static constexpr bool SUPPORTED = false;
    static size_t lock_granularity();
    bool raw_lock(const void * addr, size_t len) const;
    static void raw_unlock(const void * addr, size_t len);
#endif
};


struct llama_model {
    e_model     type  = MODEL_UNKNOWN;
    llm_arch    arch  = LLM_ARCH_UNKNOWN;
    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

    std::string name = "n/a";

    llama_hparams hparams = {};
    llama_vocab   vocab;

    struct ggml_tensor * tok_embd;
    struct ggml_tensor * pos_embd;
    struct ggml_tensor * tok_norm;
    struct ggml_tensor * tok_norm_b;

    struct ggml_tensor * output_norm;
    struct ggml_tensor * output_norm_b;
    struct ggml_tensor * output;

    std::vector<llama_layer> layers;

    int n_gpu_layers;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    // context
    struct ggml_context * ctx = NULL;

    // the model memory buffer
    llama_buffer buf;

    // model memory mapped file
    std::unique_ptr<llama_mmap> mapping;

    // objects representing data potentially being locked in memory
    llama_mlock mlock_buf;
    llama_mlock mlock_mmap;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

    ~llama_model() {
        if (ctx) {
            ggml_free(ctx);
        }

#ifdef GGML_USE_CUBLAS
        if (ggml_cublas_loaded()) {
            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
                ggml_cuda_free_data(tensors_by_name[i].second);
            }
            ggml_cuda_free_scratch();
        }
#endif

#if defined(GGML_USE_CLBLAST)
        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cl_free_data(tensors_by_name[i].second);
        }
#endif
    }
};

struct llama_context {
    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
    ~llama_context();

    llama_cparams cparams;

    const llama_model & model;

    // key + value cache for the self attention
    struct llama_kv_cache kv_self;

    std::mt19937 rng;

    bool has_evaluated_once = false;

    int64_t t_start_us;
    int64_t t_load_us;
    int64_t t_sample_us = 0;
    int64_t t_p_eval_us = 0;
    int64_t t_eval_us   = 0;

    int32_t n_sample = 0; // number of tokens sampled
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    int32_t n_eval   = 0; // number of eval calls

    // decode output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
    bool logits_all = false;

    // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;

    // reusable buffer for `struct ggml_graph_plan.work_data`
    std::vector<uint8_t> work_buffer;

    // memory buffers used to evaluate the model
    llama_buffer buf_compute;

    llama_buffer buf_alloc;
    ggml_allocr * alloc = NULL;

#ifdef GGML_USE_METAL
    ggml_metal_context * ctx_metal = NULL;
#endif

#ifdef GGML_USE_MPI
    ggml_mpi_context * ctx_mpi = NULL;
#endif
};


struct LLM_TN {
    LLM_TN(llm_arch arch);

    llm_arch arch;

    std::string operator()(llm_tensor tensor) const;

    std::string operator()(llm_tensor tensor, const std::string & suffix) const;

    std::string operator()(llm_tensor tensor, int bid) const;

    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const;
};
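// Illustrative usage sketch, not part of this commit (assumes upstream
// llama.cpp's GGUF tensor naming; bid is the block/layer index):
//
//     LLM_TN tn(LLM_ARCH_LLAMA);
//     tn(LLM_TENSOR_TOKEN_EMBD, "weight"); // "token_embd.weight"
//     tn(LLM_TENSOR_ATTN_Q, "weight", 0);  // "blk.0.attn_q.weight"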
print.hpp: 94 changes
@@ -56,9 +56,9 @@ REFL_FIELD(prompt_file )
REFL_FIELD(path_prompt_cache )
REFL_FIELD(input_prefix )
REFL_FIELD(input_suffix )
REFL_FIELD( antiprompt)
//REFL_FIELD( antiprompt)
REFL_FIELD(logdir )
REFL_FIELD( lora_adapter)
//REFL_FIELD( lora_adapter)
REFL_FIELD(lora_base )
REFL_FIELD( ppl_stride )
REFL_FIELD( ppl_output_type )

@@ -92,14 +92,22 @@ REFL_FIELD( image)
REFL_END

REFL_TYPE(llama_sampling_params)
REFL_END

REFL_TYPE(llama_buffer)
REFL_END

REFL_TYPE(llm_arch)
REFL_END

REFL_TYPE(llama_sampling_context )
REFL_FIELD( params)
REFL_FIELD( mirostat_mu)
REFL_FIELD( grammar)
REFL_FIELD( parsed_grammar)
REFL_FIELD( prev)
REFL_FIELD( cur)
//REFL_FIELD( prev) vector of ints
//REFL_FIELD( cur)
REFL_END

REFL_TYPE(llama_token_data )

@@ -333,10 +341,10 @@ REFL_END
// REFL_END

//REFL_TYPE(llama_cparams)
// REFL_FIELD(n_ctx)
// REFL_FIELD(n_batch)
//REFL_END
REFL_TYPE(llama_cparams)
REFL_FIELD(n_ctx)
REFL_FIELD(n_batch)
REFL_END

//REFL_TYPE(llama_layer)
// REFL_FIELD(attn_norm)

@@ -348,14 +356,20 @@ REFL_END
// REFL_FIELD(delta)
// REFL_END

// REFL_TYPE(llama_kv_cache)
// REFL_FIELD(has_shift)
// REFL_FIELD(head)
// REFL_END
REFL_TYPE(llama_kv_cache)
REFL_FIELD(has_shift)
REFL_FIELD(head)
REFL_END

// REFL_TYPE(llama_vocab)
// REFL_END

REFL_TYPE(e_model)
REFL_END

REFL_TYPE(llama_ftype)
REFL_END

REFL_TYPE(llama_model)
REFL_FIELD(type)
REFL_FIELD(arch)

@@ -375,27 +389,40 @@ REFL_FIELD( output_norm)
REFL_FIELD( output_norm_b)
REFL_FIELD( output)

REFL_FIELD( layers)
//REFL_FIELD( layers)

REFL_FIELD( n_gpu_layers)

REFL_FIELD( gguf_kv)
//REFL_FIELD( gguf_kv) unordered map
REFL_FIELD( ctx)
REFL_FIELD( buf)
REFL_FIELD( mapping)
REFL_FIELD( mlock_buf)
REFL_FIELD( mlock_mmap)
REFL_FIELD( tensors_by_name)
//REFL_FIELD( mapping) std::unique_ptr
//REFL_FIELD( mlock_buf)
//REFL_FIELD( mlock_mmap)
//REFL_FIELD( tensors_by_name)
REFL_FIELD( t_load_us)
REFL_FIELD( t_start_us)

REFL_END


REFL_TYPE(llama_hparams)
REFL_END

//REFL_TYPE(std::vector<int> >)
//REFL_END

REFL_TYPE(llama_vocab)
REFL_END

REFL_TYPE(grammar_parser::parse_state)
REFL_END

REFL_TYPE(llama_context)
REFL_FIELD( cparams)
//REFL_FIELD(model)
REFL_FIELD(kv_self)
REFL_FIELD(rng)
//REFL_FIELD(rng) random numbers
REFL_FIELD(has_evaluated_once )
REFL_FIELD( t_start_us)
REFL_FIELD( t_load_us)

@@ -405,10 +432,10 @@ REFL_FIELD( t_p_eval_us )
REFL_FIELD( n_sample )
REFL_FIELD( n_p_eval )
REFL_FIELD( n_eval )
REFL_FIELD( logits)
//REFL_FIELD( logits)
REFL_FIELD( logits_all )
REFL_FIELD( embedding)
REFL_FIELD( work_buffer)
//REFL_FIELD( embedding)
//REFL_FIELD( work_buffer)
REFL_FIELD( buf_compute)
REFL_FIELD( buf_alloc)
REFL_FIELD( alloc )
@@ -524,7 +551,7 @@ struct hidden : refl::attr::usage::field {};

// // A generic function to print out the fields of any object
template<typename T>
void print_fields(const T& ) {
void print_fields(const T& t) {
    // // Get the type descriptor of the object
    constexpr auto type = refl::reflect<T>();

@@ -533,20 +560,21 @@ void print_fields(const T& ) {
    std::cout << "DEBUG Type: " << type.name.c_str() << "\n";

    // T instance{};
    for_each(refl::reflect<T>().members, [&](auto member) {
    //for_each(refl::reflect<T>().members, [&](auto member) {

        std::cout << "MEMBER:" << member.name.str() << "\n";
        //std::cout << "MEMBER:" << member.name.str() << "\n";

    });
    //});

    refl::util::for_each(type.members, [&](auto member) {
        // // Check if the member is a field and not hidden
        //if ((refl::descriptor::is_field(member)) && (!member.has_attribute<hidden>()))) {
        //if ((refl::descriptor::is_field(member))) {
        // // Print the member name and value
        std::cout
            << "Auto:" << member.name << ": " << "\n";
        // refl::get(member, obj)

        auto member1 = member(t);
        //if(member1){
        std::cout << "Auto:" << member.name <<"\n";
        print_fields(member1);
        //}
        //else {
        //std::cout << "NULL:" << member.name <<"\n";
        //}
    });
    std::cout << "\n";
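
A self-contained sketch of the refl-cpp idiom print_fields relies on, using a
hypothetical `point` type rather than the llama structs: register the type
with REFL_TYPE/REFL_FIELD, then iterate the reflected members, where the
member descriptor is invocable on an instance to read the field's value.

    #include <iostream>
    #include "refl.hpp" // refl-cpp single-header library

    struct point { int x; int y; };

    REFL_TYPE(point)
    REFL_FIELD(x)
    REFL_FIELD(y)
    REFL_END

    int main() {
        point p{1, 2};
        // iterate the reflected members; member(p) reads the field's value
        refl::util::for_each(refl::reflect<point>().members, [&](auto member) {
            std::cout << member.name << " = " << member(p) << "\n";
        });
    }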