now has a model

2023-11-23 16:32:46 -05:00 · 2023-11-23 16:32:46 -05:00 · e34fffc77b
commit e34fffc77b
parent 8a8859ced4
6 changed files with 272 additions and 616 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -184,8 +184,9 @@ int main(int argc, char ** argv) {
    g_model = &model;
    g_ctx = &ctx;
-    print_fields(g_model);
+    print_fields(*model);
-    print_fields(g_ctx);
+    print_fields(*ctx);
    print_fields(*ctx_guidance);
    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@ -488,7 +489,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd_guidance;
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-    print_fields(ctx_sampling);
+    print_fields(*ctx_sampling);
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@ -525,7 +526,7 @@ int main(int argc, char ** argv) {
                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);
-		print_fields(ctx);
+		print_fields(*ctx);
                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
--- a/ggml-alloc.cpp
+++ b/ggml-alloc.cpp
@ -8,9 +8,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-
+#include "ggml-internal.hpp"
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MAX_FREE_BLOCKS 256
+
 //#define GGML_ALLOCATOR_DEBUG
@ -24,28 +24,7 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
    return offset + align;
 }
 struct free_block {
    void * addr;
    size_t size;
 };
 struct ggml_tallocr {
    struct ggml_backend_buffer * buffer;
    bool buffer_owned;
    void * base;
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
    size_t max_size;
    bool measure;
 #ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];
 #endif
 };
 #ifdef GGML_ALLOCATOR_DEBUG
 static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
@ -333,21 +312,6 @@ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
 // graph allocator
 struct hash_node {
    int n_children;
    int n_views;
 };
 struct ggml_gallocr {
    ggml_tallocr_t talloc;
    struct ggml_hash_set hash_set;
    struct hash_node * hash_values;
    size_t hash_values_size;
    ggml_tallocr_t * hash_allocs;
    int * parse_seq;
    int parse_seq_len;
 };
 ggml_gallocr_t ggml_gallocr_new(void) {
    ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
@ -700,10 +664,6 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
 // legacy API wrapper
 struct ggml_allocr {
    ggml_tallocr_t talloc;
    ggml_gallocr_t galloc;
 };
 static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
    ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
--- a/ggml-internal.hpp
+++ b/ggml-internal.hpp
@ -38,56 +38,55 @@ struct ggml_context_container {
  }
 };
 typedef int ggml_lock_t;
 typedef pthread_t ggml_thread_t;
 typedef int ggml_lock_t;
 typedef pthread_t ggml_thread_t;
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef HANDLE pthread_t;
 typedef DWORD thread_ret_t;
 typedef void * thread_ret_t;
 typedef double ggml_float;
 typedef void * thread_ret_t;
-#define ggml_lock_init(x)    UNUSED(x)
+#define MAX_FREE_BLOCKS 256
 #define ggml_lock_destroy(x) UNUSED(x)
 #define ggml_lock_lock(x)    UNUSED(x)
 #define ggml_lock_unlock(x)  UNUSED(x)
-#define GGML_LOCK_INITIALIZER 0
+struct free_block {
    void * addr;
    size_t size;
 };
 struct ggml_tallocr {
    struct ggml_backend_buffer * buffer;
    bool buffer_owned;
    void * base;
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
-#define ggml_thread_create pthread_create
+    size_t max_size;
 #define ggml_thread_join   pthread_join
    bool measure;
-
+#ifdef GGML_ALLOCATOR_DEBUG
-//typedef pthread_spinlock_t ggml_lock_t;
+    struct ggml_tensor * allocated_tensors[1024];
 //#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
 //#define ggml_lock_destroy pthread_spin_destroy
 //#define ggml_lock_lock    pthread_spin_lock
 //#define ggml_lock_unlock  pthread_spin_unlock
 #define ggml_lock_init(x)    UNUSED(x)
 #define ggml_lock_destroy(x) UNUSED(x)
 #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
 #define ggml_lock_lock(x)    _mm_pause()
 #else
 #define ggml_lock_lock(x)    UNUSED(x)
 #endif
-#define ggml_lock_unlock(x)  UNUSED(x)
+};
 #define GGML_LOCK_INITIALIZER 0
 struct hash_node {
    int n_children;
    int n_views;
 };
-#define ggml_thread_create pthread_create
+typedef struct ggml_tallocr * ggml_tallocr_t;
-#define ggml_thread_join   pthread_join
+typedef struct ggml_gallocr * ggml_gallocr_t;
 struct ggml_gallocr {
    ggml_tallocr_t talloc;
    struct ggml_hash_set hash_set;
    struct hash_node * hash_values;
    size_t hash_values_size;
    ggml_tallocr_t * hash_allocs;
    int * parse_seq;
    int parse_seq_len;
 };
 struct ggml_allocr {
    ggml_tallocr_t talloc;
    ggml_gallocr_t galloc;
 };
--- a/ggml.cpp
+++ b/ggml.cpp
@ -48,6 +48,8 @@
 #include <windows.h>
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 static void atomic_store(atomic_int * ptr, LONG val) {
    InterlockedExchange(ptr, val);
@ -15723,6 +15725,49 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
 //
 //#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
 typedef int ggml_lock_t;
 #define ggml_lock_init(x)    UNUSED(x)
 #define ggml_lock_destroy(x) UNUSED(x)
 #define ggml_lock_lock(x)    UNUSED(x)
 #define ggml_lock_unlock(x)  UNUSED(x)
 #define GGML_LOCK_INITIALIZER 0
 typedef pthread_t ggml_thread_t;
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 #else
 //typedef pthread_spinlock_t ggml_lock_t;
 //#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
 //#define ggml_lock_destroy pthread_spin_destroy
 //#define ggml_lock_lock    pthread_spin_lock
 //#define ggml_lock_unlock  pthread_spin_unlock
 typedef int ggml_lock_t;
 #define ggml_lock_init(x)    UNUSED(x)
 #define ggml_lock_destroy(x) UNUSED(x)
 #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
 #define ggml_lock_lock(x)    _mm_pause()
 #else
 #define ggml_lock_lock(x)    UNUSED(x)
 #endif
 #define ggml_lock_unlock(x)  UNUSED(x)
 #define GGML_LOCK_INITIALIZER 0
 typedef pthread_t ggml_thread_t;
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 #endif
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__linux__) && !defined(__BIONIC__)
--- a/llama.cpp
+++ b/llama.cpp
@ -77,6 +77,8 @@
 #include <thread>
 #include <unordered_map>
 #include "llama-internal.hpp"
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@ -176,25 +178,11 @@ static std::string format(const char * fmt, ...) {
    return std::string(buf.data(), size);
 }
 //
 // gguf constants (sync with gguf.py)
 //
 enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_PERSIMMON,
    LLM_ARCH_REFACT,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_UNKNOWN,
 };
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA,           "llama"     },
@ -211,55 +199,6 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
    { LLM_ARCH_STABLELM,        "stablelm"  },
 };
 enum llm_kv {
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,
    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
 };
 static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_GENERAL_ARCHITECTURE,          "general.architecture"                  },
@ -311,38 +250,6 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_TOKENIZER_RWKV,                "tokenizer.rwkv.world"              },
 };
 struct LLM_KV {
    LLM_KV(llm_arch arch) : arch(arch) {}
    llm_arch arch;
    std::string operator()(llm_kv kv) const {
 	return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
    }
 };
 enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
 };
 static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
    {
@ -547,27 +454,6 @@ static llm_arch llm_arch_from_string(const std::string & name) {
 //   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
 //   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
 //
 struct LLM_TN {
    LLM_TN(llm_arch arch) : arch(arch) {}
    llm_arch arch;
    std::string operator()(llm_tensor tensor) const {
 	return LLM_TENSOR_NAMES[arch].at(tensor);
    }
    std::string operator()(llm_tensor tensor, const std::string & suffix) const {
 	return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
    }
    std::string operator()(llm_tensor tensor, int bid) const {
 	return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
    }
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
 	return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
    }
 };
 //
 // gguf helpers
@ -723,15 +609,9 @@ static std::string llama_format_win_err(DWORD err) {
 }
 #endif
-struct llama_buffer {
+//struct llama_buffer {
    void * data = NULL;
    size_t size = 0;
-    // fallback to malloc / free
+void llama_buffer::resize(size_t n) {
    // useful in cases where CUDA can try to allocate PINNED memory
    bool fallback = false;
    void resize(size_t n) {
 	llama_host_free(data);
 	data = llama_host_malloc(n);
@ -746,7 +626,7 @@ struct llama_buffer {
 	size = n;
    }
-    ~llama_buffer() {
+llama_buffer::~llama_buffer() {
 	if (data) {
 	    if (fallback) { // NOLINT
 		free(data);
@ -757,7 +637,7 @@ struct llama_buffer {
 	data = NULL;
    }
-};
+
 struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
@ -835,16 +715,11 @@ struct llama_file {
    }
 };
-struct llama_mmap {
+//
    void * addr;
    size_t size;
    llama_mmap(const llama_mmap &) = delete;
 #ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
+llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch , bool numa ) {
    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
 	size = file->size;
 	int fd = fileno(file->fp);
 	int flags = MAP_SHARED;
@ -875,13 +750,12 @@ struct llama_mmap {
 	}
    }
-    ~llama_mmap() {
+llama_mmap::~llama_mmap() {
 	munmap(addr, size);
    }
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch = 1, bool numa = false) {
 	(void) numa;
 	size = file->size;
@ -903,7 +777,7 @@ struct llama_mmap {
 	    throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
 	}
-	if (prefetch) {
+	if (prefetch == 1) {
 	    // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
 	    BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
 	    HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
@ -924,7 +798,7 @@ struct llama_mmap {
 	}
    }
-    ~llama_mmap() {
+llama_mmap::~llama_mmap() {
 	if (!UnmapViewOfFile(addr)) {
 	    fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
 		    llama_format_win_err(GetLastError()).c_str());
@ -941,31 +815,27 @@ struct llama_mmap {
 	throw std::runtime_error(std::string("mmap not supported"));
    }
 #endif
-};
+
 // Represents some region of memory being locked using mlock or VirtualLock;
 // will automatically unlock on destruction.
-struct llama_mlock {
+// llama_mlock 
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;
-    llama_mlock() {}
+llama_mlock::llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;
-    ~llama_mlock() {
+llama_mlock::~llama_mlock() {
 	if (size) {
 	    raw_unlock(addr, size);
 	}
    }
-    void init(void * ptr) {
+void llama_mlock::init(void * ptr) {
 	GGML_ASSERT(addr == NULL && size == 0); // NOLINT
 	addr = ptr;
    }
-    void grow_to(size_t target_size) {
+void llama_mlock::grow_to(size_t target_size) {
 	GGML_ASSERT(addr);
 	if (failed_already) {
 	    return;
@ -982,9 +852,8 @@ struct llama_mlock {
    }
 #ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;
-    static size_t lock_granularity() {
+size_t llama_mlock::lock_granularity() {
 	return (size_t) sysconf(_SC_PAGESIZE);
    }
@ -997,7 +866,7 @@ struct llama_mlock {
 	    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
    #endif
-    bool raw_lock(const void * addr, size_t size) const {
+    bool llama_mlock::raw_lock(const void * addr, size_t size) const {
 	if (!mlock(addr, size)) {
 	    return true;
 	}
@ -1021,21 +890,21 @@ struct llama_mlock {
    #undef MLOCK_SUGGESTION
-    static void raw_unlock(void * addr, size_t size) {
+ void llama_mlock::raw_unlock(void * addr, size_t size) {
 	if (munlock(addr, size)) {
 	    fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
 	}
    }
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;
-    static size_t lock_granularity() {
+
 size_t llama_mlock::lock_granularity() {
 	SYSTEM_INFO si;
 	GetSystemInfo(&si);
 	return (size_t) si.dwPageSize;
    }
-    bool raw_lock(void * ptr, size_t len) const {
+    bool llama_mlock::raw_lock(void * ptr, size_t len) const {
 	for (int tries = 1; ; tries++) {
 	    if (VirtualLock(ptr, len)) {
 		return true;
@ -1070,27 +939,26 @@ struct llama_mlock {
 	}
    }
-    static void raw_unlock(void * ptr, size_t len) {
+    static void llama_mlock::raw_unlock(void * ptr, size_t len) {
 	if (!VirtualUnlock(ptr, len)) {
 	    fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
 		    llama_format_win_err(GetLastError()).c_str());
 	}
    }
 #else
    static constexpr bool SUPPORTED = false;
-    static size_t lock_granularity() {
+    static size_t llama_mlock::lock_granularity() {
 	return (size_t) 65536;
    }
-    bool raw_lock(const void * addr, size_t len) const {
+    bool llama_mlock::raw_lock(const void * addr, size_t len) const {
 	fprintf(stderr, "warning: mlock not supported on this system\n");
 	return false;
    }
-    static void raw_unlock(const void * addr, size_t len) {}
+    static void llama_mlock::raw_unlock(const void * addr, size_t len) {}
 #endif
-};
+
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
@ -1125,364 +993,12 @@ struct llama_state {
 static llama_state g_state;
 // available llama models
 enum e_model {
    MODEL_UNKNOWN,
    MODEL_1B,
    MODEL_3B,
    MODEL_7B,
    MODEL_8B,
    MODEL_13B,
    MODEL_15B,
    MODEL_30B,
    MODEL_34B,
    MODEL_40B,
    MODEL_65B,
    MODEL_70B,
 };
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
 struct llama_hparams {
    bool     vocab_only;
    uint32_t n_vocab;
    uint32_t n_ctx_train; // context size the model was trained on
    uint32_t n_embd;
    uint32_t n_head;
    uint32_t n_head_kv;
    uint32_t n_layer;
    uint32_t n_rot;
    uint32_t n_ff;
    float f_norm_eps;
    float f_norm_rms_eps;
    float    rope_freq_base_train;
    float    rope_freq_scale_train;
    uint32_t n_yarn_orig_ctx;
    int8_t   rope_scaling_type_train : 3;
    bool     rope_finetuned : 1;
    float f_clamp_kqv;
    float f_max_alibi_bias;
    bool operator!=(const llama_hparams & other) const {
 	if (this->vocab_only  != other.vocab_only)  return true;
 	if (this->n_vocab     != other.n_vocab)     return true;
 	if (this->n_ctx_train != other.n_ctx_train) return true;
 	if (this->n_embd      != other.n_embd)      return true;
 	if (this->n_head      != other.n_head)      return true;
 	if (this->n_head_kv   != other.n_head_kv)   return true;
 	if (this->n_layer     != other.n_layer)     return true;
 	if (this->n_rot       != other.n_rot)       return true;
 	if (this->n_ff        != other.n_ff)        return true;
 	if (this->rope_finetuned  != other.rope_finetuned)  return true;
 	if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 	const float EPSILON = 1e-9;
 	if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
 	if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
 	if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
 	if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 	return false;
    }
    uint32_t n_gqa() const {
 	return n_head/n_head_kv;
    }
    uint32_t n_embd_head() const {
 	return n_embd/n_head;
    }
    uint32_t n_embd_gqa() const {
 	return n_embd/n_gqa();
    }
 };
 struct llama_cparams {
    uint32_t n_ctx;       // context size used during inference
    uint32_t n_batch;
    uint32_t n_threads;       // number of threads to use for generation
    uint32_t n_threads_batch; // number of threads to use for batch processing
    float    rope_freq_base;
    float    rope_freq_scale;
    uint32_t n_yarn_orig_ctx;
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;
    bool mul_mat_q;
 };
 struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm;
    struct ggml_tensor * attn_norm_b;
    struct ggml_tensor * attn_norm_2;
    struct ggml_tensor * attn_norm_2_b;
    struct ggml_tensor * attn_q_norm;
    struct ggml_tensor * attn_q_norm_b;
    struct ggml_tensor * attn_k_norm;
    struct ggml_tensor * attn_k_norm_b;
    // attention
    struct ggml_tensor * wq;
    struct ggml_tensor * wk;
    struct ggml_tensor * wv;
    struct ggml_tensor * wo;
    struct ggml_tensor * wqkv;
    // attention bias
    struct ggml_tensor * bo;
    struct ggml_tensor * bqkv;
    // normalization
    struct ggml_tensor * ffn_norm;
    struct ggml_tensor * ffn_norm_b;
    // ff
    struct ggml_tensor * ffn_gate; // w1
    struct ggml_tensor * ffn_down; // w2
    struct ggml_tensor * ffn_up;   // w3
    // ff bias
    struct ggml_tensor * ffn_down_b; // b2
    struct ggml_tensor * ffn_up_b;   // b3
 };
 struct llama_kv_cell {
    llama_pos pos   = -1;
    llama_pos delta = 0;
    std::set<llama_seq_id> seq_id;
    bool has_seq_id(const llama_seq_id & id) const {
 	return seq_id.find(id) != seq_id.end();
    }
 };
 // ring-buffer of cached KV data
 struct llama_kv_cache {
    bool has_shift = false;
    // Note: The value of head isn't only used to optimize searching
    // for a free KV slot. llama_decode_internal also uses it, so it
    // cannot be freely changed after a slot has been allocated.
    uint32_t head = 0;
    uint32_t size = 0;
    // computed before each graph build
    uint32_t n = 0;
    std::vector<llama_kv_cell> cells;
    struct ggml_tensor * k = NULL;
    struct ggml_tensor * v = NULL;
    struct ggml_context * ctx = NULL;
    llama_buffer buf;
    ~llama_kv_cache() {
 	if (ctx) {
 	    ggml_free(ctx);
 	}
 #ifdef GGML_USE_CUBLAS
 	if (ggml_cublas_loaded()) {
 	    ggml_cuda_free_data(k);
 	    ggml_cuda_free_data(v);
 	}
 #endif
    }
 };
 struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;
    struct token_data {
 	token text;
 	float score;
 	ttype type;
    };
    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
    std::unordered_map<token, id> token_to_id;
    std::vector<token_data>       id_to_token;
    std::unordered_map<token, id> special_tokens_cache;
    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
    // default LLaMA special tokens
    id special_bos_id = 1;
    id special_eos_id = 2;
    id special_unk_id = 0;
    id special_sep_id = -1;
    id special_pad_id = -1;
    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
    id linefeed_id       = 13;
    id special_prefix_id = 32007;
    id special_middle_id = 32009;
    id special_suffix_id = 32008;
    id special_eot_id    = 32010;
    int find_bpe_rank(std::string token_left, std::string token_right) const {
 	GGML_ASSERT(token_left.find(" ") == std::string::npos);
 	GGML_ASSERT(token_left.find("\n") == std::string::npos);
 	GGML_ASSERT(token_right.find(" ") == std::string::npos);
 	GGML_ASSERT(token_right.find("\n") == std::string::npos);
 	auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
 	if (it == bpe_ranks.end()) {
 	    return -1;
 	}
 	return it->second;
    }
 };
 struct llama_model {
    e_model     type  = MODEL_UNKNOWN;
    llm_arch    arch  = LLM_ARCH_UNKNOWN;
    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
    std::string name = "n/a";
    llama_hparams hparams = {};
    llama_vocab   vocab;
    struct ggml_tensor * tok_embd;
    struct ggml_tensor * pos_embd;
    struct ggml_tensor * tok_norm;
    struct ggml_tensor * tok_norm_b;
    struct ggml_tensor * output_norm;
    struct ggml_tensor * output_norm_b;
    struct ggml_tensor * output;
    std::vector<llama_layer> layers;
    int n_gpu_layers;
    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;
    // context
    struct ggml_context * ctx = NULL;
    // the model memory buffer
    llama_buffer buf;
    // model memory mapped file
    std::unique_ptr<llama_mmap> mapping;
    // objects representing data potentially being locked in memory
    llama_mlock mlock_buf;
    llama_mlock mlock_mmap;
    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
    int64_t t_load_us = 0;
    int64_t t_start_us = 0;
    ~llama_model() {
 	if (ctx) {
 	    ggml_free(ctx);
 	}
 #ifdef GGML_USE_CUBLAS
 	if (ggml_cublas_loaded()) {
 	    for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 		ggml_cuda_free_data(tensors_by_name[i].second);
 	    }
 	    ggml_cuda_free_scratch();
 	}
 #endif
 #if defined(GGML_USE_CLBLAST)
 	for (size_t i = 0; i < tensors_by_name.size(); ++i) {
 	    ggml_cl_free_data(tensors_by_name[i].second);
 	}
 #endif
    }
 };
 struct llama_context {
    llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
    ~llama_context() {
 #ifdef GGML_USE_METAL
 	if (ctx_metal) {
 	    ggml_metal_free(ctx_metal);
 	}
 #endif
 	if (alloc) {
 	    ggml_allocr_free(alloc);
 	}
    }
    llama_cparams cparams;
    const llama_model & model;
    // key + value cache for the self attention
    struct llama_kv_cache kv_self;
    std::mt19937 rng;
    bool has_evaluated_once = false;
    int64_t t_start_us;
    int64_t t_load_us;
    int64_t t_sample_us = 0;
    int64_t t_p_eval_us = 0;
    int64_t t_eval_us   = 0;
    int32_t n_sample = 0; // number of tokens sampled
    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    int32_t n_eval   = 0; // number of eval calls
    // decode output (2-dimensional array: [n_tokens][n_vocab])
    std::vector<float> logits;
    bool logits_all = false;
    // input embedding (1-dimensional array: [n_embd])
    std::vector<float> embedding;
    // reusable buffer for `struct ggml_graph_plan.work_data`
    std::vector<uint8_t> work_buffer;
    // memory buffers used to evaluate the model
    llama_buffer buf_compute;
    llama_buffer buf_alloc;
    ggml_allocr * alloc = NULL;
 #ifdef GGML_USE_METAL
    ggml_metal_context * ctx_metal = NULL;
 #endif
 #ifdef GGML_USE_MPI
    ggml_mpi_context * ctx_mpi = NULL;
 #endif
 };
 //
 // kv cache helpers
@ -1731,11 +1247,6 @@ static void llama_kv_cache_seq_shift(
 // model loading and saving
 //
 enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
 };
 static const char * llama_file_version_name(llama_fver version) {
    switch (version) {
@ -6328,6 +5839,32 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
    }
 }
 // struct 
 bool llama_hparams::operator!=(const llama_hparams & other) const {
  if (this->vocab_only  != other.vocab_only)  return true;
  if (this->n_vocab     != other.n_vocab)     return true;
  if (this->n_ctx_train != other.n_ctx_train) return true;
  if (this->n_embd      != other.n_embd)      return true;
 	if (this->n_head      != other.n_head)      return true;
 	if (this->n_head_kv   != other.n_head_kv)   return true;
 	if (this->n_layer     != other.n_layer)     return true;
 	if (this->n_rot       != other.n_rot)       return true;
 	if (this->n_ff        != other.n_ff)        return true;
 	if (this->rope_finetuned  != other.rope_finetuned)  return true;
 	if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 	const float EPSILON = 1e-9;
 	if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
 	if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
 	if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
 	if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 	return false;
    }
 static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
    std::vector<llama_vocab::id> output;
@ -9713,3 +9250,40 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
    fputs(text, stderr);
    fflush(stderr);
 }
 // LLM_TN 
 LLM_TN::LLM_TN(llm_arch arch) : arch(arch) {}
 std::string LLM_TN::operator()(llm_tensor tensor) const {
 	return LLM_TENSOR_NAMES[arch].at(tensor);
    }
    std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix) const {
 	return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
    }
    std::string LLM_TN::operator()(llm_tensor tensor, int bid) const {
 	return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
    }
    std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
 	return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
    }
 std::string LLM_KV::operator()(llm_kv kv) const {
  return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
 }
 llama_context::~llama_context() {
 #ifdef GGML_USE_METAL
 	if (ctx_metal) {
 	    ggml_metal_free(ctx_metal);
 	}
 #endif
 	if (alloc) {
 	    ggml_allocr_free(alloc);
 	}
    }
--- a/print.hpp
+++ b/print.hpp
@ -3,6 +3,8 @@
 #include <iostream>
 //#include <refl.hpp>
 #include "llama.h"
 #include "ggml-internal.hpp"
 #include "llama-internal.hpp"
 REFL_TYPE(ggml_init_params )
 REFL_END
@ -92,6 +94,12 @@ REFL_END
 REFL_TYPE(llama_sampling_context )
 REFL_FIELD( params)
 REFL_FIELD( mirostat_mu)
 REFL_FIELD( grammar)
 REFL_FIELD( parsed_grammar)
 REFL_FIELD( prev)
 REFL_FIELD( cur)
 REFL_END
 REFL_TYPE(llama_token_data )
@ -148,15 +156,24 @@ REFL_TYPE(ggml_something)
  REFL_FIELD(type_name)
 REFL_END
-// REFL_TYPE(ggml_context)
+REFL_TYPE(ggml_context)
-//   REFL_FIELD(mem_size)
+  REFL_FIELD(mem_size)
-//   REFL_FIELD(mem_buffer)
+REFL_FIELD(mem_buffer)
-// REFL_END
+REFL_FIELD(mem_buffer_owned)
 REFL_FIELD(    no_alloc)
 REFL_FIELD(    no_alloc_save)
 REFL_FIELD(    n_objects)
 REFL_FIELD(    objects_begin)
 REFL_FIELD(    objects_end)
 REFL_FIELD(    scratch)
 REFL_FIELD(    scratch_save)
-//REFL_TYPE(ggml_context_container)
+REFL_END
-//  REFL_FIELD(used)
+
-//  REFL_FIELD(context)
+REFL_TYPE(ggml_context_container)
-//REFL_END
+  REFL_FIELD(used)
  REFL_FIELD(context)
 REFL_END
 // REFL_TYPE(ggml_numa_node)
 //   REFL_FIELD(cpus)
@ -340,11 +357,70 @@ REFL_END
 // REFL_END
 REFL_TYPE(llama_model)
-//  REFL_FIELD(type)
+  REFL_FIELD(type)
-//  REFL_FIELD(arch)
+  REFL_FIELD(arch)
 REFL_FIELD(ftype )
 REFL_FIELD(  name )
  REFL_FIELD(   hparams )
 REFL_FIELD(    vocab)
 REFL_FIELD(   tok_embd)
 REFL_FIELD(   pos_embd)
 REFL_FIELD(   tok_norm)
 REFL_FIELD(   tok_norm_b)
 REFL_FIELD(   output_norm)
 REFL_FIELD(  output_norm_b)
 REFL_FIELD(  output)
 REFL_FIELD(  layers)
 REFL_FIELD(  n_gpu_layers)
 REFL_FIELD(  gguf_kv)
  REFL_FIELD( ctx)
  REFL_FIELD( buf)
  REFL_FIELD( mapping)
 REFL_FIELD( mlock_buf)
 REFL_FIELD( mlock_mmap)
 REFL_FIELD( tensors_by_name)
  REFL_FIELD( t_load_us)
 REFL_FIELD( t_start_us)
 REFL_END
 REFL_TYPE(llama_context)
 REFL_FIELD( cparams)
 //REFL_FIELD(model)
 REFL_FIELD(kv_self)
 REFL_FIELD(rng)
 REFL_FIELD(has_evaluated_once )
 REFL_FIELD( t_start_us)
 REFL_FIELD( t_load_us)
  REFL_FIELD( t_sample_us )
 REFL_FIELD( t_p_eval_us )
  REFL_FIELD( t_eval_us)
 REFL_FIELD( n_sample )
 REFL_FIELD( n_p_eval )
  REFL_FIELD( n_eval  )
 REFL_FIELD(  logits)
 REFL_FIELD(  logits_all )
 REFL_FIELD(  embedding)
 REFL_FIELD(   work_buffer)
  REFL_FIELD(   buf_compute)
  REFL_FIELD( buf_alloc)
 REFL_FIELD( alloc )
 #ifdef GGML_USE_METAL
 REFL_FIELD( ctx_metal )
 #endif
 #ifdef GGML_USE_MPI
 REFL_FIELD( ctx_mpi )
 #endif
 REFL_END
 // REFL_TYPE(llama_model_loader)
@ -459,7 +535,7 @@ void print_fields(const T& ) {
  //  T instance{};
  for_each(refl::reflect<T>().members, [&](auto member) {
-    std::cout << "MEMBER" <<     member.name.str() << "\n";
+    std::cout << "MEMBER:" <<     member.name.str() << "\n";
  });
@ -468,7 +544,8 @@ void print_fields(const T& ) {
       //if ((refl::descriptor::is_field(member)) && (!member.has_attribute<hidden>()))) {
       //if ((refl::descriptor::is_field(member))) {
 //             // Print the member name and value
-	 std::cout << member.name << ": " << "\n";
+	 std::cout
 	   << "Auto:" << member.name << ": " << "\n";
 	 //	 refl::get(member, obj)
 	 //}
     });