diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4f0a1684e..b54b96ec1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -184,8 +184,9 @@ int main(int argc, char ** argv) { g_model = &model; g_ctx = &ctx; - print_fields(g_model); - print_fields(g_ctx); + print_fields(*model); + print_fields(*ctx); + print_fields(*ctx_guidance); // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); @@ -488,7 +489,7 @@ int main(int argc, char ** argv) { std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - print_fields(ctx_sampling); + print_fields(*ctx_sampling); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -525,7 +526,7 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - print_fields(ctx); + print_fields(*ctx); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); diff --git a/ggml-alloc.cpp b/ggml-alloc.cpp index 4c0c914d7..6dc45efb5 100644 --- a/ggml-alloc.cpp +++ b/ggml-alloc.cpp @@ -8,9 +8,9 @@ #include #include #include - +#include "ggml-internal.hpp" #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MAX_FREE_BLOCKS 256 + //#define GGML_ALLOCATOR_DEBUG @@ -24,28 +24,7 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen return offset + align; } -struct free_block { - void * addr; - size_t size; -}; -struct ggml_tallocr { - struct ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; - size_t alignment; - - int n_free_blocks; - struct free_block free_blocks[MAX_FREE_BLOCKS]; - - size_t max_size; - - bool measure; - -#ifdef GGML_ALLOCATOR_DEBUG - struct ggml_tensor * allocated_tensors[1024]; -#endif -}; #ifdef GGML_ALLOCATOR_DEBUG static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { @@ -333,21 +312,6 @@ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { // graph allocator -struct hash_node { - int n_children; - int n_views; -}; - -struct ggml_gallocr { - ggml_tallocr_t talloc; - struct ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; -}; - ggml_gallocr_t ggml_gallocr_new(void) { ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); @@ -700,10 +664,6 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // legacy API wrapper -struct ggml_allocr { - ggml_tallocr_t talloc; - ggml_gallocr_t galloc; -}; static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); diff --git a/ggml-internal.hpp b/ggml-internal.hpp index b44826a37..29ae01198 100644 --- a/ggml-internal.hpp +++ b/ggml-internal.hpp @@ -38,56 +38,55 @@ struct ggml_context_container { } }; -typedef int ggml_lock_t; -typedef pthread_t ggml_thread_t; -typedef int ggml_lock_t; -typedef pthread_t ggml_thread_t; -typedef volatile LONG atomic_int; -typedef atomic_int atomic_bool; -typedef HANDLE pthread_t; -typedef DWORD thread_ret_t; -typedef void * thread_ret_t; typedef double ggml_float; +typedef void * thread_ret_t; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define 
ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) +#define MAX_FREE_BLOCKS 256 -#define GGML_LOCK_INITIALIZER 0 +struct free_block { + void * addr; + size_t size; +}; +struct ggml_tallocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * base; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join + size_t max_size; + bool measure; - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - - - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; #endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 +}; +struct hash_node { + int n_children; + int n_views; +}; -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - +typedef struct ggml_tallocr * ggml_tallocr_t; +typedef struct ggml_gallocr * ggml_gallocr_t; +struct ggml_gallocr { + ggml_tallocr_t talloc; + struct ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; +struct ggml_allocr { + ggml_tallocr_t talloc; + ggml_gallocr_t galloc; +}; diff --git a/ggml.cpp b/ggml.cpp index 65b0d11c3..53e312ac3 100644 --- a/ggml.cpp +++ b/ggml.cpp @@ -48,6 +48,8 @@ #include +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); @@ -15723,6 +15725,49 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { // //#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +//typedef pthread_spinlock_t ggml_lock_t; + +//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + // Android's libc implementation "bionic" does not support setting affinity #if defined(__linux__) && !defined(__BIONIC__) diff --git a/llama.cpp b/llama.cpp index 2d0d2b30f..675d147c8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -77,6 +77,8 @@ #include #include +#include "llama-internal.hpp" + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data 
#endif @@ -176,25 +178,11 @@ static std::string format(const char * fmt, ...) { return std::string(buf.data(), size); } + // // gguf constants (sync with gguf.py) // -enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, - LLM_ARCH_GPT2, - LLM_ARCH_GPTJ, - LLM_ARCH_GPTNEOX, - LLM_ARCH_MPT, - LLM_ARCH_STARCODER, - LLM_ARCH_PERSIMMON, - LLM_ARCH_REFACT, - LLM_ARCH_BLOOM, - LLM_ARCH_STABLELM, - LLM_ARCH_UNKNOWN, -}; static std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, @@ -211,55 +199,6 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_STABLELM, "stablelm" }, }; -enum llm_kv { - LLM_KV_GENERAL_ARCHITECTURE, - LLM_KV_GENERAL_QUANTIZATION_VERSION, - LLM_KV_GENERAL_ALIGNMENT, - LLM_KV_GENERAL_NAME, - LLM_KV_GENERAL_AUTHOR, - LLM_KV_GENERAL_URL, - LLM_KV_GENERAL_DESCRIPTION, - LLM_KV_GENERAL_LICENSE, - LLM_KV_GENERAL_SOURCE_URL, - LLM_KV_GENERAL_SOURCE_HF_REPO, - - LLM_KV_CONTEXT_LENGTH, - LLM_KV_EMBEDDING_LENGTH, - LLM_KV_BLOCK_COUNT, - LLM_KV_FEED_FORWARD_LENGTH, - LLM_KV_USE_PARALLEL_RESIDUAL, - LLM_KV_TENSOR_DATA_LAYOUT, - - LLM_KV_ATTENTION_HEAD_COUNT, - LLM_KV_ATTENTION_HEAD_COUNT_KV, - LLM_KV_ATTENTION_MAX_ALIBI_BIAS, - LLM_KV_ATTENTION_CLAMP_KQV, - LLM_KV_ATTENTION_LAYERNORM_EPS, - LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, - - LLM_KV_ROPE_DIMENSION_COUNT, - LLM_KV_ROPE_FREQ_BASE, - LLM_KV_ROPE_SCALE_LINEAR, - LLM_KV_ROPE_SCALING_TYPE, - LLM_KV_ROPE_SCALING_FACTOR, - LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, - LLM_KV_ROPE_SCALING_FINETUNED, - - LLM_KV_TOKENIZER_MODEL, - LLM_KV_TOKENIZER_LIST, - LLM_KV_TOKENIZER_TOKEN_TYPE, - LLM_KV_TOKENIZER_SCORES, - LLM_KV_TOKENIZER_MERGES, - LLM_KV_TOKENIZER_BOS_ID, - LLM_KV_TOKENIZER_EOS_ID, - LLM_KV_TOKENIZER_UNK_ID, - LLM_KV_TOKENIZER_SEP_ID, - LLM_KV_TOKENIZER_PAD_ID, - LLM_KV_TOKENIZER_ADD_BOS, - LLM_KV_TOKENIZER_ADD_EOS, - LLM_KV_TOKENIZER_HF_JSON, - LLM_KV_TOKENIZER_RWKV, -}; static std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, @@ -311,38 +250,6 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; -struct LLM_KV { - LLM_KV(llm_arch arch) : arch(arch) {} - - llm_arch arch; - - std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); - } -}; - -enum llm_tensor { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_TOKEN_EMBD_NORM, - LLM_TENSOR_POS_EMBD, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_ROPE_FREQS, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_NORM_2, - LLM_TENSOR_ATTN_ROT_EMBD, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K_NORM, -}; static std::map> LLM_TENSOR_NAMES = { { @@ -547,27 +454,6 @@ static llm_arch llm_arch_from_string(const std::string & name) { // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias" // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight" // -struct LLM_TN { - LLM_TN(llm_arch arch) : arch(arch) {} - - llm_arch arch; - - std::string operator()(llm_tensor tensor) const { - return LLM_TENSOR_NAMES[arch].at(tensor); - } - - std::string operator()(llm_tensor tensor, const std::string & suffix) const { - return LLM_TENSOR_NAMES[arch].at(tensor) + "." 
+ suffix; - } - - std::string operator()(llm_tensor tensor, int bid) const { - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); - } - - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; - } -}; // // gguf helpers @@ -723,15 +609,9 @@ static std::string llama_format_win_err(DWORD err) { } #endif -struct llama_buffer { - void * data = NULL; - size_t size = 0; +//struct llama_buffer { - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { +void llama_buffer::resize(size_t n) { llama_host_free(data); data = llama_host_malloc(n); @@ -746,7 +626,7 @@ struct llama_buffer { size = n; } - ~llama_buffer() { +llama_buffer::~llama_buffer() { if (data) { if (fallback) { // NOLINT free(data); @@ -757,7 +637,7 @@ struct llama_buffer { data = NULL; } -}; + struct llama_file { // use FILE * so we don't have to re-open the file to mmap @@ -835,16 +715,11 @@ struct llama_file { } }; -struct llama_mmap { - void * addr; - size_t size; +// - llama_mmap(const llama_mmap &) = delete; #ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch , bool numa ) { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; @@ -875,13 +750,12 @@ struct llama_mmap { } } - ~llama_mmap() { +llama_mmap::~llama_mmap() { munmap(addr, size); } #elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch = 1, bool numa = false) { (void) numa; size = file->size; @@ -903,7 +777,7 @@ struct llama_mmap { throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } - if (prefetch) { + if (prefetch == 1) { // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); @@ -924,7 +798,7 @@ struct llama_mmap { } } - ~llama_mmap() { +llama_mmap::~llama_mmap() { if (!UnmapViewOfFile(addr)) { fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", llama_format_win_err(GetLastError()).c_str()); @@ -941,31 +815,27 @@ struct llama_mmap { throw std::runtime_error(std::string("mmap not supported")); } #endif -}; + // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. 
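// ---------------------------------------------------------------------------
// A sketch for illustration (not part of this patch): the hunks above and
// below move the bodies of the llama_mmap / llama_mlock members out of the
// structs and into llama.cpp, while the declarations live in the new
// llama-internal.hpp, whose contents are not shown in this diff. The minimal,
// self-contained example below uses a hypothetical llama_mlock_sketch type
// and only the POSIX branch; note that `static` and any default arguments
// belong on the in-class declaration only, so the class-qualified out-of-line
// definitions must omit them.

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <cerrno>
#include <sys/mman.h>   // mlock / munlock
#include <unistd.h>     // sysconf

// What a header such as llama-internal.hpp is assumed to declare:
struct llama_mlock_sketch {
    void * addr = nullptr;
    size_t size = 0;

    static size_t lock_granularity();                      // 'static' lives here
    static void   raw_unlock(void * addr, size_t size);
    bool          raw_lock(const void * addr, size_t size) const;
};

// What the .cpp file then defines: class-qualified, without repeating 'static'.
size_t llama_mlock_sketch::lock_granularity() {
    return (size_t) sysconf(_SC_PAGESIZE);
}

void llama_mlock_sketch::raw_unlock(void * addr, size_t size) {
    if (munlock(addr, size)) {
        fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
    }
}

bool llama_mlock_sketch::raw_lock(const void * addr, size_t size) const {
    return mlock(addr, size) == 0;   // true on success, mirroring the real wrapper
}
// ---------------------------------------------------------------------------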
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
+// llama_mlock

-    bool failed_already = false;

-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
+llama_mlock::llama_mlock() {}

-    ~llama_mlock() {
+llama_mlock::~llama_mlock() {
         if (size) {
             raw_unlock(addr, size);
         }
     }

-    void init(void * ptr) {
+void llama_mlock::init(void * ptr) {
         GGML_ASSERT(addr == NULL && size == 0); // NOLINT
         addr = ptr;
     }

-    void grow_to(size_t target_size) {
+void llama_mlock::grow_to(size_t target_size) {
         GGML_ASSERT(addr);
         if (failed_already) {
             return;
@@ -982,9 +852,8 @@ struct llama_mlock {
     }

 #ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    static size_t lock_granularity() {
+size_t llama_mlock::lock_granularity() {
         return (size_t) sysconf(_SC_PAGESIZE);
     }

@@ -997,7 +866,7 @@
             "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
 #endif

-    bool raw_lock(const void * addr, size_t size) const {
+    bool llama_mlock::raw_lock(const void * addr, size_t size) const {
         if (!mlock(addr, size)) {
             return true;
         }
@@ -1021,21 +890,21 @@
 #undef MLOCK_SUGGESTION

-    static void raw_unlock(void * addr, size_t size) {
+    void llama_mlock::raw_unlock(void * addr, size_t size) {
         if (munlock(addr, size)) {
             fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
         }
     }
 #elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
+
-    static size_t lock_granularity() {
+    size_t llama_mlock::lock_granularity() {
         SYSTEM_INFO si;
         GetSystemInfo(&si);
         return (size_t) si.dwPageSize;
     }

-    bool raw_lock(void * ptr, size_t len) const {
+    bool llama_mlock::raw_lock(void * ptr, size_t len) const {
         for (int tries = 1; ; tries++) {
             if (VirtualLock(ptr, len)) {
                 return true;
@@ -1070,27 +939,26 @@ struct llama_mlock {
         }
     }

-    static void raw_unlock(void * ptr, size_t len) {
+    void llama_mlock::raw_unlock(void * ptr, size_t len) {
         if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
 #else
-    static constexpr bool SUPPORTED = false;
-
-    static size_t lock_granularity() {
+
+    size_t llama_mlock::lock_granularity() {
         return (size_t) 65536;
     }

-    bool raw_lock(const void * addr, size_t len) const {
+    bool llama_mlock::raw_lock(const void * addr, size_t len) const {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }

-    static void raw_unlock(const void * addr, size_t len) {}
+    void llama_mlock::raw_unlock(const void * addr, size_t len) {}
 #endif
-};
+

 typedef void (*offload_func_t)(struct ggml_tensor * tensor);

@@ -1125,364 +993,12 @@ struct llama_state {
 static llama_state g_state;

-// available llama models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_1B,
-    MODEL_3B,
-    MODEL_7B,
-    MODEL_8B,
-    MODEL_13B,
-    MODEL_15B,
-    MODEL_30B,
-    MODEL_34B,
-    MODEL_40B,
-    MODEL_65B,
-    MODEL_70B,
-};

 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;

-struct llama_hparams {
-    bool     vocab_only;
-    uint32_t n_vocab;
-    uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_embd;
-    uint32_t n_head;
-    uint32_t n_head_kv;
-    uint32_t n_layer;
-    uint32_t n_rot;
-    uint32_t n_ff;
-    float    f_norm_eps;
-    float    f_norm_rms_eps;
-
-    float    rope_freq_base_train;
-    float    rope_freq_scale_train;
-    uint32_t n_yarn_orig_ctx;
-    int8_t   rope_scaling_type_train : 3;
-    bool     rope_finetuned : 1;
-
-    float f_clamp_kqv;
-    float f_max_alibi_bias;
-
-    bool operator!=(const llama_hparams &
other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; - if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; - - const float EPSILON = 1e-9; - - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - - return false; - } - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } -}; - -struct llama_cparams { - uint32_t n_ctx; // context size used during inference - uint32_t n_batch; - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing - - float rope_freq_base; - float rope_freq_scale; - - uint32_t n_yarn_orig_ctx; - // These hyperparameters are not exposed in GGUF, because all - // existing YaRN models use the same values for them. - float yarn_ext_factor; - float yarn_attn_factor; - float yarn_beta_fast; - float yarn_beta_slow; - - bool mul_mat_q; -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attn_norm; - struct ggml_tensor * attn_norm_b; - struct ggml_tensor * attn_norm_2; - struct ggml_tensor * attn_norm_2_b; - struct ggml_tensor * attn_q_norm; - struct ggml_tensor * attn_q_norm_b; - struct ggml_tensor * attn_k_norm; - struct ggml_tensor * attn_k_norm_b; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - struct ggml_tensor * wqkv; - - // attention bias - struct ggml_tensor * bo; - struct ggml_tensor * bqkv; - - // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 - - // ff bias - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 -}; - -struct llama_kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } -}; - -// ring-buffer of cached KV data -struct llama_kv_cache { - bool has_shift = false; - - // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_internal also uses it, so it - // cannot be freely changed after a slot has been allocated. 
- uint32_t head = 0; - uint32_t size = 0; - - // computed before each graph build - uint32_t n = 0; - - std::vector cells; - - struct ggml_tensor * k = NULL; - struct ggml_tensor * v = NULL; - - struct ggml_context * ctx = NULL; - - llama_buffer buf; - - ~llama_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - ggml_cuda_free_data(k); - ggml_cuda_free_data(v); - } -#endif - } -}; - -struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - - struct token_data { - token text; - float score; - ttype type; - }; - - enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; - - std::unordered_map token_to_id; - std::vector id_to_token; - - std::unordered_map special_tokens_cache; - - std::map, int> bpe_ranks; - - // default LLaMA special tokens - id special_bos_id = 1; - id special_eos_id = 2; - id special_unk_id = 0; - id special_sep_id = -1; - id special_pad_id = -1; - - int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. - int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. - - id linefeed_id = 13; - id special_prefix_id = 32007; - id special_middle_id = 32009; - id special_suffix_id = 32008; - id special_eot_id = 32010; - - int find_bpe_rank(std::string token_left, std::string token_right) const { - GGML_ASSERT(token_left.find(" ") == std::string::npos); - GGML_ASSERT(token_left.find("\n") == std::string::npos); - GGML_ASSERT(token_right.find(" ") == std::string::npos); - GGML_ASSERT(token_right.find("\n") == std::string::npos); - - auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); - if (it == bpe_ranks.end()) { - return -1; - } - - return it->second; - } -}; - -struct llama_model { - e_model type = MODEL_UNKNOWN; - llm_arch arch = LLM_ARCH_UNKNOWN; - llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - - std::string name = "n/a"; - - llama_hparams hparams = {}; - llama_vocab vocab; - - struct ggml_tensor * tok_embd; - struct ggml_tensor * pos_embd; - struct ggml_tensor * tok_norm; - struct ggml_tensor * tok_norm_b; - - struct ggml_tensor * output_norm; - struct ggml_tensor * output_norm_b; - struct ggml_tensor * output; - - std::vector layers; - - int n_gpu_layers; - - // gguf metadata - std::unordered_map gguf_kv; - - // context - struct ggml_context * ctx = NULL; - - // the model memory buffer - llama_buffer buf; - - // model memory mapped file - std::unique_ptr mapping; - - // objects representing data potentially being locked in memory - llama_mlock mlock_buf; - llama_mlock mlock_mmap; - - // for quantize-stats only - std::vector> tensors_by_name; - - int64_t t_load_us = 0; - int64_t t_start_us = 0; - - ~llama_model() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); - } -#endif - -#if defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - } -}; - -struct llama_context { - llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} - ~llama_context() { -#ifdef GGML_USE_METAL - if (ctx_metal) { - ggml_metal_free(ctx_metal); - } -#endif - if (alloc) { - ggml_allocr_free(alloc); - } - } - - llama_cparams cparams; - - const llama_model & model; - - // key + value cache for the self attention - struct llama_kv_cache kv_self; - - std::mt19937 rng; - 
- bool has_evaluated_once = false; - - int64_t t_start_us; - int64_t t_load_us; - int64_t t_sample_us = 0; - int64_t t_p_eval_us = 0; - int64_t t_eval_us = 0; - - int32_t n_sample = 0; // number of tokens sampled - int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - int32_t n_eval = 0; // number of eval calls - - // decode output (2-dimensional array: [n_tokens][n_vocab]) - std::vector logits; - bool logits_all = false; - - // input embedding (1-dimensional array: [n_embd]) - std::vector embedding; - - // reusable buffer for `struct ggml_graph_plan.work_data` - std::vector work_buffer; - - // memory buffers used to evaluate the model - llama_buffer buf_compute; - - llama_buffer buf_alloc; - ggml_allocr * alloc = NULL; - -#ifdef GGML_USE_METAL - ggml_metal_context * ctx_metal = NULL; -#endif - -#ifdef GGML_USE_MPI - ggml_mpi_context * ctx_mpi = NULL; -#endif -}; // // kv cache helpers @@ -1731,11 +1247,6 @@ static void llama_kv_cache_seq_shift( // model loading and saving // -enum llama_fver { - GGUF_FILE_VERSION_V1 = 1, - GGUF_FILE_VERSION_V2 = 2, - GGUF_FILE_VERSION_V3 = 3, -}; static const char * llama_file_version_name(llama_fver version) { switch (version) { @@ -6328,6 +5839,32 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } +// struct + +bool llama_hparams::operator!=(const llama_hparams & other) const { + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + + const float EPSILON = 1e-9; + + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + + return false; + } + + static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { std::vector output; @@ -9713,3 +9250,40 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, fputs(text, stderr); fflush(stderr); } + + +// LLM_TN +LLM_TN::LLM_TN(llm_arch arch) : arch(arch) {} + + +std::string LLM_TN::operator()(llm_tensor tensor) const { + return LLM_TENSOR_NAMES[arch].at(tensor); + } + + std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix) const { + return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; + } + + std::string LLM_TN::operator()(llm_tensor tensor, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); + } + + std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." 
+ suffix; + } + +std::string LLM_KV::operator()(llm_kv kv) const { + return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); +} + + +llama_context::~llama_context() { +#ifdef GGML_USE_METAL + if (ctx_metal) { + ggml_metal_free(ctx_metal); + } +#endif + if (alloc) { + ggml_allocr_free(alloc); + } + } diff --git a/print.hpp b/print.hpp index 83ae1e10b..40dcdd802 100644 --- a/print.hpp +++ b/print.hpp @@ -3,6 +3,8 @@ #include //#include #include "llama.h" +#include "ggml-internal.hpp" +#include "llama-internal.hpp" REFL_TYPE(ggml_init_params ) REFL_END @@ -92,6 +94,12 @@ REFL_END REFL_TYPE(llama_sampling_context ) +REFL_FIELD( params) +REFL_FIELD( mirostat_mu) +REFL_FIELD( grammar) +REFL_FIELD( parsed_grammar) +REFL_FIELD( prev) +REFL_FIELD( cur) REFL_END REFL_TYPE(llama_token_data ) @@ -148,15 +156,24 @@ REFL_TYPE(ggml_something) REFL_FIELD(type_name) REFL_END -// REFL_TYPE(ggml_context) -// REFL_FIELD(mem_size) -// REFL_FIELD(mem_buffer) -// REFL_END +REFL_TYPE(ggml_context) + REFL_FIELD(mem_size) +REFL_FIELD(mem_buffer) +REFL_FIELD(mem_buffer_owned) +REFL_FIELD( no_alloc) +REFL_FIELD( no_alloc_save) +REFL_FIELD( n_objects) +REFL_FIELD( objects_begin) +REFL_FIELD( objects_end) +REFL_FIELD( scratch) +REFL_FIELD( scratch_save) -//REFL_TYPE(ggml_context_container) -// REFL_FIELD(used) -// REFL_FIELD(context) -//REFL_END +REFL_END + +REFL_TYPE(ggml_context_container) + REFL_FIELD(used) + REFL_FIELD(context) +REFL_END // REFL_TYPE(ggml_numa_node) // REFL_FIELD(cpus) @@ -340,11 +357,70 @@ REFL_END // REFL_END REFL_TYPE(llama_model) -// REFL_FIELD(type) -// REFL_FIELD(arch) + REFL_FIELD(type) + REFL_FIELD(arch) +REFL_FIELD(ftype ) + +REFL_FIELD( name ) + + REFL_FIELD( hparams ) +REFL_FIELD( vocab) + +REFL_FIELD( tok_embd) +REFL_FIELD( pos_embd) +REFL_FIELD( tok_norm) +REFL_FIELD( tok_norm_b) + +REFL_FIELD( output_norm) +REFL_FIELD( output_norm_b) +REFL_FIELD( output) + +REFL_FIELD( layers) + +REFL_FIELD( n_gpu_layers) + +REFL_FIELD( gguf_kv) + REFL_FIELD( ctx) + REFL_FIELD( buf) + REFL_FIELD( mapping) +REFL_FIELD( mlock_buf) +REFL_FIELD( mlock_mmap) +REFL_FIELD( tensors_by_name) + REFL_FIELD( t_load_us) +REFL_FIELD( t_start_us) + REFL_END REFL_TYPE(llama_context) +REFL_FIELD( cparams) +//REFL_FIELD(model) +REFL_FIELD(kv_self) +REFL_FIELD(rng) +REFL_FIELD(has_evaluated_once ) +REFL_FIELD( t_start_us) +REFL_FIELD( t_load_us) + REFL_FIELD( t_sample_us ) +REFL_FIELD( t_p_eval_us ) + REFL_FIELD( t_eval_us) +REFL_FIELD( n_sample ) +REFL_FIELD( n_p_eval ) + REFL_FIELD( n_eval ) +REFL_FIELD( logits) +REFL_FIELD( logits_all ) +REFL_FIELD( embedding) +REFL_FIELD( work_buffer) + REFL_FIELD( buf_compute) + REFL_FIELD( buf_alloc) +REFL_FIELD( alloc ) + +#ifdef GGML_USE_METAL +REFL_FIELD( ctx_metal ) +#endif + +#ifdef GGML_USE_MPI +REFL_FIELD( ctx_mpi ) + +#endif REFL_END // REFL_TYPE(llama_model_loader) @@ -459,7 +535,7 @@ void print_fields(const T& ) { // T instance{}; for_each(refl::reflect().members, [&](auto member) { - std::cout << "MEMBER" << member.name.str() << "\n"; + std::cout << "MEMBER:" << member.name.str() << "\n"; }); @@ -468,7 +544,8 @@ void print_fields(const T& ) { //if ((refl::descriptor::is_field(member)) && (!member.has_attribute()))) { //if ((refl::descriptor::is_field(member))) { // // Print the member name and value - std::cout << member.name << ": " << "\n"; + std::cout + << "Auto:" << member.name << ": " << "\n"; // refl::get(member, obj) //} });
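// ---------------------------------------------------------------------------
// A possible follow-up for print.hpp (a sketch, assuming the refl-cpp single
// header is available as "refl.hpp" and that the REFL_TYPE/REFL_FIELD
// registrations above are in place): print member values as well as names,
// but only for field members whose type is trivially printable. The helper
// name print_fields_with_values is hypothetical; the patch itself only prints
// the member names.

#include <iostream>
#include <string>
#include <type_traits>
#include "refl.hpp"

template <typename T>
void print_fields_with_values(const T & obj) {
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        if constexpr (refl::descriptor::is_field(member)) {
            using value_t = std::decay_t<decltype(member(obj))>;
            if constexpr (std::is_arithmetic_v<value_t> ||
                          std::is_same_v<value_t, std::string>) {
                // field with a printable type: emit name and value
                std::cout << member.name.str() << " = " << member(obj) << "\n";
                return;
            }
        }
        // everything else: fall back to the name-only output of print_fields
        std::cout << member.name.str() << "\n";
    });
}

// Usage would mirror the existing calls in examples/main/main.cpp, e.g.
// print_fields_with_values(*ctx_sampling);
// ---------------------------------------------------------------------------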