diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4f0a1684e..b54b96ec1 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -184,8 +184,9 @@ int main(int argc, char ** argv) { g_model = &model; g_ctx = &ctx; - print_fields(g_model); - print_fields(g_ctx); + print_fields(*model); + print_fields(*ctx); + print_fields(*ctx_guidance); // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); @@ -488,7 +489,7 @@ int main(int argc, char ** argv) { std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - print_fields(ctx_sampling); + print_fields(*ctx_sampling); while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -525,7 +526,7 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - print_fields(ctx); + print_fields(*ctx); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); diff --git a/ggml-alloc.cpp b/ggml-alloc.cpp index 4c0c914d7..6dc45efb5 100644 --- a/ggml-alloc.cpp +++ b/ggml-alloc.cpp @@ -8,9 +8,9 @@ #include #include #include - +#include "ggml-internal.hpp" #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define MAX_FREE_BLOCKS 256 + //#define GGML_ALLOCATOR_DEBUG @@ -24,28 +24,7 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen return offset + align; } -struct free_block { - void * addr; - size_t size; -}; -struct ggml_tallocr { - struct ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; - size_t alignment; - - int n_free_blocks; - struct free_block free_blocks[MAX_FREE_BLOCKS]; - - size_t max_size; - - bool measure; - -#ifdef GGML_ALLOCATOR_DEBUG - struct ggml_tensor * allocated_tensors[1024]; -#endif -}; #ifdef GGML_ALLOCATOR_DEBUG static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { @@ -333,21 +312,6 @@ size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { // graph allocator -struct hash_node { - int n_children; - int n_views; -}; - -struct ggml_gallocr { - ggml_tallocr_t talloc; - struct ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; -}; - ggml_gallocr_t ggml_gallocr_new(void) { ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); @@ -700,10 +664,6 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap // legacy API wrapper -struct ggml_allocr { - ggml_tallocr_t talloc; - ggml_gallocr_t galloc; -}; static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); diff --git a/ggml-internal.hpp b/ggml-internal.hpp index b44826a37..29ae01198 100644 --- a/ggml-internal.hpp +++ b/ggml-internal.hpp @@ -38,56 +38,55 @@ struct ggml_context_container { } }; -typedef int ggml_lock_t; -typedef pthread_t ggml_thread_t; -typedef int ggml_lock_t; -typedef pthread_t ggml_thread_t; -typedef volatile LONG atomic_int; -typedef atomic_int atomic_bool; -typedef HANDLE pthread_t; -typedef DWORD thread_ret_t; -typedef void * thread_ret_t; typedef double ggml_float; +typedef void * thread_ret_t; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define 
ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) +#define MAX_FREE_BLOCKS 256 -#define GGML_LOCK_INITIALIZER 0 +struct free_block { + void * addr; + size_t size; +}; +struct ggml_tallocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; + void * base; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join + size_t max_size; + bool measure; - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - - - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; #endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 +}; +struct hash_node { + int n_children; + int n_views; +}; -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - +typedef struct ggml_tallocr * ggml_tallocr_t; +typedef struct ggml_gallocr * ggml_gallocr_t; +struct ggml_gallocr { + ggml_tallocr_t talloc; + struct ggml_hash_set hash_set; + struct hash_node * hash_values; + size_t hash_values_size; + ggml_tallocr_t * hash_allocs; + int * parse_seq; + int parse_seq_len; +}; +struct ggml_allocr { + ggml_tallocr_t talloc; + ggml_gallocr_t galloc; +}; diff --git a/ggml.cpp b/ggml.cpp index 65b0d11c3..53e312ac3 100644 --- a/ggml.cpp +++ b/ggml.cpp @@ -48,6 +48,8 @@ #include +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); @@ -15723,6 +15725,49 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { // //#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +//typedef pthread_spinlock_t ggml_lock_t; + +//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock + +typedef int ggml_lock_t; + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 + +typedef pthread_t ggml_thread_t; + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + // Android's libc implementation "bionic" does not support setting affinity #if defined(__linux__) && !defined(__BIONIC__) diff --git a/llama.cpp b/llama.cpp index 2d0d2b30f..675d147c8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -77,6 +77,8 @@ #include #include +#include "llama-internal.hpp" + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data 
#endif @@ -176,25 +178,11 @@ static std::string format(const char * fmt, ...) { return std::string(buf.data(), size); } + // // gguf constants (sync with gguf.py) // -enum llm_arch { - LLM_ARCH_LLAMA, - LLM_ARCH_FALCON, - LLM_ARCH_BAICHUAN, - LLM_ARCH_GPT2, - LLM_ARCH_GPTJ, - LLM_ARCH_GPTNEOX, - LLM_ARCH_MPT, - LLM_ARCH_STARCODER, - LLM_ARCH_PERSIMMON, - LLM_ARCH_REFACT, - LLM_ARCH_BLOOM, - LLM_ARCH_STABLELM, - LLM_ARCH_UNKNOWN, -}; static std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, @@ -211,55 +199,6 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_STABLELM, "stablelm" }, }; -enum llm_kv { - LLM_KV_GENERAL_ARCHITECTURE, - LLM_KV_GENERAL_QUANTIZATION_VERSION, - LLM_KV_GENERAL_ALIGNMENT, - LLM_KV_GENERAL_NAME, - LLM_KV_GENERAL_AUTHOR, - LLM_KV_GENERAL_URL, - LLM_KV_GENERAL_DESCRIPTION, - LLM_KV_GENERAL_LICENSE, - LLM_KV_GENERAL_SOURCE_URL, - LLM_KV_GENERAL_SOURCE_HF_REPO, - - LLM_KV_CONTEXT_LENGTH, - LLM_KV_EMBEDDING_LENGTH, - LLM_KV_BLOCK_COUNT, - LLM_KV_FEED_FORWARD_LENGTH, - LLM_KV_USE_PARALLEL_RESIDUAL, - LLM_KV_TENSOR_DATA_LAYOUT, - - LLM_KV_ATTENTION_HEAD_COUNT, - LLM_KV_ATTENTION_HEAD_COUNT_KV, - LLM_KV_ATTENTION_MAX_ALIBI_BIAS, - LLM_KV_ATTENTION_CLAMP_KQV, - LLM_KV_ATTENTION_LAYERNORM_EPS, - LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, - - LLM_KV_ROPE_DIMENSION_COUNT, - LLM_KV_ROPE_FREQ_BASE, - LLM_KV_ROPE_SCALE_LINEAR, - LLM_KV_ROPE_SCALING_TYPE, - LLM_KV_ROPE_SCALING_FACTOR, - LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, - LLM_KV_ROPE_SCALING_FINETUNED, - - LLM_KV_TOKENIZER_MODEL, - LLM_KV_TOKENIZER_LIST, - LLM_KV_TOKENIZER_TOKEN_TYPE, - LLM_KV_TOKENIZER_SCORES, - LLM_KV_TOKENIZER_MERGES, - LLM_KV_TOKENIZER_BOS_ID, - LLM_KV_TOKENIZER_EOS_ID, - LLM_KV_TOKENIZER_UNK_ID, - LLM_KV_TOKENIZER_SEP_ID, - LLM_KV_TOKENIZER_PAD_ID, - LLM_KV_TOKENIZER_ADD_BOS, - LLM_KV_TOKENIZER_ADD_EOS, - LLM_KV_TOKENIZER_HF_JSON, - LLM_KV_TOKENIZER_RWKV, -}; static std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, @@ -311,38 +250,6 @@ static std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, }; -struct LLM_KV { - LLM_KV(llm_arch arch) : arch(arch) {} - - llm_arch arch; - - std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); - } -}; - -enum llm_tensor { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_TOKEN_EMBD_NORM, - LLM_TENSOR_POS_EMBD, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_ROPE_FREQS, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_NORM_2, - LLM_TENSOR_ATTN_ROT_EMBD, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K_NORM, -}; static std::map> LLM_TENSOR_NAMES = { { @@ -547,27 +454,6 @@ static llm_arch llm_arch_from_string(const std::string & name) { // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias" // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight" // -struct LLM_TN { - LLM_TN(llm_arch arch) : arch(arch) {} - - llm_arch arch; - - std::string operator()(llm_tensor tensor) const { - return LLM_TENSOR_NAMES[arch].at(tensor); - } - - std::string operator()(llm_tensor tensor, const std::string & suffix) const { - return LLM_TENSOR_NAMES[arch].at(tensor) + "." 
+ suffix; - } - - std::string operator()(llm_tensor tensor, int bid) const { - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); - } - - std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; - } -}; // // gguf helpers @@ -723,15 +609,9 @@ static std::string llama_format_win_err(DWORD err) { } #endif -struct llama_buffer { - void * data = NULL; - size_t size = 0; +//struct llama_buffer { - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { +void llama_buffer::resize(size_t n) { llama_host_free(data); data = llama_host_malloc(n); @@ -746,7 +626,7 @@ struct llama_buffer { size = n; } - ~llama_buffer() { +llama_buffer::~llama_buffer() { if (data) { if (fallback) { // NOLINT free(data); @@ -757,7 +637,7 @@ struct llama_buffer { data = NULL; } -}; + struct llama_file { // use FILE * so we don't have to re-open the file to mmap @@ -835,16 +715,11 @@ struct llama_file { } }; -struct llama_mmap { - void * addr; - size_t size; +// - llama_mmap(const llama_mmap &) = delete; #ifdef _POSIX_MAPPED_FILES - static constexpr bool SUPPORTED = true; - - llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch , bool numa ) { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; @@ -875,13 +750,12 @@ struct llama_mmap { } } - ~llama_mmap() { +llama_mmap::~llama_mmap() { munmap(addr, size); } #elif defined(_WIN32) - static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { +llama_mmap::llama_mmap(struct llama_file * file, size_t prefetch = 1, bool numa = false) { (void) numa; size = file->size; @@ -903,7 +777,7 @@ struct llama_mmap { throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } - if (prefetch) { + if (prefetch == 1) { // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); @@ -924,7 +798,7 @@ struct llama_mmap { } } - ~llama_mmap() { +llama_mmap::~llama_mmap() { if (!UnmapViewOfFile(addr)) { fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", llama_format_win_err(GetLastError()).c_str()); @@ -941,31 +815,27 @@ struct llama_mmap { throw std::runtime_error(std::string("mmap not supported")); } #endif -}; + // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. 
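// ---------------------------------------------------------------------------
// A sketch for illustration (not part of this patch): the hunks above and
// below move the bodies of the llama_mmap / llama_mlock members out of the
// structs and into llama.cpp, while the declarations live in the new
// llama-internal.hpp, whose contents are not shown in this diff. The minimal,
// self-contained example below uses a hypothetical llama_mlock_sketch type
// and only the POSIX branch; note that `static` and any default arguments
// belong on the in-class declaration only, so the class-qualified out-of-line
// definitions must omit them.

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <cerrno>
#include <sys/mman.h>   // mlock / munlock
#include <unistd.h>     // sysconf

// What a header such as llama-internal.hpp is assumed to declare:
struct llama_mlock_sketch {
    void * addr = nullptr;
    size_t size = 0;

    static size_t lock_granularity();                      // 'static' lives here
    static void   raw_unlock(void * addr, size_t size);
    bool          raw_lock(const void * addr, size_t size) const;
};

// What the .cpp file then defines: class-qualified, without repeating 'static'.
size_t llama_mlock_sketch::lock_granularity() {
    return (size_t) sysconf(_SC_PAGESIZE);
}

void llama_mlock_sketch::raw_unlock(void * addr, size_t size) {
    if (munlock(addr, size)) {
        fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
    }
}

bool llama_mlock_sketch::raw_lock(const void * addr, size_t size) const {
    return mlock(addr, size) == 0;   // true on success, mirroring the real wrapper
}
// ---------------------------------------------------------------------------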
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
+// llama_mlock

-    bool failed_already = false;

-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
+llama_mlock::llama_mlock() {}

-    ~llama_mlock() {
+llama_mlock::~llama_mlock() {
         if (size) {
             raw_unlock(addr, size);
         }
     }

-    void init(void * ptr) {
+void llama_mlock::init(void * ptr) {
         GGML_ASSERT(addr == NULL && size == 0); // NOLINT
         addr = ptr;
     }

-    void grow_to(size_t target_size) {
+void llama_mlock::grow_to(size_t target_size) {
         GGML_ASSERT(addr);
         if (failed_already) {
             return;
@@ -982,9 +852,8 @@ struct llama_mlock {
     }

 #ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    static size_t lock_granularity() {
+size_t llama_mlock::lock_granularity() {
         return (size_t) sysconf(_SC_PAGESIZE);
     }

@@ -997,7 +866,7 @@
             "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
 #endif

-    bool raw_lock(const void * addr, size_t size) const {
+    bool llama_mlock::raw_lock(const void * addr, size_t size) const {
         if (!mlock(addr, size)) {
             return true;
         }
@@ -1021,21 +890,21 @@
 #undef MLOCK_SUGGESTION

-    static void raw_unlock(void * addr, size_t size) {
+    void llama_mlock::raw_unlock(void * addr, size_t size) {
         if (munlock(addr, size)) {
             fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
         }
     }
 #elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
+
-    static size_t lock_granularity() {
+    size_t llama_mlock::lock_granularity() {
         SYSTEM_INFO si;
         GetSystemInfo(&si);
         return (size_t) si.dwPageSize;
     }

-    bool raw_lock(void * ptr, size_t len) const {
+    bool llama_mlock::raw_lock(void * ptr, size_t len) const {
         for (int tries = 1; ; tries++) {
             if (VirtualLock(ptr, len)) {
                 return true;
@@ -1070,27 +939,26 @@ struct llama_mlock {
         }
     }

-    static void raw_unlock(void * ptr, size_t len) {
+    void llama_mlock::raw_unlock(void * ptr, size_t len) {
         if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
     }
 #else
-    static constexpr bool SUPPORTED = false;
-
-    static size_t lock_granularity() {
+
+    size_t llama_mlock::lock_granularity() {
         return (size_t) 65536;
     }

-    bool raw_lock(const void * addr, size_t len) const {
+    bool llama_mlock::raw_lock(const void * addr, size_t len) const {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }

-    static void raw_unlock(const void * addr, size_t len) {}
+    void llama_mlock::raw_unlock(const void * addr, size_t len) {}
 #endif
-};
+

 typedef void (*offload_func_t)(struct ggml_tensor * tensor);

@@ -1125,364 +993,12 @@ struct llama_state {
 static llama_state g_state;

-// available llama models
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_1B,
-    MODEL_3B,
-    MODEL_7B,
-    MODEL_8B,
-    MODEL_13B,
-    MODEL_15B,
-    MODEL_30B,
-    MODEL_34B,
-    MODEL_40B,
-    MODEL_65B,
-    MODEL_70B,
-};

 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;

-struct llama_hparams {
-    bool     vocab_only;
-    uint32_t n_vocab;
-    uint32_t n_ctx_train; // context size the model was trained on
-    uint32_t n_embd;
-    uint32_t n_head;
-    uint32_t n_head_kv;
-    uint32_t n_layer;
-    uint32_t n_rot;
-    uint32_t n_ff;
-    float    f_norm_eps;
-    float    f_norm_rms_eps;
-
-    float    rope_freq_base_train;
-    float    rope_freq_scale_train;
-    uint32_t n_yarn_orig_ctx;
-    int8_t   rope_scaling_type_train : 3;
-    bool     rope_finetuned : 1;
-
-    float f_clamp_kqv;
-    float f_max_alibi_bias;
-
-    bool operator!=(const llama_hparams &
other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_head != other.n_head) return true; - if (this->n_head_kv != other.n_head_kv) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_ff != other.n_ff) return true; - if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; - - const float EPSILON = 1e-9; - - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - - return false; - } - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } -}; - -struct llama_cparams { - uint32_t n_ctx; // context size used during inference - uint32_t n_batch; - uint32_t n_threads; // number of threads to use for generation - uint32_t n_threads_batch; // number of threads to use for batch processing - - float rope_freq_base; - float rope_freq_scale; - - uint32_t n_yarn_orig_ctx; - // These hyperparameters are not exposed in GGUF, because all - // existing YaRN models use the same values for them. - float yarn_ext_factor; - float yarn_attn_factor; - float yarn_beta_fast; - float yarn_beta_slow; - - bool mul_mat_q; -}; - -struct llama_layer { - // normalization - struct ggml_tensor * attn_norm; - struct ggml_tensor * attn_norm_b; - struct ggml_tensor * attn_norm_2; - struct ggml_tensor * attn_norm_2_b; - struct ggml_tensor * attn_q_norm; - struct ggml_tensor * attn_q_norm_b; - struct ggml_tensor * attn_k_norm; - struct ggml_tensor * attn_k_norm_b; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - struct ggml_tensor * wqkv; - - // attention bias - struct ggml_tensor * bo; - struct ggml_tensor * bqkv; - - // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 - - // ff bias - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 -}; - -struct llama_kv_cell { - llama_pos pos = -1; - llama_pos delta = 0; - - std::set seq_id; - - bool has_seq_id(const llama_seq_id & id) const { - return seq_id.find(id) != seq_id.end(); - } -}; - -// ring-buffer of cached KV data -struct llama_kv_cache { - bool has_shift = false; - - // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_internal also uses it, so it - // cannot be freely changed after a slot has been allocated. 
- uint32_t head = 0; - uint32_t size = 0; - - // computed before each graph build - uint32_t n = 0; - - std::vector cells; - - struct ggml_tensor * k = NULL; - struct ggml_tensor * v = NULL; - - struct ggml_context * ctx = NULL; - - llama_buffer buf; - - ~llama_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - ggml_cuda_free_data(k); - ggml_cuda_free_data(v); - } -#endif - } -}; - -struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - - struct token_data { - token text; - float score; - ttype type; - }; - - enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; - - std::unordered_map token_to_id; - std::vector id_to_token; - - std::unordered_map special_tokens_cache; - - std::map, int> bpe_ranks; - - // default LLaMA special tokens - id special_bos_id = 1; - id special_eos_id = 2; - id special_unk_id = 0; - id special_sep_id = -1; - id special_pad_id = -1; - - int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. - int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. - - id linefeed_id = 13; - id special_prefix_id = 32007; - id special_middle_id = 32009; - id special_suffix_id = 32008; - id special_eot_id = 32010; - - int find_bpe_rank(std::string token_left, std::string token_right) const { - GGML_ASSERT(token_left.find(" ") == std::string::npos); - GGML_ASSERT(token_left.find("\n") == std::string::npos); - GGML_ASSERT(token_right.find(" ") == std::string::npos); - GGML_ASSERT(token_right.find("\n") == std::string::npos); - - auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); - if (it == bpe_ranks.end()) { - return -1; - } - - return it->second; - } -}; - -struct llama_model { - e_model type = MODEL_UNKNOWN; - llm_arch arch = LLM_ARCH_UNKNOWN; - llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - - std::string name = "n/a"; - - llama_hparams hparams = {}; - llama_vocab vocab; - - struct ggml_tensor * tok_embd; - struct ggml_tensor * pos_embd; - struct ggml_tensor * tok_norm; - struct ggml_tensor * tok_norm_b; - - struct ggml_tensor * output_norm; - struct ggml_tensor * output_norm_b; - struct ggml_tensor * output; - - std::vector layers; - - int n_gpu_layers; - - // gguf metadata - std::unordered_map gguf_kv; - - // context - struct ggml_context * ctx = NULL; - - // the model memory buffer - llama_buffer buf; - - // model memory mapped file - std::unique_ptr mapping; - - // objects representing data potentially being locked in memory - llama_mlock mlock_buf; - llama_mlock mlock_mmap; - - // for quantize-stats only - std::vector> tensors_by_name; - - int64_t t_load_us = 0; - int64_t t_start_us = 0; - - ~llama_model() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); - } -#endif - -#if defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - } -}; - -struct llama_context { - llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} - ~llama_context() { -#ifdef GGML_USE_METAL - if (ctx_metal) { - ggml_metal_free(ctx_metal); - } -#endif - if (alloc) { - ggml_allocr_free(alloc); - } - } - - llama_cparams cparams; - - const llama_model & model; - - // key + value cache for the self attention - struct llama_kv_cache kv_self; - - std::mt19937 rng; - 
- bool has_evaluated_once = false; - - int64_t t_start_us; - int64_t t_load_us; - int64_t t_sample_us = 0; - int64_t t_p_eval_us = 0; - int64_t t_eval_us = 0; - - int32_t n_sample = 0; // number of tokens sampled - int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - int32_t n_eval = 0; // number of eval calls - - // decode output (2-dimensional array: [n_tokens][n_vocab]) - std::vector logits; - bool logits_all = false; - - // input embedding (1-dimensional array: [n_embd]) - std::vector embedding; - - // reusable buffer for `struct ggml_graph_plan.work_data` - std::vector work_buffer; - - // memory buffers used to evaluate the model - llama_buffer buf_compute; - - llama_buffer buf_alloc; - ggml_allocr * alloc = NULL; - -#ifdef GGML_USE_METAL - ggml_metal_context * ctx_metal = NULL; -#endif - -#ifdef GGML_USE_MPI - ggml_mpi_context * ctx_mpi = NULL; -#endif -}; // // kv cache helpers @@ -1731,11 +1247,6 @@ static void llama_kv_cache_seq_shift( // model loading and saving // -enum llama_fver { - GGUF_FILE_VERSION_V1 = 1, - GGUF_FILE_VERSION_V2 = 2, - GGUF_FILE_VERSION_V3 = 3, -}; static const char * llama_file_version_name(llama_fver version) { switch (version) { @@ -6328,6 +5839,32 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } +// struct + +bool llama_hparams::operator!=(const llama_hparams & other) const { + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + if (this->rope_finetuned != other.rope_finetuned) return true; + if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + + const float EPSILON = 1e-9; + + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + + return false; + } + + static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { std::vector output; @@ -9713,3 +9250,40 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, fputs(text, stderr); fflush(stderr); } + + +// LLM_TN +LLM_TN::LLM_TN(llm_arch arch) : arch(arch) {} + + +std::string LLM_TN::operator()(llm_tensor tensor) const { + return LLM_TENSOR_NAMES[arch].at(tensor); + } + + std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix) const { + return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; + } + + std::string LLM_TN::operator()(llm_tensor tensor, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); + } + + std::string LLM_TN::operator()(llm_tensor tensor, const std::string & suffix, int bid) const { + return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." 
+ suffix; + } + +std::string LLM_KV::operator()(llm_kv kv) const { + return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str()); +} + + +llama_context::~llama_context() { +#ifdef GGML_USE_METAL + if (ctx_metal) { + ggml_metal_free(ctx_metal); + } +#endif + if (alloc) { + ggml_allocr_free(alloc); + } + } diff --git a/print.hpp b/print.hpp index 83ae1e10b..40dcdd802 100644 --- a/print.hpp +++ b/print.hpp @@ -3,6 +3,8 @@ #include //#include #include "llama.h" +#include "ggml-internal.hpp" +#include "llama-internal.hpp" REFL_TYPE(ggml_init_params ) REFL_END @@ -92,6 +94,12 @@ REFL_END REFL_TYPE(llama_sampling_context ) +REFL_FIELD( params) +REFL_FIELD( mirostat_mu) +REFL_FIELD( grammar) +REFL_FIELD( parsed_grammar) +REFL_FIELD( prev) +REFL_FIELD( cur) REFL_END REFL_TYPE(llama_token_data ) @@ -148,15 +156,24 @@ REFL_TYPE(ggml_something) REFL_FIELD(type_name) REFL_END -// REFL_TYPE(ggml_context) -// REFL_FIELD(mem_size) -// REFL_FIELD(mem_buffer) -// REFL_END +REFL_TYPE(ggml_context) + REFL_FIELD(mem_size) +REFL_FIELD(mem_buffer) +REFL_FIELD(mem_buffer_owned) +REFL_FIELD( no_alloc) +REFL_FIELD( no_alloc_save) +REFL_FIELD( n_objects) +REFL_FIELD( objects_begin) +REFL_FIELD( objects_end) +REFL_FIELD( scratch) +REFL_FIELD( scratch_save) -//REFL_TYPE(ggml_context_container) -// REFL_FIELD(used) -// REFL_FIELD(context) -//REFL_END +REFL_END + +REFL_TYPE(ggml_context_container) + REFL_FIELD(used) + REFL_FIELD(context) +REFL_END // REFL_TYPE(ggml_numa_node) // REFL_FIELD(cpus) @@ -340,11 +357,70 @@ REFL_END // REFL_END REFL_TYPE(llama_model) -// REFL_FIELD(type) -// REFL_FIELD(arch) + REFL_FIELD(type) + REFL_FIELD(arch) +REFL_FIELD(ftype ) + +REFL_FIELD( name ) + + REFL_FIELD( hparams ) +REFL_FIELD( vocab) + +REFL_FIELD( tok_embd) +REFL_FIELD( pos_embd) +REFL_FIELD( tok_norm) +REFL_FIELD( tok_norm_b) + +REFL_FIELD( output_norm) +REFL_FIELD( output_norm_b) +REFL_FIELD( output) + +REFL_FIELD( layers) + +REFL_FIELD( n_gpu_layers) + +REFL_FIELD( gguf_kv) + REFL_FIELD( ctx) + REFL_FIELD( buf) + REFL_FIELD( mapping) +REFL_FIELD( mlock_buf) +REFL_FIELD( mlock_mmap) +REFL_FIELD( tensors_by_name) + REFL_FIELD( t_load_us) +REFL_FIELD( t_start_us) + REFL_END REFL_TYPE(llama_context) +REFL_FIELD( cparams) +//REFL_FIELD(model) +REFL_FIELD(kv_self) +REFL_FIELD(rng) +REFL_FIELD(has_evaluated_once ) +REFL_FIELD( t_start_us) +REFL_FIELD( t_load_us) + REFL_FIELD( t_sample_us ) +REFL_FIELD( t_p_eval_us ) + REFL_FIELD( t_eval_us) +REFL_FIELD( n_sample ) +REFL_FIELD( n_p_eval ) + REFL_FIELD( n_eval ) +REFL_FIELD( logits) +REFL_FIELD( logits_all ) +REFL_FIELD( embedding) +REFL_FIELD( work_buffer) + REFL_FIELD( buf_compute) + REFL_FIELD( buf_alloc) +REFL_FIELD( alloc ) + +#ifdef GGML_USE_METAL +REFL_FIELD( ctx_metal ) +#endif + +#ifdef GGML_USE_MPI +REFL_FIELD( ctx_mpi ) + +#endif REFL_END // REFL_TYPE(llama_model_loader) @@ -459,7 +535,7 @@ void print_fields(const T& ) { // T instance{}; for_each(refl::reflect().members, [&](auto member) { - std::cout << "MEMBER" << member.name.str() << "\n"; + std::cout << "MEMBER:" << member.name.str() << "\n"; }); @@ -468,7 +544,8 @@ void print_fields(const T& ) { //if ((refl::descriptor::is_field(member)) && (!member.has_attribute()))) { //if ((refl::descriptor::is_field(member))) { // // Print the member name and value - std::cout << member.name << ": " << "\n"; + std::cout + << "Auto:" << member.name << ": " << "\n"; // refl::get(member, obj) //} });
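// ---------------------------------------------------------------------------
// A possible follow-up for print.hpp (a sketch, assuming the refl-cpp single
// header is available as "refl.hpp" and that the REFL_TYPE/REFL_FIELD
// registrations above are in place): print member values as well as names,
// but only for field members whose type is trivially printable. The helper
// name print_fields_with_values is hypothetical; the patch itself only prints
// the member names.

#include <iostream>
#include <string>
#include <type_traits>
#include "refl.hpp"

template <typename T>
void print_fields_with_values(const T & obj) {
    refl::util::for_each(refl::reflect<T>().members, [&](auto member) {
        if constexpr (refl::descriptor::is_field(member)) {
            using value_t = std::decay_t<decltype(member(obj))>;
            if constexpr (std::is_arithmetic_v<value_t> ||
                          std::is_same_v<value_t, std::string>) {
                // field with a printable type: emit name and value
                std::cout << member.name.str() << " = " << member(obj) << "\n";
                return;
            }
        }
        // everything else: fall back to the name-only output of print_fields
        std::cout << member.name.str() << "\n";
    });
}

// Usage would mirror the existing calls in examples/main/main.cpp, e.g.
// print_fields_with_values(*ctx_sampling);
// ---------------------------------------------------------------------------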