From 89b2a43cac57beeaea9c5ea0af371d3a301d7e3f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Feb 2024 10:28:44 +0200 Subject: [PATCH] llama : cont k-shift refactoring + normalize type names ggml-ci --- common/common.cpp | 12 +- common/common.h | 4 +- examples/llama-bench/llama-bench.cpp | 14 +- examples/server/server.cpp | 12 +- llama.cpp | 318 +++++++++++++-------------- llama.h | 41 ++-- 6 files changed, 199 insertions(+), 202 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 10ef11829..95767ce4b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/common/common.h b/common/common.h index 935771d44..3e21579b0 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae..8fec3d43d 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_NONE: return "none"; - case LLAMA_SPLIT_LAYER: return "layer"; - case LLAMA_SPLIT_ROW: return "row"; + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_LAYER}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_NONE; + mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_LAYER; + mode = LLAMA_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_ROW; + mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1b887b7a2..89fdd0f81 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2082,9 +2082,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2208,15 +2208,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; diff --git a/llama.cpp b/llama.cpp index accf026b1..a69c86e6a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_NONE, "none" }, - { LLAMA_ROPE_SCALING_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_UNSPECIFIED; + return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1581,7 +1581,8 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_NONE; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; + enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -2311,7 +2312,7 @@ namespace GGUFMeta { } }; - struct ArrayInfo{ + struct ArrayInfo { const gguf_type gt; const size_t length; const void * data; @@ -2330,7 +2331,7 @@ namespace GGUFMeta { }; template - class GKV: public GKV_Base { + class GKV : public GKV_Base { GKV() = delete; public: @@ -2353,39 +2354,39 @@ namespace GGUFMeta { return "unknown"; } - static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) { - if (!override) { return false; } - if (override->tag == expected_type) { + static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) { + if (!ovrd) { return false; } + if (ovrd->tag == expected_type) { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", - __func__, override_type_to_str(override->tag), override->key); - switch (override->tag) { + __func__, override_type_to_str(ovrd->tag), ovrd->key); + switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_BOOL: { - LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); } break; case LLAMA_KV_OVERRIDE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", override->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... throw std::runtime_error( format("Unsupported attempt to override %s type for metadata key %s\n", - override_type_to_str(override->tag), override->key)); + override_type_to_str(ovrd->tag), ovrd->key)); } return true; } LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n", - __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag)); + __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag)); return false; } template static typename std::enable_if::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { - target = override->bool_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_BOOL, ovrd)) { + target = ovrd->bool_value; return true; } return false; @@ -2393,9 +2394,9 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type - try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { - target = override->int_value; + try_override(OT & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_INT, ovrd)) { + target = ovrd->int_value; return true; } return false; @@ -2403,9 +2404,9 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { - target = override->float_value; + try_override(T & target, const struct llama_model_kv_override * ovrd) { + if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, ovrd)) { + target = ovrd->float_value; return true; } return false; @@ -2413,17 +2414,17 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type - try_override(T & target, const struct llama_model_kv_override *override) { + try_override(T & target, const struct llama_model_kv_override * ovrd) { (void)target; - (void)override; - if (!override) { return false; } + (void)ovrd; + if (!ovrd) { return false; } // Currently, we should never end up here so it would be a bug if we do. throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - override ? override->key : "NULL")); + ovrd ? ovrd->key : "NULL")); } - static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) { - if (try_override(target, override)) { + static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + if (try_override(target, ovrd)) { return true; } if (k < 0) { return false; } @@ -2431,12 +2432,12 @@ namespace GGUFMeta { return true; } - static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, gguf_find_key(ctx, key), target, override); + static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, gguf_find_key(ctx, key), target, ovrd); } - static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) { - return set(ctx, key.c_str(), target, override); + static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) { + return set(ctx, key.c_str(), target, ovrd); } }; } @@ -2846,6 +2847,15 @@ struct llama_model_loader { } }; +template<> +bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) { + uint32_t tmp; + const bool found = get_key(kid, tmp, required); + result = (enum llama_pooling_type) tmp; + return found; +} + + // // load LLaMA models // @@ -2924,16 +2934,16 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } + static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ switch (type) { - case LLAMA_VOCAB_TYPE_SPM: return "SPM"; - case LLAMA_VOCAB_TYPE_BPE: return "BPE"; - case LLAMA_VOCAB_TYPE_WPM: return "WPM"; - default: return "unknown"; + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + case LLAMA_VOCAB_TYPE_WPM: return "WPM"; + default: return "unknown"; } } - static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); if (model.arch == LLM_ARCH_UNKNOWN) { @@ -2997,7 +3007,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3110,10 +3120,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); switch (hparams.n_layer) { case 3: @@ -3131,10 +3141,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_NOMIC_BERT: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type); - ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); if (hparams.n_layer == 12 && hparams.n_embd == 768) { model.type = e_model::MODEL_137M; @@ -3273,6 +3283,8 @@ static void llm_load_hparams( if (hparams.f_max_alibi_bias > 0.0f) { hparams.need_kq_pos = true; } + + hparams.rope_type = llama_rope_type(&model); } // TODO: This should probably be in llama.h @@ -3575,6 +3587,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); + LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); + LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -3641,7 +3655,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_LAYER) { + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3680,10 +3694,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_ROW) { + if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -4596,13 +4610,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam using llm_build_cb = std::function; -enum llm_rope_type : int { - LLM_ROPE_NONE = -1, - LLM_ROPE = 0, - LLM_ROPE_NEOX = 2, - LLM_ROPE_GLM = 4, -}; - enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU, @@ -4648,47 +4655,6 @@ static struct ggml_tensor * llm_build_inp_embd( return inpL; } -// Persimmon: n_rot = n_embd_head_k/2 -// Other: n_rot = n_embd_head_k -static void llm_build_k_shift( - struct ggml_context * ctx, - const llama_hparams & hparams, - const llama_cparams & cparams, - const llama_kv_cache & kv, - struct ggml_cgraph * graph, - struct ggml_tensor * K_shift, - llm_rope_type rope_type, - int64_t n_ctx, - float freq_base, - float freq_scale, - const llm_build_cb & cb) { - const int64_t n_layer = hparams.n_layer; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const int32_t n_rot = hparams.n_rot; - const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; - const float ext_factor = cparams.yarn_ext_factor; - const float attn_factor = cparams.yarn_attn_factor; - const float beta_fast = cparams.yarn_beta_fast; - const float beta_slow = cparams.yarn_beta_slow; - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions - ggml_rope_custom_inplace(ctx, - ggml_view_3d(ctx, kv.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), - 0), - K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(graph, tmp); - } -} - static void llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, @@ -4982,38 +4948,6 @@ static struct ggml_tensor * llm_build_kv( return cur; } -static llm_rope_type llm_get_rope_type(llm_arch arch) { - switch (arch) { - case LLM_ARCH_LLAMA: return LLM_ROPE; - case LLM_ARCH_FALCON: return LLM_ROPE_NEOX; - case LLM_ARCH_BAICHUAN: return LLM_ROPE; - case LLM_ARCH_GPT2: return LLM_ROPE_NONE; - case LLM_ARCH_GPTJ: return LLM_ROPE_NONE; - case LLM_ARCH_GPTNEOX: return LLM_ROPE_NONE; - case LLM_ARCH_MPT: return LLM_ROPE_NONE; - case LLM_ARCH_STARCODER: return LLM_ROPE; - case LLM_ARCH_PERSIMMON: return LLM_ROPE_NEOX; - case LLM_ARCH_REFACT: return LLM_ROPE_NONE; - case LLM_ARCH_BERT: return LLM_ROPE_NEOX; - case LLM_ARCH_NOMIC_BERT: return LLM_ROPE_NEOX; - case LLM_ARCH_BLOOM: return LLM_ROPE_NONE; - case LLM_ARCH_STABLELM: return LLM_ROPE_NEOX; - case LLM_ARCH_QWEN: return LLM_ROPE_NEOX; - case LLM_ARCH_QWEN2: return LLM_ROPE_NEOX; - case LLM_ARCH_PHI2: return LLM_ROPE_NEOX; - case LLM_ARCH_PLAMO: return LLM_ROPE; - case LLM_ARCH_CODESHELL: return LLM_ROPE; - case LLM_ARCH_ORION: return LLM_ROPE; - case LLM_ARCH_INTERNLM2: return LLM_ROPE; - case LLM_ARCH_MINICPM: return LLM_ROPE; - case LLM_ARCH_GEMMA: return LLM_ROPE; - case LLM_ARCH_UNKNOWN: - default: - GGML_ASSERT(false && "unknown architecture"); - return LLM_ROPE_NONE; - } -} - struct llm_build_context { const llama_model & model; const llama_context & lctx; @@ -5024,6 +4958,7 @@ struct llm_build_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; @@ -5048,9 +4983,8 @@ struct llm_build_context { const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; - const uint32_t pooling_type; - - const llm_rope_type rope_type; + const enum llama_pooling_type pooling_type; + const enum llama_rope_type rope_type; const llm_build_cb & cb; @@ -5072,6 +5006,7 @@ struct llm_build_context { kv_self (lctx.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), + n_rot (hparams.n_rot), n_ctx (cparams.n_ctx), n_head (hparams.n_head), n_head_kv (hparams.n_head_kv), @@ -5093,8 +5028,8 @@ struct llm_build_context { n_kv (worst_case ? n_ctx : kv_self.n), kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), - rope_type (llm_get_rope_type(model.arch)), + pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE), + rope_type (hparams.rope_type), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -5120,7 +5055,20 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, rope_type, n_ctx, freq_base, freq_scale, cb); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k_l[il], + n_embd_head_k, n_head_kv, n_ctx, + ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + 0), + lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(gf, tmp); + } return gf; } @@ -6063,12 +6011,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_MEAN) { + if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_CLS) { + } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7521,6 +7469,7 @@ struct llm_build_context { static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { llama_batch dummy; + dummy.n_tokens = 0; llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; @@ -7735,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7763,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -7784,7 +7733,7 @@ static void llama_graph_compute( ggml_cgraph * gf, int n_threads) { #ifdef GGML_USE_MPI - const int64_t n_layer = hparams.n_layer; + const int64_t n_layer = lctx.hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif @@ -7902,9 +7851,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - if (kv_self.has_shift) { - llama_kv_cache_apply_k_shift(&lctx); - } + llama_kv_cache_apply(&lctx); ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8042,24 +7989,25 @@ static int llama_decode_internal( return 0; } -void llama_kv_cache_apply_k_shift(struct llama_context * ctx) { - struct llama_context & lctx = *ctx; +static void llama_kv_cache_apply_internal(struct llama_context & lctx) { + // apply K-shift if needed + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + llama_set_k_shift(lctx); - llama_set_k_shift(lctx); + { + ggml_cgraph * gf = llama_build_graph_k_shift(lctx); - { - ggml_cgraph * gf = llama_build_graph_k_shift(lctx); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); - } + { + auto & kv_self = lctx.kv_self; - { - auto & kv_self = ctx->kv_self; + kv_self.has_shift = false; - kv_self.has_shift = false; - - for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].delta = 0; + } } } } @@ -11338,7 +11286,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11364,7 +11312,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11552,16 +11500,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11595,8 +11543,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11605,7 +11553,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { @@ -11807,6 +11755,38 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +enum llama_rope_type llama_rope_type(const struct llama_model * model) { + switch (model->arch) { + case LLM_ARCH_LLAMA: return LLAMA_ROPE_TYPE; + case LLM_ARCH_FALCON: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_BAICHUAN: return LLAMA_ROPE_TYPE; + case LLM_ARCH_GPT2: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_GPTJ: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_MPT: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_STARCODER: return LLAMA_ROPE_TYPE; + case LLM_ARCH_PERSIMMON: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_REFACT: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_BERT: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_NOMIC_BERT: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_BLOOM: return LLAMA_ROPE_TYPE_NONE; + case LLM_ARCH_STABLELM: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_QWEN: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_QWEN2: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_PHI2: return LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_PLAMO: return LLAMA_ROPE_TYPE; + case LLM_ARCH_CODESHELL: return LLAMA_ROPE_TYPE; + case LLM_ARCH_ORION: return LLAMA_ROPE_TYPE; + case LLM_ARCH_INTERNLM2: return LLAMA_ROPE_TYPE; + case LLM_ARCH_MINICPM: return LLAMA_ROPE_TYPE; + case LLM_ARCH_GEMMA: return LLAMA_ROPE_TYPE; + case LLM_ARCH_UNKNOWN: + default: + GGML_ASSERT(false && "unknown architecture"); + return LLAMA_ROPE_TYPE_NONE; + } +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } @@ -12065,6 +12045,10 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); } +void llama_kv_cache_apply(struct llama_context * ctx) { + llama_kv_cache_apply_internal(*ctx); +} + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 104ca7ead..479265f6c 100644 --- a/llama.h +++ b/llama.h @@ -64,6 +64,13 @@ extern "C" { LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece }; + enum llama_rope_type { + LLAMA_ROPE_TYPE_NONE = -1, + LLAMA_ROPE_TYPE = 0, + LLAMA_ROPE_TYPE_NEOX = 2, + LLAMA_ROPE_TYPE_GLM = 4, + }; + enum llama_token_type { LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, @@ -107,23 +114,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_NONE = 0, - LLAMA_ROPE_SCALING_LINEAR = 1, - LLAMA_ROPE_SCALING_YARN = 2, - LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_TYPE_NONE = 0, + LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, + LLAMA_ROPE_SCALING_TYPE_YARN = 2, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_NONE = 0, - LLAMA_POOLING_MEAN = 1, - LLAMA_POOLING_CLS = 2, + LLAMA_POOLING_TYPE_NONE = 0, + LLAMA_POOLING_TYPE_MEAN = 1, + LLAMA_POOLING_TYPE_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_NONE = 0, // single GPU - LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { @@ -358,6 +365,7 @@ extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @@ -512,7 +520,9 @@ extern "C" { llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_apply() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_add( @@ -523,7 +533,9 @@ extern "C" { llama_pos delta); // Integer division of the positions by factor of `d > 1` - // If the KV cache is RoPEd, the KV data is updated accordingly + // If the KV cache is RoPEd, the KV data is updated accordingly: + // - lazily on next llama_decode() + // - explicitly with llama_kv_cache_apply() // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) LLAMA_API void llama_kv_cache_seq_div( @@ -533,7 +545,8 @@ extern "C" { llama_pos p1, int d); - LLAMA_API void llama_kv_cache_apply_k_shift(struct llama_context * ctx); + // Apply the KV cache updates (such as K-shifts) to the KV data + LLAMA_API void llama_kv_cache_apply(struct llama_context * ctx); // // State / sessions