From 07553cfb0fdf3b638c0ec4013d08f4fb5efb7925 Mon Sep 17 00:00:00 2001
From: HimariO
Date: Sun, 10 Nov 2024 16:10:29 +0800
Subject: [PATCH] update `llama_hparams`

---
 convert_hf_to_gguf.py              |  6 ++++++
 examples/llava/qwen2_vl_surgery.py |  5 ++++-
 gguf-py/gguf/constants.py          |  1 +
 gguf-py/gguf/gguf_writer.py        |  3 +++
 src/llama.cpp                      | 26 +++++++++++++++++---------
 5 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bccda506b..df14a7988 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1980,6 +1980,12 @@ class Qwen2Model(Model):
 class Qwen2VLModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2VL
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py
index c6d966581..c71bc973f 100644
--- a/examples/llava/qwen2_vl_surgery.py
+++ b/examples/llava/qwen2_vl_surgery.py
@@ -133,7 +133,10 @@ def main(args):
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
     fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0) # BUG: not sure what this does
     fout.add_name(model_name)
-    # fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
+    """
+    HACK: Since the vision rope related parameters aren't stored in `Qwen2VLConfig`,
+    they are hardcoded in `clip_image_build_graph` in `clip.cpp`.
+    """
 
     processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
     # breakpoint()
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 3feabfd64..450bd9716 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -131,6 +131,7 @@ class Keys:
 
     class Rope:
         DIMENSION_COUNT         = "{arch}.rope.dimension_count"
+        DIMENSION_SECTIONS      = "{arch}.rope.dimension_sections"
         FREQ_BASE               = "{arch}.rope.freq_base"
         SCALING_TYPE            = "{arch}.rope.scaling.type"
         SCALING_FACTOR          = "{arch}.rope.scaling.factor"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7a55d1296..3b1d7e9e9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -750,6 +750,9 @@ class GGUFWriter:
 
     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
+        self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
 
     def add_rope_freq_base(self, value: float) -> None:
         self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
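The conversion-side change above pads the `mrope_section` list from the HF config to a fixed length of four before writing it, since the loader reads exactly four entries. A minimal standalone sketch of that padding, assuming the `[16, 24, 24]` value shipped in Qwen2-VL's `config.json` (the same values the hard-coded array removed in the llama.cpp hunk below contained):

    # Padding behavior of Qwen2VLModel.set_gguf_parameters, shown standalone.
    # [16, 24, 24] is assumed to come from `rope_scaling.mrope_section` in the
    # HF config; the sections split the rotary dimensions among the temporal,
    # height, and width position components, and the fourth slot is unused.
    mrope_section = [16, 24, 24]
    mrope_section += [0] * max(0, 4 - len(mrope_section))
    assert mrope_section == [16, 24, 24, 0]
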
diff --git a/src/llama.cpp b/src/llama.cpp
index 7c4172a42..7c31c80d8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -310,6 +310,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -426,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE,           "%s.attention.scale" },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type" },
@@ -2429,11 +2431,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
 
-    float rope_attn_factor = 1.0f;
-    float rope_freq_base_train;
-    float rope_freq_scale_train;
-    uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
+    float    rope_attn_factor = 1.0f;
+    float    rope_freq_base_train;
+    float    rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float    rope_yarn_log_mul;
+    std::array<int, 4> rope_mrope_sections; // per-axis rope dimension sections for M-RoPE (Qwen2VL)
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -2488,8 +2491,9 @@ struct llama_hparams {
         if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
-        if (this->rope_finetuned != other.rope_finetuned) return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
+        if (this->rope_finetuned      != other.rope_finetuned)      return true;
+        if (this->n_ctx_orig_yarn     != other.n_ctx_orig_yarn)     return true;
+        if (this->rope_mrope_sections != other.rope_mrope_sections) return true;
 
         if (this->ssm_d_conv != other.ssm_d_conv) return true;
         if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -5710,8 +5714,12 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
-        case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2VL:
+            {
+                std::fill(hparams.rope_mrope_sections.begin(), hparams.rope_mrope_sections.end(), 0);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_mrope_sections, 4, true);
+            } // fall through
+        case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
@@ -12532,7 +12540,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
-        int sections[4] = {16, 24, 24, 0}; // TODO: move this into gguf model file.
+        int * sections = (int *)hparams.rope_mrope_sections.data();
 
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
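As a quick sanity check after conversion, the new array can be read back with gguf-py's `GGUFReader`. A minimal sketch, assuming a converted file at the hypothetical path `qwen2vl-7b.gguf`; the key name follows the `{arch}.rope.dimension_sections` template with the `qwen2vl` architecture name, and the field decoding follows gguf-py's reader layout:

    # Hypothetical round-trip check (not part of this patch).
    from gguf import GGUFReader

    reader = GGUFReader("qwen2vl-7b.gguf")  # hypothetical output path
    field = reader.fields["qwen2vl.rope.dimension_sections"]
    # For array-valued fields, `field.data` holds one index into `field.parts`
    # per element; each part is a one-element numpy array.
    sections = [int(field.parts[i][0]) for i in field.data]
    print(sections)  # expected: [16, 24, 24, 0]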