diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index b8a12e1c3..f66f5e9cc 100644
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -120,6 +120,8 @@
 if "max_sequence_length" in hparams:
     ctx_length = hparams["max_sequence_length"]
 elif "max_position_embeddings" in hparams:
     ctx_length = hparams["max_position_embeddings"]
+elif "model_max_length" in hparams:
+    ctx_length = hparams["model_max_length"]
 else:
     print("gguf: can not find ctx length parameter.")
@@ -231,12 +233,7 @@ for part_name in part_names:
 
     tmp=model_part
     for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight" in model_part:
-            print(f"Permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = reverse_hf_permute(model_part[f"model.layers.{i}.self_attn.q_proj.weight"], head_count, head_count)
-            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = reverse_hf_permute(model_part[f"model.layers.{i}.self_attn.k_proj.weight"], head_count, head_count_kv)
-            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
-        elif f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
+        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
             print(f"Unpacking and permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
@@ -259,14 +256,6 @@ for part_name in part_names:
 
         data = data.squeeze().numpy()
 
-        # reverse permute these
-        # if name.endswith(".q_proj.weight"):
-        #     data = reverse_hf_permute(data, head_count)
-        # if name.endswith(".k_proj.weight"):
-        #     data = reverse_hf_permute(data, head_count, head_count_kv)
-
-
-
         # map tensor names
         new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
         if new_name is None:
@@ -289,8 +278,6 @@ for part_name in part_names:
 
             data = data.astype(np.float16)
 
         print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-
         gguf_writer.add_tensor(new_name, data)
 
diff --git a/llama.cpp b/llama.cpp
index 21d0d8635..dc82037ad 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1948,7 +1948,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;
 
     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -2777,13 +2776,11 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             struct ggml_tensor * Kcur;
             struct ggml_tensor * Qcur;
-            switch (model.type)
-            {
+            switch (model.type) {
                 case MODEL_7B:
                     Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
-                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
                     break;
-
                 case MODEL_13B:
                     Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
                     Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
@@ -2797,8 +2794,6 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
-
-
 
             // store key and value to memory
             {
@@ -2853,13 +2848,11 @@ static struct ggml_cgraph * llm_build_baichaun(
 
             struct ggml_tensor * KQ_masked;
             struct ggml_tensor * KQ_scaled_alibi;
-            // if model.type == MODEL_13B,here add kq_scaled_alibi
-            switch (model.type)
-            {
+
+            switch (model.type) {
                 case MODEL_7B:
                     KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
                     break;
-
                 case MODEL_13B:
                     KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
                     ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
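
Why the new `model_max_length` branch in the first hunk: Baichuan 2 checkpoints publish their context window under `model_max_length` in `config.json`, so without this fallback the converter drops through to the "can not find ctx length parameter" error. A minimal sketch of the same lookup chain written as a function (the `find_ctx_length` helper name is illustrative, not part of the script):

```python
def find_ctx_length(hparams: dict) -> int:
    # Same priority order as the hunk above.
    for key in ("max_sequence_length", "max_position_embeddings", "model_max_length"):
        if key in hparams:
            return hparams[key]
    raise KeyError("gguf: can not find ctx length parameter.")

print(find_ctx_length({"model_max_length": 4096}))  # 4096
```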
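On the `W_pack` hunk: Baichuan checkpoints fuse the attention projections into a single `W_pack` tensor with Q, K and V stacked row-wise, which is why the converter now keys only on `W_pack` and slices it apart rather than also handling separate `q_proj`/`k_proj` tensors. A rough numpy sketch of the unpack-and-permute step, assuming the helpers behave like the LLaMA converters' `reverse_hf_permute` (the `unpack_w_pack` wrapper and toy shapes are illustrative, not the script verbatim):

```python
import numpy as np

def reverse_hf_permute(w: np.ndarray, n_head: int, n_head_kv: int | None = None) -> np.ndarray:
    # Undo the HF interleaving of rotary dimension pairs so llama.cpp
    # sees them in the order its RoPE kernel expects.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    return (w.reshape(n_head, 2, w.shape[0] // n_head // 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

def unpack_w_pack(w_pack: np.ndarray, part: int, n_head: int, n_head_kv: int | None = None) -> np.ndarray:
    # W_pack stacks Q (part 0), K (part 1) and V (part 2) row-wise;
    # take one third, then permute it like a standalone projection.
    rows = w_pack.shape[0] // 3
    return reverse_hf_permute(w_pack[rows * part : rows * (part + 1), ...], n_head, n_head_kv)

w_pack = np.arange(24 * 8, dtype=np.float32).reshape(24, 8)  # toy n_embd = 8, n_head = 2
q = unpack_w_pack(w_pack, 0, 2)
k = unpack_w_pack(w_pack, 1, 2, 2)
print(q.shape, k.shape)  # (8, 8) (8, 8)
```

The V third only needs the slice, not the permute, since RoPE never touches it.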
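On the last two llama.cpp hunks: the tidied switches make the positional-encoding split visible. Baichuan-7B rotates Q and K with RoPE (`ggml_rope_custom_inplace`), while Baichuan-13B leaves Q and K unrotated and instead adds an ALiBi bias to the scaled attention scores via `ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8)`, where 8 is the maximum bias, before the causal mask is applied. A minimal numpy sketch of that bias under the ALiBi paper's slope schedule (ggml's kernel handles non-power-of-two head counts slightly differently):

```python
import numpy as np

def alibi_bias(n_head: int, n_kv: int, max_bias: float = 8.0) -> np.ndarray:
    # Per-head slopes form a geometric sequence: 2^(-max_bias * h / n_head).
    slopes = 2.0 ** (-max_bias * np.arange(1, n_head + 1) / n_head)
    # The bias grows linearly with key position and is broadcast over heads,
    # so each head attends with a different recency preference.
    return slopes[:, None] * np.arange(n_kv)[None, :]

bias = alibi_bias(n_head=4, n_kv=6)
print(bias.shape)  # (4, 6) -- added to KQ_scaled, then the causal mask is applied
```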