diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 60bceb0fa..cbe4a9f11 100755
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -121,7 +121,7 @@ gguf_writer.add_embedding_length(hparams["d_model"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
 gguf_writer.add_head_count(hparams["n_heads"])
-gguf_writer.add_layer_norm_eps(1e-05)
+gguf_writer.add_layer_norm_eps(1e-05)
 if hparams["attn_config"]["clip_qkv"] is not None:
     gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
 gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
diff --git a/llama.cpp b/llama.cpp
index 81a014d0a..ede95f607 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4304,11 +4304,11 @@ static struct ggml_cgraph * llm_build_mpt(
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
-            // TODO: replace with ggml_add()
-            struct ggml_tensor * KQ_scaled_alibi =
-                ggml_alibi(ctx0, KQ_scaled, std::max(kv_head, n_kv - n_tokens), n_head, max_alibi_bias);
+            // TODO: replace with ggml_add()
+            struct ggml_tensor * KQ_scaled_alibi =
+                ggml_alibi(ctx0, KQ_scaled, std::max(kv_head, n_kv - n_tokens), n_head, max_alibi_bias);
             offload_func_kq(KQ_scaled_alibi);
-            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
 
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
             offload_func_kq(KQ_masked);
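
Note (not part of the patch): a minimal sketch of how the metadata fields touched in the convert-script hunk above are typically read from an MPT config.json and written with the gguf Python package. The config path, output filename, and the GGUFWriter construction are assumptions for illustration only; the add_* calls mirror the ones in the hunk.

    import json
    import gguf  # the gguf Python package shipped with llama.cpp

    # assumption: an MPT-style config.json in the current directory
    with open("config.json", "r", encoding="utf-8") as f:
        hparams = json.load(f)

    # assumption: output path and arch string are placeholders
    gguf_writer = gguf.GGUFWriter("mpt.gguf", "mpt")

    gguf_writer.add_embedding_length(hparams["d_model"])
    gguf_writer.add_block_count(hparams["n_layers"])
    gguf_writer.add_feed_forward_length(4 * hparams["d_model"])  # MPT uses a 4x MLP expansion
    gguf_writer.add_head_count(hparams["n_heads"])
    gguf_writer.add_layer_norm_eps(1e-05)

    # clip_qkv is optional in MPT configs, so KQV clamping is only written when it is set
    if hparams["attn_config"]["clip_qkv"] is not None:
        gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
    gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])

The conditional write matters because loaders treat a missing clamp_kqv key as "no clamping", whereas alibi_bias_max is always present in MPT configs and is consumed by ggml_alibi in the llama.cpp hunk above.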