diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index 60e968153..cebe53d10 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -63,24 +63,25 @@
 #include // for _fseeki64
 #endif
 
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
-#include
+#include
+#include
 #include
-#include
 #include
+#include
+#include
+#include
+#include
+#include
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -136,11 +137,12 @@ __attribute__((format(printf, 1, 2)))
 #endif
 #endif
 static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
+    va_list ap;
+    va_list ap2;
     va_start(ap, fmt);
     va_copy(ap2, ap);
     int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
     std::vector<char> buf(size + 1);
     int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
     GGML_ASSERT(size2 == size);
@@ -668,7 +670,7 @@ struct llama_hparams {
     uint32_t n_rot = 64;
     uint32_t n_ff  = 11008;
 
-    float f_rms_norm_eps = 1e-5;
+    float f_norm_rms_eps = 1e-5;
 
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -1279,7 +1281,7 @@ static void llama_model_load_internal(
         hparams.n_head  = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.attention.head_count"));
         hparams.n_layer = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.block_count"));
         hparams.n_rot   = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.rope.dimension_count"));
-        hparams.f_rms_norm_eps = gguf_get_val_f32(ctx, gguf_find_key(ctx, "llama.rms_norm_epsilon"));
+        hparams.f_norm_rms_eps = gguf_get_val_f32(ctx, gguf_find_key(ctx, "llama.attention.layer_norm_rms_epsilon"));
 
         // n_head_kv default to n_head
         hparams.n_head_kv = hparams.n_head;
@@ -1360,7 +1362,7 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
         LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
         LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
-        LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
         LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
         LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
         LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
@@ -1658,9 +1660,9 @@ static struct ggml_cgraph * llama_build_graph(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
-    const float freq_base  = hparams.rope_freq_base;
-    const float freq_scale = hparams.rope_freq_scale;
-    const float rms_norm_eps = hparams.f_rms_norm_eps;
+    const float freq_base    = hparams.rope_freq_base;
+    const float freq_scale   = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
@@ -1767,7 +1769,7 @@ static struct ggml_cgraph * llama_build_graph(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
@@ -1912,7 +1914,7 @@ static struct ggml_cgraph * llama_build_graph(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
                 offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
@@ -1962,7 +1964,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     // norm
     {
-        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
+        cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_2");