From 085228e1f5c8886d6a09d78235f849a3c5ef8de4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 22 Aug 2023 22:09:56 +0300
Subject: [PATCH] llama : add arch member to llama_model

---
 llama.cpp | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index d4d5984f8..e19f46a88 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -938,10 +938,10 @@ struct llama_vocab {
 
 struct llama_model {
     e_model type = MODEL_UNKNOWN;
+    llm_arch arch = LLM_ARCH_UNKNOWN;
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
 
     std::string name = "n/a";
-    std::string arch = "n/a";
 
     llama_hparams hparams;
     llama_vocab vocab;
@@ -1481,7 +1481,7 @@ static const char * llama_model_type_name(e_model type) {
     }
 }
 
-static llm_arch llm_load_arch(llama_model_loader & ml) {
+static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     struct gguf_context * ctx = ml.ctx_gguf;
 
     const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
@@ -1489,16 +1489,13 @@ static llm_arch llm_load_arch(llama_model_loader & ml) {
     std::string arch_name;
     GGUF_GET_KEY(ctx, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_GENERAL_ARCHITECTURE));
 
-    const llm_arch arch = llm_arch_from_string(arch_name);
-    if (arch == LLM_ARCH_UNKNOWN) {
+    model.arch = llm_arch_from_string(arch_name);
+    if (model.arch == LLM_ARCH_UNKNOWN) {
         throw std::runtime_error("unknown model architecture: '" + arch_name + "'");
     }
-
-    return arch;
 }
 
 static void llm_load_hparams(
-        llm_arch arch,
         llama_model_loader & ml,
         llama_model & model,
         int n_ctx,
@@ -1506,13 +1503,12 @@ static void llm_load_hparams(
         float rope_freq_scale) {
     struct gguf_context * ctx = ml.ctx_gguf;
 
-    const auto kv = LLM_KV(arch);
+    const auto kv = LLM_KV(model.arch);
 
     auto & hparams = model.hparams;
 
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
-    GGUF_GET_KEY(ctx, model.arch, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
 
     // get hparams kv
     GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
@@ -1548,7 +1544,7 @@ static void llm_load_hparams(
     }
 
     // arch-specific KVs
-    switch (arch) {
+    switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
                 GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@@ -1593,14 +1589,13 @@ static void llm_load_hparams(
 }
 
 static void llm_load_vocab(
-        llm_arch arch,
         llama_model_loader & ml,
         llama_model & model) {
     auto & vocab = model.vocab;
 
     struct gguf_context * ctx = ml.ctx_gguf;
 
-    const auto kv = LLM_KV(arch);
+    const auto kv = LLM_KV(model.arch);
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
     if (token_idx == -1) {
@@ -1672,7 +1667,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     // hparams
     LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, model.arch.c_str());
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
     LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
     LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
     LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
@@ -1704,7 +1699,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 }
 
 static void llm_load_tensors(
-        llm_arch arch,
         llama_model_loader & ml,
         llama_model & model,
         int n_batch,
@@ -1776,9 +1770,9 @@ static void llm_load_tensors(
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;
 
-    const auto tn = LLM_TN(arch);
+    const auto tn = LLM_TN(model.arch);
 
-    switch (arch) {
+    switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
@@ -1993,10 +1987,9 @@ static bool llama_model_load(
     try {
         std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-        const llm_arch arch = llm_load_arch(*ml);
-
-        llm_load_hparams(arch, *ml, model, n_ctx, rope_freq_base, rope_freq_scale);
-        llm_load_vocab  (arch, *ml, model);
+        llm_load_arch   (*ml, model);
+        llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+        llm_load_vocab  (*ml, model);
 
         llm_load_print_meta(*ml, model);
 
@@ -2010,7 +2003,7 @@ static bool llama_model_load(
         }
 
         llm_load_tensors(
-                arch, *ml, model, n_batch, n_gpu_layers,
+                *ml, model, n_batch, n_gpu_layers,
                 main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
                 use_mlock, progress_callback, progress_callback_user_data);
     } catch (const std::exception & err) {