From e290792ae4e98937f5b4f14db27cbfef4d4d8fcb Mon Sep 17 00:00:00 2001
From: teleprint-me <77757836+teleprint-me@users.noreply.github.com>
Date: Wed, 20 Dec 2023 15:34:56 -0500
Subject: [PATCH] Consolidate Handling of Phi Models in llama.cpp

- Replaced LLM_ARCH_PHI2 with LLM_ARCH_PHI to unify the handling of different Phi model variants (Phi-1, Phi-1.5, Phi-2).
- Updated the architecture names map, renaming the architecture string from "phi2" to "phi".
- Adjusted the tensor names mapping to be keyed on the new LLM_ARCH_PHI enum value for consistent tensor loading and processing.
- Modified hyperparameter loading to include a case for 24 layers under LLM_ARCH_PHI, classifying it as MODEL_1B. This change accommodates different layer counts for various Phi model variants.
- Updated tensor loading sections to use the new architecture enum, ensuring proper tensor creation based on the model architecture.
- Renamed build_phi2() to build_phi() in the graph building section, aligning with the new architecture name and ensuring correct computational graph construction for Phi models.
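- Adjusted graph construction calls to use the renamed build_phi() function, and updated the architecture checks in llm_build_kqv() (F32 precision for the KQ multiplication) and llama_build_graph() (Q scaling) to test for LLM_ARCH_PHI, ensuring seamless integration and functionality for different Phi model variants.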

These changes aim to streamline the handling of various Phi models within `llama.cpp`, enhancing the application's capability to work effectively with these models while maintaining code clarity and consistency.
---
 llama.cpp | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index edd2910b3..bec9136ad 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -195,7 +195,7 @@ enum llm_arch {
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
-    LLM_ARCH_PHI2,
+    LLM_ARCH_PHI,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -213,7 +213,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_BLOOM,           "bloom"     },
     { LLM_ARCH_STABLELM,        "stablelm"  },
     { LLM_ARCH_QWEN,            "qwen"      },
-    { LLM_ARCH_PHI2,            "phi2"      },
+    { LLM_ARCH_PHI,             "phi"      },
 };
 
 enum llm_kv {
@@ -553,7 +553,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         },
     },
     {
-        LLM_ARCH_PHI2,
+        LLM_ARCH_PHI,
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
@@ -2651,11 +2651,12 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3655,7 +3656,7 @@ static void llm_load_tensors(
                         }
                     }
                 } break;
-            case LLM_ARCH_PHI2:
+            case LLM_ARCH_PHI:
                 {
                     model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
@@ -4117,7 +4118,7 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    if (model.arch == LLM_ARCH_PHI2) {
+    if (model.arch == LLM_ARCH_PHI) {
         // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
         // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -5523,7 +5524,7 @@ struct llm_build_context {
 
         return gf;
     }
-    struct ggml_cgraph * build_phi2() {
+    struct ggml_cgraph * build_phi() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         struct ggml_tensor * cur;
@@ -5924,8 +5925,8 @@ static struct ggml_cgraph * llama_build_graph(
 
             if (!ggml_allocr_is_measure(lctx.alloc)) {
                 const int64_t n_embd_head = model.hparams.n_embd_head();
-                if (model.arch == LLM_ARCH_PHI2) {
-                    // with phi2, we scale the Q to avoid precision issues
+                if (model.arch == LLM_ARCH_PHI) {
+                    // with phi, we scale the Q to avoid precision issues
                     // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
                     ggml_set_f32(cur, 1.0f);
                 } else {
@@ -6157,9 +6158,9 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen();
             } break;
-        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI:
             {
-                result = llm.build_phi2();
+                result = llm.build_phi();
             } break;
         default:
             GGML_ASSERT(false);