From ba0ab56b63077cb4d0231236cda6955016c498f2 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 27 Jul 2023 18:54:06 +0200
Subject: [PATCH] llama.cpp : fix embeddings output

---
 llama.cpp | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index c376d6517..3ae2a895e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1708,9 +1708,6 @@ static struct ggml_cgraph * llama_build_graph(
 
     lctx.use_buf(ctx0, 0);
 
-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1721,11 +1718,6 @@ static struct ggml_cgraph * llama_build_graph(
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
-#ifdef LLAMA_USE_ALLOCATOR
-        // TODO: ensure that embeddings is not freed
-#endif
     }
 
     // lm_head
@@ -1754,7 +1746,6 @@ static struct ggml_cgraph * llama_build_graph(
 
     ggml_free(ctx0);
 
-    // outputs: cur, embeddings
     return gf;
 
 #ifdef LLAMA_USE_ALLOCATOR
@@ -1864,10 +1855,10 @@ static bool llama_eval_internal(
     lctx.kv_self.n = n_past + N;
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = NULL;
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
 
     LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
-    //LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
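
The change drops the `embeddings` pointer that `llama_build_graph` used to carry out of
graph construction; per the removed TODO, under `LLAMA_USE_ALLOCATOR` that tensor could
be freed before `llama_eval_internal` read it. Instead, eval recovers the embeddings
tensor positionally as the second-to-last graph node and enables the previously
commented-out assert, so the `result_norm` name verifies the positional assumption.

A minimal sketch of the same recovery done by name rather than by position, assuming
only the public ggml fields (`gf->n_nodes`, `gf->nodes`, `tensor->name`); the helper
itself is hypothetical and not part of the patch:

    #include <string.h>
    #include "ggml.h"

    // Hypothetical helper: scan the graph back-to-front for a node with the
    // given name. The patch takes gf->nodes[gf->n_nodes - 2] directly and
    // asserts the name afterwards; a lookup like this would also survive
    // extra nodes being appended after "result_norm".
    static struct ggml_tensor * find_node_by_name(struct ggml_cgraph * gf, const char * name) {
        for (int i = gf->n_nodes - 1; i >= 0; --i) { // outputs sit near the end
            if (strcmp(gf->nodes[i]->name, name) == 0) {
                return gf->nodes[i];
            }
        }
        return NULL;
    }

    // Usage mirroring the patched llama_eval_internal:
    //   struct ggml_tensor * res        = find_node_by_name(gf, "result_output");
    //   struct ggml_tensor * embeddings = find_node_by_name(gf, "result_norm");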