From 640a8896329f73af36e7726c8ac0f6f2fca6a721 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 2 Jun 2023 21:00:30 +0300
Subject: [PATCH] mtl : add save/load vocab to ggml file

---
 examples/mtl/mtl.cpp | 25 +++++++++++++++++++++++++
 examples/mtl/mtl.m   | 14 --------------
 llama.cpp            | 44 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index b7b84cecf..ff1c1f685 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -24,6 +24,31 @@ int main(int argc, char ** argv) {
     struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
     gf.n_threads = 1;
 
+    {
+        struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab");
+        if (t_vocab == NULL) {
+            fprintf(stderr, "%s: vocab tensor not found\n", __func__);
+            return -1;
+        }
+
+        const char * ptr = (const char *) t_vocab->data;
+
+        int32_t n_vocab = 0;
+        memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab);
+
+        printf("%s: n_vocab = %d\n", __func__, n_vocab);
+
+        for (int i = 0; i < 512; ++i) {
+            char text[32];
+            float score;
+
+            memcpy(text, ptr, sizeof(text));    ptr += sizeof(text);
+            memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score);
+
+            printf("%s: token[%4d] = %16.*s, score = %6.2f\n", __func__, i, (int) sizeof(text), text, score);
+        }
+    }
+
     // allocate work context
     static size_t buf_size = gf.work_size; // TODO
     static void * buf = malloc(buf_size);
diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 89ed45c01..e4839626e 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -108,20 +108,6 @@ struct ggml_mtl_context * llama_mtl_init(
             exit(1);
         }
     }
-#elif 0
-    // this does not work !?!?!
-
-    // load library from "mtl.metallib"
-    {
-        NSError * error = nil;
-
-        NSString * path = [[NSBundle mainBundle] pathForResource:@"./mtl" ofType:@"metallib"];
-        ctx->library = [ctx->device newLibraryWithFile:path error:&error];
-        if (error) {
-            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
-        }
-    }
 #else
     // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource
     {
diff --git a/llama.cpp b/llama.cpp
index c998a77fb..9a8bf9df7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1505,7 +1505,49 @@ static bool llama_eval_internal(
     //}
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
+        {
+            char tmp[32]; // max token length
+
+            // store null-terminated string for simplicity
+            std::vector<char> buf_vocab(sizeof(int32_t) + n_vocab*(32 + sizeof(float)));
+
+            uint64_t offs = 0;
+
+            {
+                const int32_t n = n_vocab;
+                memcpy(&buf_vocab[offs], &n, sizeof(n)); offs += sizeof(n);
+            }
+
+            for (int i = 0; i < n_vocab; i++) {
+                const int32_t id = i;
+
+                const float score = lctx.vocab.id_to_token[id].score;
+                const std::string text = lctx.vocab.id_to_token[id].tok;
+
+                snprintf(tmp, sizeof(tmp), "%s", text.c_str());
+
+                memcpy(&buf_vocab[offs], tmp, 32);               offs += 32;
+                memcpy(&buf_vocab[offs], &score, sizeof(score)); offs += sizeof(score);
+            }
+
+            struct ggml_init_params params;
+            params.mem_size   = ggml_tensor_overhead();
+            params.mem_buffer = NULL;
+            params.no_alloc   = true;
+
+            ggml_context * ctx_vocab = ggml_init(params);
+
+            struct ggml_tensor * t_vocab = ggml_new_tensor_1d(ctx_vocab, GGML_TYPE_I8, buf_vocab.size());
+            t_vocab->data = buf_vocab.data();
+            ggml_set_name(t_vocab, "vocab");
+
+            gf.leafs[gf.n_leafs++] = t_vocab;
+
+            ggml_graph_export(&gf, cgraph_fname);
+
+            ggml_free(ctx_vocab);
+        }
     }
 
     float * logits = (float *) ggml_get_data(inpL);
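
For reference, the "vocab" leaf serialized by the llama.cpp hunk is a flat byte buffer:
an int32 token count (native byte order), followed by n_vocab fixed-size records of a
32-byte null-terminated (truncated) token string and a 4-byte float score, for
sizeof(int32_t) + n_vocab*(32 + sizeof(float)) bytes in total. The reader added to
examples/mtl/mtl.cpp prints only the first 512 records as a sanity check; below is a
minimal sketch of a full reader under that assumed layout (the helper name
read_vocab_blob is illustrative, not part of ggml or this patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Walk the raw bytes of the "vocab" leaf: an int32 count, then one
    // { char text[32]; float score; } record per token.
    static int32_t read_vocab_blob(const char * ptr) {
        int32_t n_vocab = 0;
        memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab);

        for (int32_t i = 0; i < n_vocab; ++i) {
            char  text[32];
            float score;

            memcpy(text, ptr, sizeof(text));    ptr += sizeof(text);
            memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score);

            // %.*s caps the print at 32 bytes even if the NUL terminator is missing
            printf("token[%6d] = %.*s, score = %6.2f\n", i, (int) sizeof(text), text, score);
        }

        return n_vocab;
    }

Design note: the ggml context in the llama.cpp hunk is created with no_alloc = true and
only ggml_tensor_overhead() bytes, so ggml_new_tensor_1d allocates just the tensor
header; t_vocab->data is then pointed at the externally owned buf_vocab, so the export
can write the vocabulary bytes to the cgraph file without an extra copy inside ggml.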