From 640a8896329f73af36e7726c8ac0f6f2fca6a721 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 2 Jun 2023 21:00:30 +0300
Subject: [PATCH] mtl : add save/load vocab to ggml file

---
 examples/mtl/mtl.cpp | 25 +++++++++++++++++++++++++
 examples/mtl/mtl.m   | 14 --------------
 llama.cpp            | 44 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index b7b84cecf..ff1c1f685 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -24,6 +24,31 @@ int main(int argc, char ** argv) {
     struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
     gf.n_threads = 1;
 
+    {
+        struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab");
+        if (t_vocab == NULL) {
+            fprintf(stderr, "%s: vocab tensor not found\n", __func__);
+            return -1;
+        }
+
+        const char * ptr = (const char *) t_vocab->data;
+
+        int32_t n_vocab = 0;
+        memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab);
+
+        printf("%s: n_vocab = %d\n", __func__, n_vocab);
+
+        for (int i = 0; i < 512; ++i) {
+            char text[32];
+            float score;
+
+            memcpy(text, ptr, sizeof(text));    ptr += sizeof(text);
+            memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score);
+
+            printf("%s: token[%4d] = %16.*s, score = %6.2f\n", __func__, i, (int) sizeof(text), text, score);
+        }
+    }
+
     // allocate work context
     static size_t buf_size = gf.work_size; // TODO
     static void * buf = malloc(buf_size);
diff --git a/examples/mtl/mtl.m b/examples/mtl/mtl.m
index 89ed45c01..e4839626e 100644
--- a/examples/mtl/mtl.m
+++ b/examples/mtl/mtl.m
@@ -108,20 +108,6 @@ struct ggml_mtl_context * llama_mtl_init(
             exit(1);
         }
     }
-#elif 0
-    // this does not work !?!?!
-
-    // load library from "mtl.metallib"
-    {
-        NSError * error = nil;
-
-        NSString * path = [[NSBundle mainBundle] pathForResource:@"./mtl" ofType:@"metallib"];
-        ctx->library = [ctx->device newLibraryWithFile:path error:&error];
-        if (error) {
-            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
-        }
-    }
 #else
     // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource
     {
diff --git a/llama.cpp b/llama.cpp
index c998a77fb..9a8bf9df7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1505,7 +1505,49 @@ static bool llama_eval_internal(
     //}
 
     if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
+        {
+            char tmp[32]; // max token length
+
+            // store null-terminated string for simplicity
+            std::vector<char> buf_vocab(sizeof(int32_t) + n_vocab*(32 + sizeof(float)));
+
+            uint64_t offs = 0;
+
+            {
+                const int32_t n = n_vocab;
+                memcpy(&buf_vocab[offs], &n, sizeof(n)); offs += sizeof(n);
+            }
+
+            for (int i = 0; i < n_vocab; i++) {
+                const int32_t id = i;
+
+                const float score = lctx.vocab.id_to_token[id].score;
+                const std::string text = lctx.vocab.id_to_token[id].tok;
+
+                snprintf(tmp, sizeof(tmp), "%s", text.c_str());
+
+                memcpy(&buf_vocab[offs], tmp, 32);               offs += 32;
+                memcpy(&buf_vocab[offs], &score, sizeof(score)); offs += sizeof(score);
+            }
+
+            struct ggml_init_params params;
+            params.mem_size   = ggml_tensor_overhead();
+            params.mem_buffer = NULL;
+            params.no_alloc   = true;
+
+            ggml_context * ctx_vocab = ggml_init(params);
+
+            struct ggml_tensor * t_vocab = ggml_new_tensor_1d(ctx_vocab, GGML_TYPE_I8, buf_vocab.size());
+            t_vocab->data = buf_vocab.data();
+            ggml_set_name(t_vocab, "vocab");
+
+            gf.leafs[gf.n_leafs++] = t_vocab;
+
+            ggml_graph_export(&gf, cgraph_fname);
+
+            ggml_free(ctx_vocab);
+        }
     }
 
     float * logits = (float *) ggml_get_data(inpL);
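
For reference, the "vocab" leaf serialized by the llama.cpp hunk is a flat byte buffer:
an int32 token count (native byte order), followed by n_vocab fixed-size records of a
32-byte null-terminated (truncated) token string and a 4-byte float score, for
sizeof(int32_t) + n_vocab*(32 + sizeof(float)) bytes in total. The reader added to
examples/mtl/mtl.cpp prints only the first 512 records as a sanity check; below is a
minimal sketch of a full reader under that assumed layout (the helper name
read_vocab_blob is illustrative, not part of ggml or this patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Walk the raw bytes of the "vocab" leaf: an int32 count, then one
    // { char text[32]; float score; } record per token.
    static int32_t read_vocab_blob(const char * ptr) {
        int32_t n_vocab = 0;
        memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab);

        for (int32_t i = 0; i < n_vocab; ++i) {
            char  text[32];
            float score;

            memcpy(text, ptr, sizeof(text));    ptr += sizeof(text);
            memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score);

            // %.*s caps the print at 32 bytes even if the NUL terminator is missing
            printf("token[%6d] = %.*s, score = %6.2f\n", i, (int) sizeof(text), text, score);
        }

        return n_vocab;
    }

Design note: the ggml context in the llama.cpp hunk is created with no_alloc = true and
only ggml_tensor_overhead() bytes, so ggml_new_tensor_1d allocates just the tensor
header; t_vocab->data is then pointed at the externally owned buf_vocab, so the export
can write the vocabulary bytes to the cgraph file without an extra copy inside ggml.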