From e26cd6b483c195a45c64e2f25c315269ada827a2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 4 Jun 2023 11:23:36 +0300
Subject: [PATCH] mtl : remove temp / debug code

---
 examples/mtl/mtl.cpp | 34 ++----------------------
 llama.cpp            | 63 +-------------------------------------------
 2 files changed, 3 insertions(+), 94 deletions(-)

diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index e527f2856..56510904c 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -5,8 +5,6 @@
 #include <cstdio>
 #include <cstring>
 
-#include <vector> // tmp
-
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -24,44 +22,16 @@ int main(int argc, char ** argv) {
     struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
     gf.n_threads = 1;
 
-    int32_t n_vocab = 0;
-
-    {
-        struct ggml_tensor * t_vocab = ggml_graph_get_tensor(&gf, "vocab");
-        if (t_vocab == NULL) {
-            fprintf(stderr, "%s: vocab tensor not found\n", __func__);
-            return -1;
-        }
-
-        const char * ptr = (const char *) t_vocab->data;
-
-        memcpy(&n_vocab, ptr, sizeof(n_vocab)); ptr += sizeof(n_vocab);
-
-        printf("%s: n_vocab = %d\n", __func__, n_vocab);
-
-        for (int i = 0; i < 512; ++i) {
-            char text[32];
-            float score;
-
-            memcpy(text, ptr, sizeof(text)); ptr += sizeof(text);
-            memcpy(&score, ptr, sizeof(score)); ptr += sizeof(score);
-
-            printf("%s: token[%4d] = %16.*s, score = %6.2f\n", __func__, i, (int) sizeof(text), text, score);
-        }
-    }
-
     // this allocates all Metal resources and memory buffers
     auto * ctx_mtl = ggml_mtl_init();
 
     ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
     ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
 
-    // TODO: tmp to match the input used when creating the cgraph
+    // main
     {
-        const std::vector<int> tmp(1, 1); // BOS
-
         struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
-        memcpy(input->data, tmp.data(), tmp.size() * sizeof(int));
+        *(int32_t *) input->data = 1; // BOS
 
         ggml_mtl_set_tensor(ctx_mtl, input);
 
diff --git a/llama.cpp b/llama.cpp
index 26722e091..455402a4e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1459,68 +1459,7 @@ static bool llama_eval_internal(
 #endif
 
     if (cgraph_fname) {
-        // TODO: tmp add the vocabulary as a leaf to the computation graph, until better approach is found
-        {
-            char tmp[32]; // max token length
-
-            // store null-terminated string for simplicity
-            std::vector<uint8_t> buf_vocab(sizeof(int32_t) + n_vocab*(32 + sizeof(float)));
-
-            uint64_t offs = 0;
-
-            {
-                const int32_t n = n_vocab;
-                memcpy(&buf_vocab[offs], &n, sizeof(n)); offs += sizeof(n);
-            }
-
-            for (int i = 0; i < n_vocab; i++) {
-                const int32_t id = i;
-
-                const float score = lctx.vocab.id_to_token[id].score;
-                const std::string text = lctx.vocab.id_to_token[id].tok;
-
-                snprintf(tmp, sizeof(tmp), "%s", text.c_str());
-
-                memcpy(&buf_vocab[offs], tmp, 32); offs += 32;
-                memcpy(&buf_vocab[offs], &score, sizeof(score)); offs += sizeof(score);
-            }
-
-            struct ggml_init_params params;
-            params.mem_size   = ggml_tensor_overhead();
-            params.mem_buffer = NULL;
-            params.no_alloc   = true;
-
-            ggml_context * ctx_vocab = ggml_init(params);
-
-            struct ggml_tensor * t_vocab = ggml_new_tensor_1d(ctx_vocab, GGML_TYPE_I8, buf_vocab.size());
-            t_vocab->data = buf_vocab.data();
-            ggml_set_name(t_vocab, "vocab");
-
-            gf.leafs[gf.n_leafs++] = t_vocab;
-
-            ggml_graph_export(&gf, cgraph_fname);
-
-            ggml_free(ctx_vocab);
-        }
-
-        float * logits = (float *) ggml_get_data(cur);
-
-        printf("logits: ");
-        for (int i = 0; i < 10; i++) {
-            printf("%8.4f ", logits[i]);
-        }
-        printf("\n");
-        double sum = 0.0;
-        int imax = 0;
-        double vmax = -INFINITY;
-        for (int i = 0; i < 32000; i++) {
-            sum += (double) logits[i];
-            if (logits[i] > vmax) {
-                vmax = logits[i];
-                imax = i;
-            }
-        }
-        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
+        ggml_graph_export(&gf, cgraph_fname);
     }
 
 #ifdef GGML_PERF