From db3db9e7749c4b7681c96272c87fdbf6b1e235e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 4 Jun 2023 18:19:08 +0300 Subject: [PATCH] metal : clean-up stuff, fix typos --- examples/metal/metal.cpp | 2 +- ggml.c | 4 +--- llama.cpp | 37 ++++++++++++------------------------- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index 10b35faf8..77aca94a3 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -1,4 +1,4 @@ -// Evaluate a statically export ggml computation graph with Metal +// Evaluate a statically exported ggml computation graph with Metal // // - First, export a LLaMA graph: // diff --git a/ggml.c b/ggml.c index 27a9de2be..42e8626df 100644 --- a/ggml.c +++ b/ggml.c @@ -15049,7 +15049,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** // create the tensor // "view" operations are handled differently - // TODO: handle inplac ops - currentl a copy is always made + // TODO: handle inplace ops - currently a copy is always made struct ggml_tensor * tensor = NULL; @@ -15084,10 +15084,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** } break; } - memcpy(tensor->name, ptr_name, GGML_MAX_NAME); - // TODO: double-check this is needed for (int j = 0; j < GGML_MAX_DIMS; ++j) { tensor->nb[j] = nb[j]; } diff --git a/llama.cpp b/llama.cpp index 471b996aa..9efc60cdb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1201,7 +1201,7 @@ static bool llama_model_load( // - tokens: new batch of tokens to process // - n_past: the context size so far // - n_threads: number of threads to use -// - cgraph_fname: filename of the exported computation graph (TODO: TMP!!!) +// - cgraph_fname: filename of the exported computation graph // static bool llama_eval_internal( llama_context & lctx, @@ -1256,7 +1256,7 @@ static bool llama_eval_internal( memcpy(embd->data, tokens, N*ggml_element_size(embd)); #ifdef GGML_USE_METAL - if (lctx.ctx_metal) { + if (lctx.ctx_metal && N == 1) { ggml_metal_set_tensor(lctx.ctx_metal, embd); } #endif @@ -1279,18 +1279,10 @@ static bool llama_eval_internal( // self-attention { - //auto * x = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - //struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, x, n_embd/n_head, n_head, N), n_past, n_rot, 0); - // compute Q and K and RoPE them - struct ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N); - struct ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N); - ggml_set_name(Qpre, "Qpre"); - ggml_set_name(Kpre, "Kpre"); - - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, Qpre, n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, Kpre, n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); ggml_set_name(Qcur, "Qcur"); ggml_set_name(Kcur, "Kcur"); @@ -1305,9 +1297,6 @@ static bool llama_eval_internal( ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - ggml_set_name(k, "k"); - ggml_set_name(v, "v"); - // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); @@ -2341,21 +2330,19 @@ struct llama_context * llama_init_from_file( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers + ctx->ctx_metal = ggml_metal_init(); + if (params.use_mmap) { - ctx->ctx_metal = ggml_metal_init(); - ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size); - ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); - ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); + ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size); + ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); } else { - ctx->ctx_metal = ggml_metal_init(); ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx)); ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size); - ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); - ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); } + + ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size); + ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size); } #endif