diff --git a/llama.cpp b/llama.cpp
index bc58ad960..b26931f8b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
     MODEL_65B,
 };
 
-
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -1261,12 +1260,6 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
-#ifdef GGML_USE_METAL
-    if (lctx.ctx_metal && N == 1) {
-        ggml_metal_set_tensor(lctx.ctx_metal, embd);
-    }
-#endif
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
@@ -1456,12 +1449,6 @@ static bool llama_eval_internal(
         // But for now, we have focused only on Matrix x Vector Metal multiplication.
         //
         ggml_graph_compute(ctx0, &gf);
-
-        if (lctx.ctx_metal) {
-            // We need to sync the CPU KV cache with the GPU KV cache
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
-        }
     }
 #else
     ggml_graph_compute(ctx0, &gf);
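
For context, a minimal sketch (not part of the patch) of the sync step the last hunk removes: after the fallback CPU ggml_graph_compute(), the host-side KV cache tensors were copied back into the Metal buffers. The free-standing helper and its name are hypothetical; the ggml_metal_set_tensor() calls and their arguments mirror the removed lines.

#ifdef GGML_USE_METAL
#include "ggml-metal.h"

// Hypothetical helper, for illustration only: ctx_metal corresponds to
// lctx.ctx_metal and k/v to kv_self.k / kv_self.v in the removed code.
static void sync_kv_cache_to_metal(
        struct ggml_metal_context * ctx_metal,
        struct ggml_tensor        * k,
        struct ggml_tensor        * v) {
    if (ctx_metal) {
        // Copy the CPU-side KV cache data into the corresponding Metal
        // buffers so the GPU path sees updates made by the CPU path.
        ggml_metal_set_tensor(ctx_metal, k);
        ggml_metal_set_tensor(ctx_metal, v);
    }
}
#endif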