diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index 382976667..e527f2856 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -51,23 +51,22 @@ int main(int argc, char ** argv) {
     }
 
     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = ggml_mtl_init(
-        ggml_get_mem_buffer(ctx_data),
-        ggml_get_mem_size  (ctx_data),
-        ggml_get_mem_buffer(ctx_eval),
-        ggml_get_mem_size  (ctx_eval),
-        NULL, 0, // cache
-        32*n_vocab*sizeof(float));
+    auto * ctx_mtl = ggml_mtl_init();
+
+    ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
+    ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
 
     // TODO: tmp to match the input used when creating the cgraph
     {
-        const int n_batch = 1;
-        const int n_past  = 512 - n_batch;
+        const std::vector<int> tmp(1, 1); // BOS
 
-        const std::vector<int> tmp(n_batch, 1); // BOS
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
+        memcpy(input->data, tmp.data(), tmp.size() * sizeof(int));
+
+        ggml_mtl_set_tensor(ctx_mtl, input);
 
         // warmup
-        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf);
 
         const int n_iter = 16;
 
@@ -75,7 +74,7 @@ int main(int argc, char ** argv) {
 
         // the actual inference happens here
         for (int i = 0; i < n_iter; ++i) {
-            ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+            ggml_mtl_graph_compute(ctx_mtl, &gf);
         }
 
         const int64_t t1 = ggml_time_us();
@@ -83,6 +82,31 @@
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }
 
+    // debug output
+    {
+        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
+        ggml_mtl_get_tensor(ctx_mtl, logits);
+
+        float * ptr = (float *) ggml_get_data(logits);
+
+        printf("logits: ");
+        for (int i = 0; i < 10; i++) {
+            printf("%8.4f ", ptr[i]);
+        }
+        printf("\n");
+        int imax = 0;
+        double sum = 0.0;
+        double vmax = -1e9;
+        for (int i = 0; i < 32000; i++) {
+            sum += (double) ptr[i];
+            if (ptr[i] > vmax) {
+                vmax = ptr[i];
+                imax = i;
+            }
+        }
+        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
+    }
+
     ggml_mtl_free(ctx_mtl);
 
     ggml_free(ctx_data);
diff --git a/ggml-mtl.h b/ggml-mtl.h
index 15256b27d..cab71e386 100644
--- a/ggml-mtl.h
+++ b/ggml-mtl.h
@@ -2,7 +2,9 @@
 
 #include <stddef.h>
 
-struct ggml_context;
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
 struct ggml_cgraph;
 
 #ifdef __cplusplus
@@ -11,24 +13,30 @@ extern "C" {
 
 struct ggml_mtl_context;
 
-struct ggml_mtl_context * ggml_mtl_init(
-        void * data_buf,
-        size_t data_size,
-        void * eval_buf,
-        size_t eval_size,
-        void * cach_buf,
-        size_t cach_size,
-        size_t outp_size);
+struct ggml_mtl_context * ggml_mtl_init(void);
 
 void ggml_mtl_free(struct ggml_mtl_context * ctx);
 
+void ggml_mtl_add_buffer(
+        struct ggml_mtl_context * ctx,
+        const char * name,
+        void * data,
+        size_t size);
+
+// set data from host memory into the device
+void ggml_mtl_set_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_mtl_get_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t);
+
 // return 0 on success
 int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf,
-        const int * tokens,
-        int n_tokens,
-        int n_past);
+        struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
diff --git a/ggml-mtl.m b/ggml-mtl.m
index ecbb1a188..8f831afe7 100644
--- a/ggml-mtl.m
+++ b/ggml-mtl.m
@@ -13,26 +13,24 @@
 #endif
 
 //#define mtl_printf(...)
 
-struct ggml_mtl_context {
-    void * data_buf;
-    size_t data_size;
-    void * eval_buf;
-    size_t eval_size;
-    void * cach_buf;
-    size_t cach_size;
-    size_t outp_size;
+struct ggml_mtl_buffer {
+    const char * name;
+    void * data;
+    size_t size;
+
+    id<MTLBuffer> mtl;
+};
+
+struct ggml_mtl_context {
     float * logits;
 
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;
 
-    id<MTLBuffer> buffer_data;
-    id<MTLBuffer> buffer_eval;
-    id<MTLBuffer> buffer_cach;
-
-    id<MTLBuffer> out;
+    int n_buffers;
+    struct ggml_mtl_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
     // custom kernels
     id<MTLFunction> function_add;
@@ -87,25 +85,11 @@ struct ggml_mtl_context {
 // for now it is easier to work in a separate file
 NSString * const msl_library_source = @"see mtl.metal";
 
-struct ggml_mtl_context * ggml_mtl_init(
-        void * data_buf,
-        size_t data_size,
-        void * eval_buf,
-        size_t eval_size,
-        void * cach_buf,
-        size_t cach_size,
-        size_t outp_size) {
+struct ggml_mtl_context * ggml_mtl_init(void) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));
 
-    ctx->data_buf  = data_buf;
-    ctx->data_size = data_size;
-    ctx->eval_buf  = eval_buf;
-    ctx->eval_size = eval_size;
-    ctx->cach_buf  = cach_buf;
-    ctx->cach_size = cach_size;
-
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];
@@ -216,51 +200,6 @@ struct ggml_mtl_context * ggml_mtl_init(
         fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32);
     }
 
-    // MTLBuffer approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = data_buf;
-        const size_t mem_size   = data_size;
-
-        //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
-        ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated data buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this buffer will be used for the intermediate results of the evaluation
-    {
-        const void * mem_buffer = eval_buf;
-        const size_t mem_size   = eval_size;
-
-        ctx->buffer_eval = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated eval buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    if (cach_buf) {
-        const void * mem_buffer = cach_buf;
-        const size_t mem_size   = cach_size;
-
-        ctx->buffer_cach = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated cach buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    // allocate buffer for result extraction
-    {
-        const size_t mem_size = outp_size;
-
-        ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated out buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
     return ctx;
 }
@@ -271,81 +210,80 @@ void ggml_mtl_free(struct ggml_mtl_context * ctx) {
 }
 
 // get data / eval buffer + offset
-id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
-    const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;
-
+static id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
-    //const size_t t_size = ggml_nbytes(t);
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
-    id<MTLBuffer> result;
-    size_t t_offs = 0;
+        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+            *offs = (size_t) ioffs;
 
-    if ( offs_data > 0 &&
-        (offs_eval < 0 || (offs_data < offs_eval)) &&
-        (offs_cach < 0 || (offs_data < offs_cach))
-        ) {
-        result = ctx->buffer_data;
-        t_offs = offs_data;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+            //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+
+            return ctx->buffers[i].mtl;
+        }
     }
 
-    if ( offs_eval > 0 &&
-        (offs_data < 0 || (offs_eval < offs_data)) &&
-        (offs_cach < 0 || (offs_eval < offs_cach))
-        ) {
-        result = ctx->buffer_eval;
-        t_offs = offs_eval;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+    GGML_ASSERT(false);
+
+    return nil;
+}
+
+void ggml_mtl_add_buffer(
+        struct ggml_mtl_context * ctx,
+        const char * name,
+        void * data,
+        size_t size) {
+    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
+        fprintf(stderr, "%s: too many buffers\n", __func__);
+        return;
     }
 
-    if ( offs_cach > 0 &&
-        (offs_data < 0 || (offs_cach < offs_data)) &&
-        (offs_eval < 0 || (offs_cach < offs_eval))
-        ) {
-        result = ctx->buffer_cach;
-        t_offs = offs_cach;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-    }
+    if (data) {
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+        ctx->buffers[ctx->n_buffers].mtl  = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
 
-    if (result == nil || (t_offs > ctx->data_size && t_offs > ctx->eval_size && t_offs > ctx->cach_size)) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
+        ++ctx->n_buffers;
 
-    if (offs != 0) {
-        *offs = t_offs;
+        fprintf(stderr, "%s: allocated '%16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
     }
+}
 
-    return result;
+void ggml_mtl_set_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t) {
+    mtl_printf("%s: set input for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, t, &offs);
+
+    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
+}
+
+void ggml_mtl_get_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t) {
+    mtl_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, t, &offs);
+
+    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
 }
 
 int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf,
-        const int * tokens,
-        int n_tokens,
-        int n_past) {
-    mtl_printf("%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past);
-
-    struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
-    memcpy(input->data, tokens, n_tokens * sizeof(int));
+        struct ggml_cgraph * gf) {
+    mtl_printf("%s: evaluating graph\n", __func__);
 
     size_t offs_src0 = 0;
     size_t offs_src1 = 0;
     size_t offs_dst  = 0;
 
-    // copy the input data to the GPU
-    {
-        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
-
-        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);
-
-        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
-    }
-
     id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
     id<MTLComputeCommandEncoder> encoder = nil;
@@ -521,6 +459,8 @@ int ggml_mtl_graph_compute(
                             encoder = [command_buffer computeCommandEncoder];
                         }
 
+                        const int n_past = ((int32_t *)(src1->data))[0];
+
                         [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -690,6 +630,8 @@ int ggml_mtl_graph_compute(
                         //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3);
                         //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode);
 
+                        const int n_past = ((int32_t *)(src1->data))[0];
+
                         [encoder setComputePipelineState:ctx->pipeline_rope];
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -769,23 +711,9 @@ int ggml_mtl_graph_compute(
         }
     }
 
-    // extract results from the GPU
-    {
-        mtl_printf("%s: extract results from the GPU\n", __func__);
-
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
-
-        struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
-
-        id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
-        id<MTLBuffer> id_dst = ctx->out;
-
-        id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
-        [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)];
-        [encoder_blit endEncoding];
+    if (encoder != nil) {
+        [encoder endEncoding];
+        encoder = nil;
     }
 
     [command_buffer commit];
@@ -796,31 +724,5 @@ int ggml_mtl_graph_compute(
         mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
     }
 
-    ctx->logits = ctx->out.contents;
-
-    const float * logits = ctx->logits;
-
-    struct ggml_tensor * t = gf->nodes[gf->n_nodes - 1];
-    memcpy(t->data, logits, ggml_nbytes(t));
-
-#if 1
-    mtl_printf("logits: ");
-    for (int i = 0; i < 100; i++) {
-        mtl_printf("%8.4f ", logits[i]);
-    }
-    mtl_printf("\n");
-    double sum = 0.0;
-    int imax = 0;
-    double vmax = -INFINITY;
-    for (int i = 0; i < 32000; i++) {
-        sum += (double) logits[i];
-        if (logits[i] > vmax) {
-            vmax = logits[i];
-            imax = i;
-        }
-    }
-    mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
-#endif
-
     return 0;
 }
diff --git a/llama.cpp b/llama.cpp
index 5cd39b612..26722e091 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1085,7 +1085,7 @@ static void llama_model_load_internal(
             mmapped_size - vram_total + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at (model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1255,14 +1255,19 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
+#ifdef GGML_USE_METAL
+    if (lctx.mtl_ctx) {
+        ggml_mtl_set_tensor(lctx.mtl_ctx, embd);
+    }
+#endif
+
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
 
-        struct ggml_tensor * cur;
-
-        //lctx.use_buf(ctx0, 0);
+        lctx.use_buf(ctx0, 0);
 
         // norm
         {
@@ -1378,7 +1383,7 @@
                     cur);
         }
 
-        //lctx.use_buf(ctx0, 1);
+        lctx.use_buf(ctx0, 1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
 
@@ -1416,36 +1421,36 @@
         inpL = cur;
     }
 
-    //lctx.use_buf(ctx0, 0);
+    lctx.use_buf(ctx0, 0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
     // norm
     {
+        cur = ggml_rms_norm(ctx0, inpL);
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
 
-        // inpL = inpL*norm(broadcasted)
-        inpL = ggml_mul(ctx0, inpL, model.norm);
-
-        embeddings = inpL;
+        embeddings = cur;
     }
 
     // lm_head
-    inpL = ggml_mul_mat(ctx0, model.output, inpL);
+    cur = ggml_mul_mat(ctx0, model.output, cur);
 
-    //lctx.use_buf(ctx0, -1);
+    lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
+    //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
+    ggml_build_forward_expand(&gf, cur);
 
 #ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf);
+        ggml_mtl_get_tensor(lctx.mtl_ctx, cur);
     } else {
         ggml_graph_compute(ctx0, &gf);
     }
@@ -1498,7 +1503,7 @@ static bool llama_eval_internal(
         ggml_free(ctx_vocab);
     }
 
-    float * logits = (float *) ggml_get_data(inpL);
+    float * logits = (float *) ggml_get_data(cur);
 
     printf("logits: ");
     for (int i = 0; i < 10; i++) {
@@ -1530,7 +1535,7 @@ static bool llama_eval_internal(
     //}
 
     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
     lctx.model.kv_self.n = n_past + N;
@@ -1541,11 +1546,11 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2374,7 +2379,12 @@ struct llama_context * llama_init_from_file(
         ctx->embedding.resize(hparams.n_embd);
     }
 
+#ifdef GGML_USE_METAL
+    // when using Metal, we don't need the extra buffer for intermediate dequantization
+    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)/100);
+#else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+#endif
 
     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
 
@@ -2383,14 +2393,21 @@ struct llama_context * llama_init_from_file(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->mtl_ctx = ggml_mtl_init(
-            ggml_get_mem_buffer(ctx->model.ctx),
-            ggml_get_mem_size  (ctx->model.ctx),
-            ctx->buf_compute.addr,
-            ctx->buf_compute.size,
-            ctx->model.kv_self.buf.addr,
-            ctx->model.kv_self.buf.size,
-            32*ctx->model.hparams.n_vocab*sizeof(float));
+        if (params.use_mmap) {
+            ctx->mtl_ctx = ggml_mtl_init();
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+        } else {
+            ctx->mtl_ctx = ggml_mtl_init();
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+        }
     }
 #endif