mtl : clean-up ggml mtl interface + support scratch / inplace

parent 18e482a89c
commit e4b522232c

4 changed files with 174 additions and 223 deletions
@@ -51,23 +51,22 @@ int main(int argc, char ** argv) {
     }

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = ggml_mtl_init(
-            ggml_get_mem_buffer(ctx_data),
-            ggml_get_mem_size  (ctx_data),
-            ggml_get_mem_buffer(ctx_eval),
-            ggml_get_mem_size  (ctx_eval),
-            NULL, 0, // cache
-            32*n_vocab*sizeof(float));
+    auto * ctx_mtl = ggml_mtl_init();
+
+    ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
+    ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

     // TODO: tmp to match the input used when creating the cgraph
     {
-        const int n_batch = 1;
-        const int n_past  = 512 - n_batch;
-
-        const std::vector<int> tmp(n_batch, 1); // BOS
+        const std::vector<int> tmp(1, 1); // BOS
+
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
+        memcpy(input->data, tmp.data(), tmp.size() * sizeof(int));
+
+        ggml_mtl_set_tensor(ctx_mtl, input);

         // warmup
-        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf);

         const int n_iter = 16;

@@ -75,7 +74,7 @@ int main(int argc, char ** argv) {

         // the actual inference happens here
         for (int i = 0; i < n_iter; ++i) {
-            ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+            ggml_mtl_graph_compute(ctx_mtl, &gf);
         }

         const int64_t t1 = ggml_time_us();

@@ -83,6 +82,31 @@ int main(int argc, char ** argv) {
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }

+    // debug output
+    {
+        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
+        ggml_mtl_get_tensor(ctx_mtl, logits);
+
+        float * ptr = (float *) ggml_get_data(logits);
+
+        printf("logits: ");
+        for (int i = 0; i < 10; i++) {
+            printf("%8.4f ", ptr[i]);
+        }
+        printf("\n");
+        int imax = 0;
+        double sum = 0.0;
+        double vmax = -1e9;
+        for (int i = 0; i < 32000; i++) {
+            sum += (double) ptr[i];
+            if (ptr[i] > vmax) {
+                vmax = ptr[i];
+                imax = i;
+            }
+        }
+        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
+    }
+
     ggml_mtl_free(ctx_mtl);

     ggml_free(ctx_data);
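
Taken together, the example program now drives the Metal backend through a handful of calls. Below is a condensed sketch of the new sequence assembled from the hunks above (same identifiers as in the diff; model loading, graph construction and error handling omitted):

    // sketch: the new ggml_mtl usage pattern from the example above
    struct ggml_mtl_context * ctx_mtl = ggml_mtl_init();

    // register the host memory regions that back the graph's tensors
    ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
    ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

    // upload the input, run the graph, read back the result
    struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
    memcpy(input->data, tmp.data(), tmp.size() * sizeof(int));
    ggml_mtl_set_tensor(ctx_mtl, input);

    ggml_mtl_graph_compute(ctx_mtl, &gf);

    struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
    ggml_mtl_get_tensor(ctx_mtl, logits);

    ggml_mtl_free(ctx_mtl);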

ggml-mtl.h (34 changed lines)

@@ -2,7 +2,9 @@

 #include <stddef.h>

-struct ggml_context;
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
 struct ggml_cgraph;

 #ifdef __cplusplus

@@ -11,24 +13,30 @@ extern "C" {

 struct ggml_mtl_context;

-struct ggml_mtl_context * ggml_mtl_init(
-        void   * data_buf,
-        size_t   data_size,
-        void   * eval_buf,
-        size_t   eval_size,
-        void   * cach_buf,
-        size_t   cach_size,
-        size_t   outp_size);
+struct ggml_mtl_context * ggml_mtl_init(void);

 void ggml_mtl_free(struct ggml_mtl_context * ctx);

+void ggml_mtl_add_buffer(
+        struct ggml_mtl_context * ctx,
+        const char * name,
+        void * data,
+        size_t size);
+
+// set data from host memory into the device
+void ggml_mtl_set_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_mtl_get_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t);
+
 // return 0 on success
 int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf,
-        const int * tokens,
-        int n_tokens,
-        int n_past);
+        struct ggml_cgraph * gf);

 #ifdef __cplusplus
 }
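
The header change above is the core of the clean-up: instead of a single init call that hard-codes data/eval/cache/output buffers, the context now starts empty and callers register up to GGML_METAL_MAX_BUFFERS named host regions. The llama.cpp hunks at the end of this commit use it roughly like this (condensed from the diff; the mmap branch passes the mapped file region as "data" instead):

    // sketch: buffer registration as done in llama_init_from_file (non-mmap branch)
    ctx->mtl_ctx = ggml_mtl_init();

    ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
    ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr,       ctx->buf_compute.size);
    ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
    ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr,    ctx->buf_scratch[0].size);
    ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr,    ctx->buf_scratch[1].size);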

ggml-mtl.m (242 changed lines)

@@ -13,26 +13,24 @@
 #endif
 //#define mtl_printf(...)

-struct ggml_mtl_context {
-    void   * data_buf;
-    size_t   data_size;
-    void   * eval_buf;
-    size_t   eval_size;
-    void   * cach_buf;
-    size_t   cach_size;
-    size_t   outp_size;
+struct ggml_mtl_buffer {
+    const char * name;
+
+    void   * data;
+    size_t   size;
+
+    id<MTLBuffer> mtl;
+};

+struct ggml_mtl_context {
     float * logits;

     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;

-    id<MTLBuffer> buffer_data;
-    id<MTLBuffer> buffer_eval;
-    id<MTLBuffer> buffer_cach;
-
-    id<MTLBuffer> out;
+    int n_buffers;
+    struct ggml_mtl_buffer buffers[GGML_METAL_MAX_BUFFERS];

     // custom kernels
     id<MTLFunction> function_add;

@@ -87,25 +85,11 @@ struct ggml_mtl_context {
 // for now it is easier to work in a separate file
 NSString * const msl_library_source = @"see mtl.metal";

-struct ggml_mtl_context * ggml_mtl_init(
-        void   * data_buf,
-        size_t   data_size,
-        void   * eval_buf,
-        size_t   eval_size,
-        void   * cach_buf,
-        size_t   cach_size,
-        size_t   outp_size) {
+struct ggml_mtl_context * ggml_mtl_init(void) {
     fprintf(stderr, "%s: allocating\n", __func__);

     struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));

-    ctx->data_buf  = data_buf;
-    ctx->data_size = data_size;
-    ctx->eval_buf  = eval_buf;
-    ctx->eval_size = eval_size;
-    ctx->cach_buf  = cach_buf;
-    ctx->cach_size = cach_size;
-
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];

@@ -216,51 +200,6 @@ struct ggml_mtl_context * ggml_mtl_init(
         fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32);
     }

-    // MTLBuffer approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = data_buf;
-        const size_t mem_size   = data_size;
-
-        //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
-        ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated data buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this buffer will be used for the intermediate results of the evaluation
-    {
-        const void * mem_buffer = eval_buf;
-        const size_t mem_size   = eval_size;
-
-        ctx->buffer_eval = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated eval buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    if (cach_buf) {
-        const void * mem_buffer = cach_buf;
-        const size_t mem_size   = cach_size;
-
-        ctx->buffer_cach = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated cach buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    // allocate buffer for result extraction
-    {
-        const size_t mem_size = outp_size;
-
-        ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated out buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
     return ctx;
 }

@@ -271,81 +210,80 @@ void ggml_mtl_free(struct ggml_mtl_context * ctx) {
 }

 // get data / eval buffer + offset
-id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
-    const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;
-
-    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
-
-    //const size_t t_size = ggml_nbytes(t);
-
-    id<MTLBuffer> result;
-    size_t t_offs = 0;
-
-    if ( offs_data > 0 &&
-        (offs_eval < 0 || (offs_data < offs_eval)) &&
-        (offs_cach < 0 || (offs_data < offs_cach))
-        ) {
-        result = ctx->buffer_data;
-        t_offs = offs_data;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-    }
-
-    if ( offs_eval > 0 &&
-        (offs_data < 0 || (offs_eval < offs_data)) &&
-        (offs_cach < 0 || (offs_eval < offs_cach))
-        ) {
-        result = ctx->buffer_eval;
-        t_offs = offs_eval;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-    }
-
-    if ( offs_cach > 0 &&
-        (offs_data < 0 || (offs_cach < offs_data)) &&
-        (offs_eval < 0 || (offs_cach < offs_eval))
-        ) {
-        result = ctx->buffer_cach;
-        t_offs = offs_cach;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-    }
-
-    if (result == nil || (t_offs > ctx->data_size && t_offs > ctx->eval_size && t_offs > ctx->cach_size)) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
-
-    if (offs != 0) {
-        *offs = t_offs;
-    }
-
-    return result;
-}
+static id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+
+        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+            *offs = (size_t) ioffs;
+
+            //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+
+            return ctx->buffers[i].mtl;
+        }
+    }
+
+    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+    GGML_ASSERT(false);
+
+    return nil;
+}
+
+void ggml_mtl_add_buffer(
+        struct ggml_mtl_context * ctx,
+        const char * name,
+        void * data,
+        size_t size) {
+    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
+        fprintf(stderr, "%s: too many buffers\n", __func__);
+        return;
+    }
+
+    if (data) {
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+        ctx->buffers[ctx->n_buffers].mtl  = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
+
+        ++ctx->n_buffers;
+
+        fprintf(stderr, "%s: allocated '%16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
+    }
+}
+
+void ggml_mtl_set_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t) {
+    mtl_printf("%s: set input for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, t, &offs);
+
+    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
+}
+
+void ggml_mtl_get_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t) {
+    mtl_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, t, &offs);
+
+    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
+}

 int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf,
-        const int * tokens,
-        int n_tokens,
-        int n_past) {
-    mtl_printf("%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past);
-
-    struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
-    memcpy(input->data, tokens, n_tokens * sizeof(int));
+        struct ggml_cgraph * gf) {
+    mtl_printf("%s: evaluating graph\n", __func__);

     size_t offs_src0 = 0;
     size_t offs_src1 = 0;
     size_t offs_dst  = 0;

-    // copy the input data to the GPU
-    {
-        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
-
-        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);
-
-        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
-    }
-
     id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
     id<MTLComputeCommandEncoder> encoder = nil;

@@ -521,6 +459,8 @@ int ggml_mtl_graph_compute(
                 encoder = [command_buffer computeCommandEncoder];
             }

+            const int n_past = ((int32_t *)(src1->data))[0];
+
             [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

@@ -690,6 +630,8 @@ int ggml_mtl_graph_compute(
             //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3);
             //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode);

+            const int n_past = ((int32_t *)(src1->data))[0];
+
             [encoder setComputePipelineState:ctx->pipeline_rope];
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

@@ -769,23 +711,9 @@ int ggml_mtl_graph_compute(
         }
     }

-    // extract results from the GPU
-    {
-        mtl_printf("%s: extract results from the GPU\n", __func__);
-
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
-
-        struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
-
-        id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
-        id<MTLBuffer> id_dst = ctx->out;
-
-        id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
-        [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)];
-        [encoder_blit endEncoding];
+    if (encoder != nil) {
+        [encoder endEncoding];
+        encoder = nil;
     }

     [command_buffer commit];

@@ -796,31 +724,5 @@ int ggml_mtl_graph_compute(
         mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
     }

-    ctx->logits = ctx->out.contents;
-
-    const float * logits = ctx->logits;
-
-    struct ggml_tensor * t = gf->nodes[gf->n_nodes - 1];
-    memcpy(t->data, logits, ggml_nbytes(t));
-
-#if 1
-    mtl_printf("logits: ");
-    for (int i = 0; i < 100; i++) {
-        mtl_printf("%8.4f ", logits[i]);
-    }
-    mtl_printf("\n");
-    double sum = 0.0;
-    int imax = 0;
-    double vmax = -INFINITY;
-    for (int i = 0; i < 32000; i++) {
-        sum += (double) logits[i];
-        if (logits[i] > vmax) {
-            vmax = logits[i];
-            imax = i;
-        }
-    }
-    mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
-#endif
-
     return 0;
 }
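
The implementation above relies on one invariant: every tensor evaluated on the GPU must have its data pointer inside one of the registered host regions, and because each region is wrapped in a shared-storage MTLBuffer over the same bytes, the host offset doubles as the Metal buffer offset. ggml_mtl_set_tensor / ggml_mtl_get_tensor are then just a memcpy through the buffer contents at that offset, in the host-to-device and device-to-host directions. A standalone sketch of the lookup, with hypothetical names (host_buffer, resolve_buffer) used only for illustration:

    #include <cstddef>
    #include <cstdint>

    // sketch of the pointer -> (buffer, offset) resolution behind ggml_mtl_get_buffer
    struct host_buffer {
        const char * name;
        void       * data; // start of the registered host region
        size_t       size; // size of the region in bytes
    };

    // returns the index of the region containing p and writes the offset into *offs,
    // or -1 if the pointer is not covered by any registered region
    static int resolve_buffer(const host_buffer * bufs, int n, const void * p, size_t * offs) {
        for (int i = 0; i < n; ++i) {
            const int64_t ioffs = (const char *) p - (const char *) bufs[i].data;
            if (ioffs >= 0 && ioffs < (int64_t) bufs[i].size) {
                *offs = (size_t) ioffs;
                return i;
            }
        }
        return -1;
    }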

llama.cpp (73 changed lines)

@@ -1085,7 +1085,7 @@ static void llama_model_load_internal(
             mmapped_size - vram_total + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at (model.type);

         // this is the memory required by one llama_state
         const size_t mem_required_state =

@@ -1255,14 +1255,19 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

+#ifdef GGML_USE_METAL
+    if (lctx.mtl_ctx) {
+        ggml_mtl_set_tensor(lctx.mtl_ctx, embd);
+    }
+#endif
+
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;

-        struct ggml_tensor * cur;
-
-        //lctx.use_buf(ctx0, 0);
+        lctx.use_buf(ctx0, 0);

         // norm
         {

@@ -1378,7 +1383,7 @@ static bool llama_eval_internal(
                 cur);
         }

-        //lctx.use_buf(ctx0, 1);
+        lctx.use_buf(ctx0, 1);

         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);

@@ -1416,36 +1421,36 @@ static bool llama_eval_internal(
         inpL = cur;
     }

-    //lctx.use_buf(ctx0, 0);
+    lctx.use_buf(ctx0, 0);

     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;

     // norm
     {
-        inpL = ggml_rms_norm(ctx0, inpL);
-
-        // inpL = inpL*norm(broadcasted)
-        inpL = ggml_mul(ctx0, inpL, model.norm);
-
-        embeddings = inpL;
+        cur = ggml_rms_norm(ctx0, inpL);
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
+
+        embeddings = cur;
     }

     // lm_head
-    inpL = ggml_mul_mat(ctx0, model.output, inpL);
+    cur = ggml_mul_mat(ctx0, model.output, cur);

-    //lctx.use_buf(ctx0, -1);
+    lctx.use_buf(ctx0, -1);

     // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
+    //cur = ggml_soft_max_inplace(ctx0, cur);

     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
+    ggml_build_forward_expand(&gf, cur);

 #ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf);
+        ggml_mtl_get_tensor(lctx.mtl_ctx, cur);
     } else {
         ggml_graph_compute(ctx0, &gf);
     }

@@ -1498,7 +1503,7 @@ static bool llama_eval_internal(
         ggml_free(ctx_vocab);
     }

-    float * logits = (float *) ggml_get_data(inpL);
+    float * logits = (float *) ggml_get_data(cur);

     printf("logits: ");
     for (int i = 0; i < 10; i++) {

@@ -1530,7 +1535,7 @@ static bool llama_eval_internal(
     //}

     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);

     // update kv token count
     lctx.model.kv_self.n = n_past + N;

@@ -1541,11 +1546,11 @@ static bool llama_eval_internal(

         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }

@@ -2374,7 +2379,12 @@ struct llama_context * llama_init_from_file(
             ctx->embedding.resize(hparams.n_embd);
         }

+#ifdef GGML_USE_METAL
+        // when using Metal, we don't need the extra buffer for intermediate dequantization
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)/100);
+#else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+#endif

         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));

@@ -2383,14 +2393,21 @@ struct llama_context * llama_init_from_file(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->mtl_ctx = ggml_mtl_init(
-                ggml_get_mem_buffer(ctx->model.ctx),
-                ggml_get_mem_size  (ctx->model.ctx),
-                ctx->buf_compute.addr,
-                ctx->buf_compute.size,
-                ctx->model.kv_self.buf.addr,
-                ctx->model.kv_self.buf.size,
-                32*ctx->model.hparams.n_vocab*sizeof(float));
+        if (params.use_mmap) {
+            ctx->mtl_ctx = ggml_mtl_init();
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+        } else {
+            ctx->mtl_ctx = ggml_mtl_init();
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+        }
     }
 #endif
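
The re-enabled lctx.use_buf(ctx0, 0) / use_buf(ctx0, 1) calls are the "scratch / inplace" half of the commit title: per-layer intermediates are allocated alternately from the two scratch regions registered above as "scr0" and "scr1", and use_buf(ctx0, -1) switches back to regular allocations for the final logits. A rough sketch of what such a helper does, assuming llama.cpp's use_buf wraps ggml_set_scratch (simplified; the real helper also tracks each buffer's high-water mark):

    // sketch: scratch-buffer switching in the spirit of llama_context::use_buf
    // (assumes buf_scratch[] entries expose .addr and .size, as in the diff)
    void use_buf(struct ggml_context * ctx, int i) {
        if (i >= 0) {
            // route subsequent tensor allocations into scratch buffer i
            ggml_set_scratch(ctx, { 0, buf_scratch[i].size, buf_scratch[i].addr });
        } else {
            // i == -1: return to normal allocations (e.g. for the logits tensor)
            ggml_set_scratch(ctx, { 0, 0, nullptr });
        }
    }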