llama : Metal inference (#1642)

* mtl : export the LLaMA computation graph

* ci : disable temporarily

* mtl : adapt the MNIST example as starter

* mtl : no need for mtl-export tool, add cli arg for main instead

* mtl : export just a small part of the graph for now to make it easier

* mtl : move MSL code into separate file for easy editing

* mtl : initial get_rows_q4_0 kernel

* mtl : confirmed get_rows_q4_0 is working correctly

* mtl : add rms_norm kernel + confirm working

* mtl : add mul kernel + confirm working

* mtl : initial mul_mat Q4 kernel (wrong results)

* mtl : mul_mat fixes (still wrong)

* mtl : another mul_mat Q4 (still does not work)

* mtl : working mul_mat q4

* ggml : fix handling of "view" ops in ggml_graph_import()

* mtl : add rope kernel

* mtl : add reshape and transpose handling

* ggml : store offset as opt arg for ggml_view_xd() operators

* mtl : add cpy kernel + handle view ops

* mtl : confirm f16 x f32 attention mul mat

* mtl : add scale kernel

* mtl : add diag_mask_inf kernel

* mtl : fix soft_max kernel

* ggml : update ggml_nbytes() to handle non-contiguous tensors

* mtl : verify V tensor contents

* mtl : add f32 -> f32 cpy kernel

* mtl : add silu kernel

* mtl : add non-broadcast mul kernel

* mtl : full GPU inference of the computation graph

* mtl : optimize rms_norm and soft_max kernels

* mtl : add f16 mat x f32 vec multiplication kernel

* mtl : fix bug in f16 x f32 mul mat + speed-up computation

* mtl : faster mul_mat_q4_0_f32 kernel

* mtl : fix kernel signature + roll inner loop

* mtl : more threads for rms_norm + better timing

* mtl : remove printfs from inner loop

* mtl : simplify implementation

* mtl : add save/load vocab to ggml file

* mtl : plug Metal inference into llama.cpp (very quick-n-dirty)

* mtl : make it work with main example

Lots of hacks but at least now it generates text

* mtl : preparing for merge

* mtl : clean-up ggml mtl interface + support scratch / inplace

* mtl : remove temp / debug code

* metal : final refactoring and simplification

* Revert "ci : disable temporarily"

This reverts commit 98c267fc77.

* metal : add comments

* metal : clean-up stuff, fix typos

* readme : add Metal instructions

* readme : add example for main
Georgi Gerganov, 2023-06-04 23:34:30 +03:00, committed by GitHub
commit ecb217db4f (parent dcb2ed4826)
17 changed files with 1677 additions and 94 deletions
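Of the 17 changed files, only ggml.c is reproduced below; the Metal backend itself lives in the new Metal-specific sources this commit adds. Most of the ggml.c changes serve the export/import path that the commit list above describes: main builds the LLaMA graph and serializes it with ggml_graph_export(), and the Metal side re-imports and executes it. A minimal sketch of the export half, using standard ggml calls (the tiny graph and the file name "graph.ggml" are made up for illustration):

    // Minimal export sketch (not from the commit): build a small ggml graph and
    // serialize it so another process can re-import and run it.
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // stand-in for the real LLaMA graph: y = x*x
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * y = ggml_mul(ctx, x, x);

        struct ggml_cgraph gf = ggml_build_forward(y);

        // writes leafs, nodes, shapes, strides and arg indices to the file
        ggml_graph_export(&gf, "graph.ggml");

        ggml_free(ctx);
        return 0;
    }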

ggml.c (153 changed lines)

@@ -3723,7 +3723,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
-int ggml_nrows(const struct ggml_tensor * tensor) {
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3732,7 +3732,14 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+// this should handle cases where the tensor is not contiguous in memory
+// probably just:
+//
+// return tensor->ne[3]*tensor->nb[3]
+//
+// is enough, but just in case, adding the second part
+return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
}
int ggml_blck_size(enum ggml_type type) {
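The comment added above is the whole idea; a quick worked example with made-up strides shows why the old element-count formula under-reports the span of a non-contiguous view (strides roughly as ggml_view_2d() with nb1 = 32 would set them):

    // Worked example (hypothetical strides): a 4x2 f32 view whose rows sit 32
    // bytes apart inside a larger parent tensor.
    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
        int64_t ne[4] = { 4, 2, 1, 1 };     // elements per dimension
        size_t  nb[4] = { 4, 32, 64, 64 };  // strides in bytes; 32 > 4*4 -> non-contiguous

        size_t type_size = 4;               // GGML_TYPE_F32
        size_t old_nbytes = (size_t)(ne[0]*ne[1]*ne[2]*ne[3])*type_size; // 32 bytes
        size_t new_nbytes = MAX((size_t) ne[3]*nb[3], old_nbytes);       // 64 bytes

        printf("old: %zu bytes, new: %zu bytes\n", old_nbytes, new_nbytes);
        return 0;
    }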
@@ -3814,11 +3821,11 @@ size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
}
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
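Dropping static inline exposes these two layout checks outside ggml.c, presumably so the new Metal code can query tensor layout. With made-up strides, the transposed check is just a stride comparison:

    // Illustration with made-up strides: ggml_transpose() swaps ne[0]/ne[1] and
    // nb[0]/nb[1], so nb[0] > nb[1] flags a transposed (non-contiguous) tensor.
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        size_t nb_contig[4] = {  4, 16, 48, 48 }; // contiguous 4x3 f32 tensor
        size_t nb_trans[4]  = { 16,  4, 48, 48 }; // the same tensor after a transpose

        bool t0 = nb_contig[0] > nb_contig[1];    // false: row-major layout
        bool t1 = nb_trans[0]  > nb_trans[1];     // true: strides are swapped

        printf("contiguous: %d, transposed view: %d\n", t0, t1);
        return 0;
    }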
@@ -5802,10 +5809,18 @@ struct ggml_tensor * ggml_view_1d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5834,6 +5849,13 @@ struct ggml_tensor * ggml_view_2d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->nb[1] = nb1;
result->nb[2] = result->nb[1]*ne1;
result->nb[3] = result->nb[2];
@@ -5842,6 +5864,7 @@ struct ggml_tensor * ggml_view_2d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5872,6 +5895,13 @@ struct ggml_tensor * ggml_view_3d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = result->nb[2]*ne2;
@@ -5880,6 +5910,7 @@ struct ggml_tensor * ggml_view_3d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5912,6 +5943,13 @@ struct ggml_tensor * ggml_view_4d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+ggml_scratch_save(ctx);
+struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+memcpy(offs->data, &offset, 2*sizeof(int32_t));
+ggml_scratch_load(ctx);
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = nb3;
@@ -5920,6 +5958,7 @@ struct ggml_tensor * ggml_view_4d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
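All four ggml_view_Xd() variants get the same addition: the byte offset into the source tensor is packed into a 2-element I32 tensor and attached as opt[0], so it can be recovered later (for example by ggml_graph_import() further down). Two int32 slots are exactly the 8 bytes a 64-bit offset needs; a minimal round-trip sketch with a made-up offset:

    // Offset round trip: ggml_view_Xd() packs the offset into 2*sizeof(int32_t)
    // bytes (the opt[0] tensor); the import code memcpy's it back into a uint64_t.
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        uint64_t offset = 0x12345678u;  // hypothetical byte offset into src0 (a size_t on the 64-bit targets here)

        int32_t storage[2];                           // stands in for offs->data
        memcpy(storage, &offset, 2*sizeof(int32_t));  // what ggml_view_xd() now does

        uint64_t offs;
        memcpy(&offs, storage, sizeof(offs));         // what ggml_graph_import() does

        printf("round-tripped offset: %llu\n", (unsigned long long) offs);
        return 0;
    }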
@@ -9252,7 +9291,7 @@ static void ggml_compute_forward_rms_norm_f32(
sum += (ggml_float)(x[i00] * x[i00]);
}
-float mean = sum/ne00;
+const float mean = sum/ne00;
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
@@ -11163,7 +11202,7 @@ static void ggml_compute_forward_rope_f32(
theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[1];
@@ -11184,7 +11223,7 @@ static void ggml_compute_forward_rope_f32(
const int64_t i0 = ib*n_dims + ic/2;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
@@ -14588,7 +14627,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
-fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
tensor->n_dims,
@@ -14602,7 +14641,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
-fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
@@ -14615,8 +14654,8 @@
}
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
-assert(cgraph->work == NULL);
-assert(cgraph->work_size == 0);
+//assert(cgraph->work == NULL);
+//assert(cgraph->work_size == 0);
uint64_t size_eval = 0;
@@ -14837,7 +14876,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
// read file into data
{
FILE * fin = fopen(fname, "rb");
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return result;
@@ -14977,6 +15015,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
op = *(const uint32_t *) ptr; ptr += sizeof(op);
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+enum ggml_op eop = (enum ggml_op) op;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
@@ -14991,42 +15031,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
nb[j] = nb_cur;
}
-struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
-uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
-tensor->op = (enum ggml_op) op;
+const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
+uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
-memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+// parse args
+for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+const int32_t arg_idx = ptr_arg_idx[j];
+if (arg_idx == -1) {
+continue;
+}
+if (arg_idx < GGML_MAX_NODES) {
+args[j] = result.leafs[arg_idx];
+} else {
+args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+}
+}
+// create the tensor
+// "view" operations are handled differently
+// TODO: handle inplace ops - currently a copy is always made
+struct ggml_tensor * tensor = NULL;
+switch (eop) {
+// TODO: implement other view ops
+case GGML_OP_RESHAPE:
+{
+tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+} break;
+case GGML_OP_VIEW:
+{
+tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+uint64_t offs;
+memcpy(&offs, args[2]->data, sizeof(offs));
+tensor->data = ((char *) tensor->data) + offs;
+} break;
+case GGML_OP_TRANSPOSE:
+{
+tensor = ggml_transpose(*ctx_eval, args[0]);
+} break;
+case GGML_OP_PERMUTE:
+{
+tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+} break;
+default:
+{
+tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+tensor->op = eop;
+} break;
+}
+memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
tensor->nb[j] = nb[j];
}
-// parse args
-{
-struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
-&tensor->src0,
-&tensor->src1,
-};
+tensor->src0 = args[0];
+tensor->src1 = args[1];
-for (int j = 0; j < GGML_MAX_OPT; ++j) {
-args[2 + j] = &tensor->opt[j];
-}
-for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
-const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
-if (arg_idx == -1) {
-continue;
-}
-if (arg_idx < GGML_MAX_NODES) {
-*args[j] = result.leafs[arg_idx];
-} else {
-*args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
-}
-}
+for (int j = 0; j < GGML_MAX_OPT; ++j) {
+tensor->opt[j] = args[2 + j];
+}
result.nodes[i] = tensor;
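Summing up the import side: ggml_graph_import() now re-creates RESHAPE/VIEW/TRANSPOSE/PERMUTE nodes through the regular ggml API instead of allocating fresh tensors, and for views it shifts the data pointer by the offset recovered from opt[0]. A rough usage sketch (the file name matches the hypothetical export example above):

    // Rough import sketch (not from the commit): ctx_data receives the tensor
    // data read from the file, ctx_eval receives the re-created graph structure.
    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_context * ctx_data = NULL;
        struct ggml_context * ctx_eval = NULL;

        struct ggml_cgraph gf = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);

        printf("imported graph: %d nodes, %d leafs\n", gf.n_nodes, gf.n_leafs);

        ggml_free(ctx_data);
        ggml_free(ctx_eval);
        return 0;
    }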