llama : Metal inference (#1642)
* mtl : export the LLaMA computation graph
* ci : disable temporary
* mtl : adapt the MNIST example as starter
* mtl : no need for mtl-export tool, add cli arg for main instead
* mtl : export just a small part of the graph for now to make it easier
* mtl : move MSL code into separate file for easy editing
* mtl : initial get_rows_q4_0 kernel
* mtl : confirmed get_rows_q4_0 is working correctly
* mtl : add rms_norm kernel + confirm working
* mtl : add mul kernel + confirm working
* mtl : initial mul_mat Q4 kernel (wrong results)
* mtl : mul_mat fixes (still wrong)
* mtl : another mul_mat Q4 (still does not work)
* mtl : working mul_mat q4
* ggml : fix handling of "view" ops in ggml_graph_import()
* mtl : add rope kernel
* mtl : add reshape and transpose handling
* ggml : store offset as opt arg for ggml_view_xd() operators
* mtl : add cpy kernel + handle view ops
* mtl : confirm f16 x f32 attention mul mat
* mtl : add scale kernel
* mtl : add diag_mask_inf kernel
* mtl : fix soft_max kernel
* ggml : update ggml_nbytes() to handle non-contiguous tensors
* mtl : verify V tensor contents
* mtl : add f32 -> f32 cpy kernel
* mtl : add silu kernel
* mtl : add non-broadcast mul kernel
* mtl : full GPU inference of the computation graph
* mtl : optimize rms_norm and soft_max kernels
* mtl : add f16 mat x f32 vec multiplication kernel
* mtl : fix bug in f16 x f32 mul mat + speed-up computation
* mtl : faster mul_mat_q4_0_f32 kernel
* mtl : fix kernel signature + roll inner loop
* mtl : more threads for rms_norm + better timing
* mtl : remove printfs from inner loop
* mtl : simplify implementation
* mtl : add save/load vocab to ggml file
* mtl : plug Metal inference into llama.cpp (very quick-n-dirty)
* mtl : make it work with main example
Lots of hacks but at least now it generates text
* mtl : preparing for merge
* mtl : clean-up ggml mtl interface + suport scratch / inplace
* mtl : remove temp / debug code
* metal : final refactoring and simplification
* Revert "ci : disable temporary"
This reverts commit 98c267fc77
.
* metal : add comments
* metal : clean-up stuff, fix typos
* readme : add Metal instructions
* readme : add example for main
This commit is contained in:
parent
dcb2ed4826
commit
ecb217db4f
17 changed files with 1677 additions and 94 deletions
153
ggml.c
153
ggml.c
|
@ -3723,7 +3723,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
|
|||
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||
}
|
||||
|
||||
int ggml_nrows(const struct ggml_tensor * tensor) {
|
||||
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
|
||||
|
@ -3732,7 +3732,14 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
|
|||
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
|
||||
// this should handle cases where the tensor is not contiguous in memory
|
||||
// probaby just:
|
||||
//
|
||||
// return tensor->ne[3]*tensor->nb[3]
|
||||
//
|
||||
// is enough, but just in case, adding the second part
|
||||
|
||||
return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
|
||||
}
|
||||
|
||||
int ggml_blck_size(enum ggml_type type) {
|
||||
|
@ -3814,11 +3821,11 @@ size_t ggml_tensor_overhead(void) {
|
|||
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
|
||||
}
|
||||
|
||||
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||
return tensor->nb[0] > tensor->nb[1];
|
||||
}
|
||||
|
||||
static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return
|
||||
|
@ -5802,10 +5809,18 @@ struct ggml_tensor * ggml_view_1d(
|
|||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
result->op = GGML_OP_VIEW;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src0 = a;
|
||||
result->src1 = NULL;
|
||||
result->opt[0] = offs;
|
||||
|
||||
if (is_node) {
|
||||
memcpy(result->padding, &offset, sizeof(offset));
|
||||
|
@ -5834,6 +5849,13 @@ struct ggml_tensor * ggml_view_2d(
|
|||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
result->nb[1] = nb1;
|
||||
result->nb[2] = result->nb[1]*ne1;
|
||||
result->nb[3] = result->nb[2];
|
||||
|
@ -5842,6 +5864,7 @@ struct ggml_tensor * ggml_view_2d(
|
|||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src0 = a;
|
||||
result->src1 = NULL;
|
||||
result->opt[0] = offs;
|
||||
|
||||
if (is_node) {
|
||||
memcpy(result->padding, &offset, sizeof(offset));
|
||||
|
@ -5872,6 +5895,13 @@ struct ggml_tensor * ggml_view_3d(
|
|||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
result->nb[1] = nb1;
|
||||
result->nb[2] = nb2;
|
||||
result->nb[3] = result->nb[2]*ne2;
|
||||
|
@ -5880,6 +5910,7 @@ struct ggml_tensor * ggml_view_3d(
|
|||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src0 = a;
|
||||
result->src1 = NULL;
|
||||
result->opt[0] = offs;
|
||||
|
||||
if (is_node) {
|
||||
memcpy(result->padding, &offset, sizeof(offset));
|
||||
|
@ -5912,6 +5943,13 @@ struct ggml_tensor * ggml_view_4d(
|
|||
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
||||
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
result->nb[1] = nb1;
|
||||
result->nb[2] = nb2;
|
||||
result->nb[3] = nb3;
|
||||
|
@ -5920,6 +5958,7 @@ struct ggml_tensor * ggml_view_4d(
|
|||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src0 = a;
|
||||
result->src1 = NULL;
|
||||
result->opt[0] = offs;
|
||||
|
||||
if (is_node) {
|
||||
memcpy(result->padding, &offset, sizeof(offset));
|
||||
|
@ -9252,7 +9291,7 @@ static void ggml_compute_forward_rms_norm_f32(
|
|||
sum += (ggml_float)(x[i00] * x[i00]);
|
||||
}
|
||||
|
||||
float mean = sum/ne00;
|
||||
const float mean = sum/ne00;
|
||||
|
||||
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
|
@ -11163,7 +11202,7 @@ static void ggml_compute_forward_rope_f32(
|
|||
theta *= theta_scale;
|
||||
|
||||
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[1];
|
||||
|
@ -11184,7 +11223,7 @@ static void ggml_compute_forward_rope_f32(
|
|||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims/2];
|
||||
|
@ -14588,7 +14627,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
|
|||
const int64_t * ne = tensor->ne;
|
||||
const size_t * nb = tensor->nb;
|
||||
|
||||
fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
|
||||
fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
|
||||
ggml_type_name(tensor->type),
|
||||
ggml_op_name (tensor->op),
|
||||
tensor->n_dims,
|
||||
|
@ -14602,7 +14641,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|||
const int64_t * ne = tensor->ne;
|
||||
const size_t * nb = tensor->nb;
|
||||
|
||||
fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
|
||||
fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
|
||||
arg,
|
||||
ggml_type_name(tensor->type),
|
||||
ggml_op_name (tensor->op),
|
||||
|
@ -14615,8 +14654,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
|||
}
|
||||
|
||||
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
||||
assert(cgraph->work == NULL);
|
||||
assert(cgraph->work_size == 0);
|
||||
//assert(cgraph->work == NULL);
|
||||
//assert(cgraph->work_size == 0);
|
||||
|
||||
uint64_t size_eval = 0;
|
||||
|
||||
|
@ -14837,7 +14876,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|||
// read file into data
|
||||
{
|
||||
FILE * fin = fopen(fname, "rb");
|
||||
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
|
||||
return result;
|
||||
|
@ -14977,6 +15015,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|||
op = *(const uint32_t *) ptr; ptr += sizeof(op);
|
||||
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
|
||||
|
||||
enum ggml_op eop = (enum ggml_op) op;
|
||||
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
|
||||
|
@ -14991,42 +15031,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|||
nb[j] = nb_cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
||||
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
|
||||
|
||||
tensor->op = (enum ggml_op) op;
|
||||
const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
|
||||
|
||||
uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
|
||||
const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
|
||||
|
||||
memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
|
||||
struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
|
||||
|
||||
// parse args
|
||||
for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
|
||||
const int32_t arg_idx = ptr_arg_idx[j];
|
||||
|
||||
if (arg_idx == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg_idx < GGML_MAX_NODES) {
|
||||
args[j] = result.leafs[arg_idx];
|
||||
} else {
|
||||
args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
|
||||
}
|
||||
}
|
||||
|
||||
// create the tensor
|
||||
// "view" operations are handled differently
|
||||
// TODO: handle inplace ops - currently a copy is always made
|
||||
|
||||
struct ggml_tensor * tensor = NULL;
|
||||
|
||||
switch (eop) {
|
||||
// TODO: implement other view ops
|
||||
case GGML_OP_RESHAPE:
|
||||
{
|
||||
tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
|
||||
} break;
|
||||
case GGML_OP_VIEW:
|
||||
{
|
||||
tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
|
||||
|
||||
uint64_t offs;
|
||||
memcpy(&offs, args[2]->data, sizeof(offs));
|
||||
|
||||
tensor->data = ((char *) tensor->data) + offs;
|
||||
} break;
|
||||
case GGML_OP_TRANSPOSE:
|
||||
{
|
||||
tensor = ggml_transpose(*ctx_eval, args[0]);
|
||||
} break;
|
||||
case GGML_OP_PERMUTE:
|
||||
{
|
||||
tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
|
||||
|
||||
tensor->op = eop;
|
||||
} break;
|
||||
}
|
||||
|
||||
memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
|
||||
|
||||
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
||||
tensor->nb[j] = nb[j];
|
||||
}
|
||||
|
||||
// parse args
|
||||
{
|
||||
struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
|
||||
&tensor->src0,
|
||||
&tensor->src1,
|
||||
};
|
||||
tensor->src0 = args[0];
|
||||
tensor->src1 = args[1];
|
||||
|
||||
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
||||
args[2 + j] = &tensor->opt[j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
|
||||
const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
|
||||
|
||||
if (arg_idx == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg_idx < GGML_MAX_NODES) {
|
||||
*args[j] = result.leafs[arg_idx];
|
||||
} else {
|
||||
*args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_OPT; ++j) {
|
||||
tensor->opt[j] = args[2 + j];
|
||||
}
|
||||
|
||||
result.nodes[i] = tensor;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue