Merge branch 'master' into compilade/refactor-kv-cache

This commit is contained in:
Francis Couture-Harpin 2024-04-29 10:31:39 -04:00
commit c460ff1a1c
172 changed files with 11845 additions and 11211 deletions

295
ggml.c
View file

@ -4,6 +4,7 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
#include "sgemm.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@ -32,6 +33,10 @@
#include <unistd.h>
#endif
#ifdef __ARM_FEATURE_MATMUL_INT8
#undef GGML_USE_LLAMAFILE
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
@ -853,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
// simd mappings
//
#if defined(__ARM_NEON)
#if !defined(__aarch64__)
// 64-bit compatibility
// Horizontal add for targets where this fallback is compiled in: sums the four
// float lanes of `v` in lane order 0, 1, 2, 3 and returns the scalar result.
inline static float vaddvq_f32(float32x4_t v) {
    float sum = vgetq_lane_f32(v, 0);
    sum = sum + vgetq_lane_f32(v, 1);
    sum = sum + vgetq_lane_f32(v, 2);
    sum = sum + vgetq_lane_f32(v, 3);
    return sum;
}
#endif
#endif
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for a new architecture requires defining the corresponding SIMD macros
@ -4573,21 +4566,32 @@ void ggml_mul_mat_set_prec(
// ggml_mul_mat_id
// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
// this will allow computing all the used experts in a single matrix multiplication
/*
c = ggml_mul_mat_id(ctx, as, b, ids);
as -> [cols, rows, n_expert]
ids -> [n_experts_used, n_tokens] (i32)
b -> [cols, n_expert_used, n_tokens]
c -> [cols, n_expert_used, n_tokens]
in b, n_experts_used can be broadcast to match the n_expert_used of ids
c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/
struct ggml_tensor * ggml_mul_mat_id(
struct ggml_context * ctx,
struct ggml_tensor * as,
struct ggml_tensor * ids,
int id,
struct ggml_tensor * b) {
struct ggml_tensor * b,
struct ggml_tensor * ids) {
GGML_ASSERT(!ggml_is_transposed(as));
GGML_ASSERT(ids->type == GGML_TYPE_I32);
GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
GGML_ASSERT(b->ne[3] == 1); // b is 3d
GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
bool is_node = false;
@ -4595,11 +4599,9 @@ struct ggml_tensor * ggml_mul_mat_id(
is_node = true;
}
const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
ggml_set_op_params_i32(result, 0, id);
result->op = GGML_OP_MUL_MAT_ID;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = as;
@ -10809,6 +10811,28 @@ static void ggml_compute_forward_mul_mat(
}
#endif
#if GGML_USE_LLAMAFILE
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
params->type,
src0->type,
src1->type,
dst->type))
goto UseGgmlGemm1;
return;
}
UseGgmlGemm1:;
#endif
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
@ -10840,6 +10864,28 @@ static void ggml_compute_forward_mul_mat(
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
#if GGML_USE_LLAMAFILE
if (src1->type != vec_dot_type) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
nb01/ggml_type_size(src0->type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
params->type,
src0->type,
vec_dot_type,
dst->type))
goto UseGgmlGemm2;
return;
}
UseGgmlGemm2:;
#endif
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
@ -10957,11 +11003,6 @@ static void ggml_compute_forward_mul_mat_id(
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@ -10972,22 +11013,21 @@ static void ggml_compute_forward_mul_mat_id(
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// broadcast is not supported with mmid
assert(ne12 == 1);
assert(ne13 == 1);
// row groups
const int id = ggml_get_op_params_i32(dst, 0);
const int n_as = src0->ne[2];
const int n_ids = ids->ne[0]; // n_expert_used
const int n_as = ne02; // n_expert
char * wdata_src1_end = (src1->type == vec_dot_type) ?
(char *) params->wdata :
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11]
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
@ -11011,16 +11051,20 @@ static void ggml_compute_forward_mul_mat_id(
}
// initialize matrix_row_counts
GGML_ASSERT(wdata == wdata_src1_end);
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
// group rows by src0 matrix
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
GGML_ASSERT(row_id >= 0 && row_id < n_as);
MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
matrix_row_counts[row_id] += 1;
// group rows by src0 matrix
for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
for (int id = 0; id < n_ids; ++id) {
const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
assert(i02 >= 0 && i02 < n_as);
MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
matrix_row_counts[i02] += 1;
}
}
return;
@ -11038,15 +11082,13 @@ static void ggml_compute_forward_mul_mat_id(
continue;
}
size_t src0_offset = cur_a*src0->nb[2];
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows
// distribute the thread work across the inner or outer loop based on which one is larger
@ -11065,13 +11107,11 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
sched_yield();
continue;
}
//if (ir010 >= ir011 || ir110 >= ir111) {
// sched_yield();
// continue;
//}
// block-tiling attempt
const int64_t blck_0 = 16;
@ -11083,20 +11123,16 @@ static void ggml_compute_forward_mul_mat_id(
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11);
const int64_t _i12 = ir1; // logical row index for this expert
// broadcast src0 into src1
//const int64_t i03 = i13/r3;
//const int64_t i02 = i12/r2;
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
const int id = row_mapping.i1; // selected expert index
const int64_t i1 = i11;
const int64_t i2 = i12;
const int64_t i3 = i13;
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const char * src0_row = (const char *) src0->data + src0_offset;
const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@ -11104,25 +11140,26 @@ static void ggml_compute_forward_mul_mat_id(
// TODO: this is a bit of a hack, we should probably have a better way to handle this
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
: (i11*nb11 + i12*nb12 + i13*nb13));
? (i11 + i12*ne11)*row_size
: (i11*nb11 + i12*nb12));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
//}
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
}
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
}
}
}
}
#undef MMID_MATRIX_ROW
#undef MMID_MATRIX_ROW
}
// ggml_compute_forward_out_prod
@ -18442,7 +18479,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const int n_as = src0->ne[2];
cur += GGML_PAD(cur, sizeof(int64_t)); // align
cur += n_as * sizeof(int64_t); // matrix_row_counts
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
} break;
case GGML_OP_OUT_PROD:
{
@ -20530,8 +20567,34 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
return ok;
}
// Release the heap buffers referenced by one key-value pair: the key string,
// and, depending on kv->type, either the string value or the array payload
// (including each element's buffer when the array holds strings).
// The gguf_kv struct itself is not freed here.
static void gguf_free_kv(struct gguf_kv * kv) {
    if (kv->key.data != NULL) {
        GGML_FREE(kv->key.data);
    }

    switch (kv->type) {
        case GGUF_TYPE_STRING:
            {
                if (kv->value.str.data != NULL) {
                    GGML_FREE(kv->value.str.data);
                }
            } break;
        case GGUF_TYPE_ARRAY:
            {
                if (kv->value.arr.data == NULL) {
                    break; // nothing was allocated for this array
                }

                // string arrays own one buffer per element; release those first
                if (kv->value.arr.type == GGUF_TYPE_STRING) {
                    struct gguf_str * elems = (struct gguf_str *) kv->value.arr.data;
                    for (uint64_t k = 0; k < kv->value.arr.n; ++k) {
                        if (elems[k].data != NULL) {
                            GGML_FREE(elems[k].data);
                        }
                    }
                }

                GGML_FREE(kv->value.arr.data);
            } break;
        default:
            // other value types hold no separately allocated data to release here
            break;
    }
}
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
@ -20576,7 +20639,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
bool ok = true;
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
// read the header
{
@ -20613,9 +20676,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the kv pairs
{
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
const uint64_t n_kv = ctx->header.n_kv;
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
// header.n_kv will hold the actual number of pairs that were successfully read in the loop below
ctx->header.n_kv = 0;
ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
for (uint64_t i = 0; i < n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@ -20664,7 +20731,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return NULL;
}
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
} break;
@ -20678,7 +20745,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
return NULL;
}
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@ -20694,6 +20761,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (!ok) {
break;
}
ctx->header.n_kv++;
}
if (!ok) {
@ -20706,7 +20775,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the tensor infos
{
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i];
@ -20727,8 +20796,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
// TODO: return an error instead of crashing with GGML_ASSERT
gguf_tensor_info_sanitize(info);
// make sure there are no duplicated tensor names
for (uint64_t j = 0; j < i; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
fclose(file);
@ -20842,12 +20920,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ok = ok && cur != NULL;
ggml_set_name(cur, ctx->infos[i].name.data);
if (!ok) {
break;
}
ggml_set_name(cur, ctx->infos[i].name.data);
// point the data member to the appropriate location in the binary blob using the tensor infos
if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@ -20879,31 +20957,7 @@ void gguf_free(struct gguf_context * ctx) {
if (ctx->kv) {
// free string memory - not great..
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i];
if (kv->key.data) {
GGML_FREE(kv->key.data);
}
if (kv->type == GGUF_TYPE_STRING) {
if (kv->value.str.data) {
GGML_FREE(kv->value.str.data);
}
}
if (kv->type == GGUF_TYPE_ARRAY) {
if (kv->value.arr.data) {
if (kv->value.arr.type == GGUF_TYPE_STRING) {
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
if (str->data) {
GGML_FREE(str->data);
}
}
}
GGML_FREE(kv->value.arr.data);
}
}
gguf_free_kv(&ctx->kv[i]);
}
GGML_FREE(ctx->kv);
@ -20921,7 +20975,7 @@ void gguf_free(struct gguf_context * ctx) {
GGML_FREE(ctx->infos);
}
GGML_ALIGNED_FREE(ctx);
GGML_FREE(ctx);
}
const char * gguf_type_name(enum gguf_type type) {
@ -21128,6 +21182,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
return n_kv;
}
// Remove the key-value pair named `key` from `ctx`.
//
// Does nothing when gguf_find_key() reports no match (negative index).
// Otherwise the pair's owned buffers are released with gguf_free_kv(), the
// following pairs are shifted down by one slot, header.n_kv is decremented,
// and the kv array is shrunk to fit.
//
// After removing the last remaining pair, ctx->kv is freed and set to NULL.
// NOTE(review): assumes the code that grows ctx->kv accepts a NULL array
// (realloc(NULL, n) behaves like malloc) - confirm against gguf_get_or_add_key.
void gguf_remove_key(struct gguf_context * ctx, const char * key) {
    const int idx = gguf_find_key(ctx, key);
    if (idx < 0) {
        return;
    }

    const int n_kv = gguf_get_n_kv(ctx);

    gguf_free_kv(&ctx->kv[idx]);

    // close the gap left by the removed entry
    for (int i = idx; i < n_kv-1; ++i) {
        ctx->kv[i] = ctx->kv[i+1];
    }
    ctx->header.n_kv--;

    if (n_kv <= 1) {
        // realloc(ptr, 0) is implementation-defined (and undefined in C23),
        // so release the now-empty array explicitly instead
        free(ctx->kv);
        ctx->kv = NULL;
        return;
    }

    // never assign realloc's result straight back: on failure it returns NULL
    // and the original (larger, still valid) array would be lost
    struct gguf_kv * shrunk = realloc(ctx->kv, (size_t) (n_kv - 1) * sizeof(struct gguf_kv));
    if (shrunk != NULL) {
        ctx->kv = shrunk;
    }
}
void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
const int idx = gguf_get_or_add_key(ctx, key);
@ -21219,7 +21286,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = type;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
}
@ -21229,7 +21296,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
ctx->kv[idx].value.arr.n = n;
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
for (int i = 0; i < n; i++) {
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
str->n = strlen(data[i]);
@ -21256,7 +21323,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
case GGUF_TYPE_ARRAY:
{
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
}
@ -21276,6 +21343,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
void gguf_add_tensor(
struct gguf_context * ctx,
const struct ggml_tensor * tensor) {
if (gguf_find_tensor(ctx, tensor->name) != -1) {
GGML_ASSERT(false && "duplicated tensor name");
}
const int idx = ctx->header.n_tensors;
ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
@ -21344,7 +21415,7 @@ struct gguf_buf {
static struct gguf_buf gguf_buf_init(size_t size) {
struct gguf_buf buf = {
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
/*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
/*buf.size =*/ size,
/*buf.offset =*/ 0,
};