From 20a68a7030ee06e8eb7eb8e24ae4ac52dc17803f Mon Sep 17 00:00:00 2001 From: LostRuins <39025047+LostRuins@users.noreply.github.com> Date: Thu, 14 Dec 2023 20:13:33 +0800 Subject: [PATCH 01/43] ggml : add ggml_row_size() (fixes llama out of space) (#4461) * Fixes "Not enough space in the context's memory pool" encountered on certain models, which seems to be caused by some imprecision related to the automatic casting of floating point values * do not cast to size_t, instead just use doubles * ggml : add ggml_row_size(), deprecate ggml_type_sizef() * ggml : fix row size compute to avoid overflows * tests : fix sizey -> sizez --------- Co-authored-by: Georgi Gerganov --- examples/benchmark/benchmark-matmult.cpp | 14 +++++++------- ggml.c | 9 +++++++-- ggml.h | 10 +++++++--- llama.cpp | 12 ++++++------ 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 284733b10..434e1d6bd 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -129,13 +129,13 @@ int main(int argc, char ** argv) { const ggml_type qtype = GGML_TYPE_Q4_1; size_t ctx_size = 0; - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(qtype); - ctx_size += sizex*sizey*ggml_type_sizef(qtype); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, 
sizex*sizey); // BLAS ctx_size += 1024*1024*16; printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); diff --git a/ggml.c b/ggml.c index 7e1272817..f0a972690 100644 --- a/ggml.c +++ b/ggml.c @@ -2011,8 +2011,13 @@ size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -float ggml_type_sizef(enum ggml_type type) { - return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; +size_t ggml_row_size(enum ggml_type type, int64_t ne) { + assert(ne % ggml_blck_size(type) == 0); + return ggml_type_size(type)*ne/ggml_blck_size(type); +} + +double ggml_type_sizef(enum ggml_type type) { + return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } const char * ggml_type_name(enum ggml_type type) { diff --git a/ggml.h b/ggml.h index 1447646b1..ae8101fab 100644 --- a/ggml.h +++ b/ggml.h @@ -641,9 +641,13 @@ extern "C" { GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); - GGML_API int ggml_blck_size (enum ggml_type type); - GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block - GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float + GGML_API int ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + + GGML_DEPRECATED( + GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float + "use ggml_row_size() instead"); GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API const char * ggml_op_name (enum ggml_op op); diff --git a/llama.cpp b/llama.cpp index 0e5ab044c..456807d9d 100644 --- a/llama.cpp +++ 
b/llama.cpp @@ -1555,7 +1555,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead()); + cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -3822,8 +3822,8 @@ static void llm_build_k_shift( ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_head_kv, n_ctx, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + ggml_row_size(kv.k_l[il]->type, n_embd_head), + ggml_row_size(kv.k_l[il]->type, n_embd_gqa), 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); @@ -3852,7 +3852,7 @@ static void llm_build_kv_store( cb(v_cur_t, "v_cur_t", il); struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, - (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); + (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head); cb(k_cache_view, "k_cache_view", il); struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, @@ -4011,8 +4011,8 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * k = ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_kv, n_head_kv, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + ggml_row_size(kv.k_l[il]->type, n_embd_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_head), 0); cb(k, "k", il); From c50e40016394f124b97ce39da48148b1f6c01833 Mon Sep 17 00:00:00 2001 From: wonjun Jang Date: Thu, 14 Dec 2023 21:44:49 +0900 Subject: [PATCH 02/43] py : add protobuf dependency (#4466) --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index badfec3be..1a1162566 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ numpy==1.24.4 sentencepiece==0.1.98 transformers>=4.34.0 gguf>=0.1.0 +protobuf>=4.21.0 From cafcd4f89500b8afef722cdb08088eceb8a22572 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 14 Dec 2023 16:52:08 +0100 Subject: [PATCH 03/43] ggml : remove n_dims from ggml_tensor (#4469) ggml-ci --- common/train.cpp | 18 ++-- examples/baby-llama/baby-llama.cpp | 18 ++-- .../convert-llama2c-to-ggml.cpp | 4 +- examples/finetune/finetune.cpp | 2 +- examples/gguf/gguf.cpp | 2 +- examples/llava/clip.cpp | 6 +- ggml.c | 94 ++++++++++--------- ggml.h | 8 +- llama.cpp | 2 +- 9 files changed, 81 insertions(+), 73 deletions(-) diff --git a/common/train.cpp b/common/train.cpp index 773e2c59c..dcf9614e4 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd) struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier - switch (tensor->n_dims) { + switch (ggml_n_dims(tensor)) { case 1: scale /= sqrtf((float) tensor->ne[0]); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { + switch (ggml_n_dims(tensor)) { case 1: for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); @@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) { } void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == 1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, 
int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); @@ -225,8 +227,8 @@ int64_t get_example_targets_batch( bool sample_random_offsets ) { GGML_ASSERT(samples_count > 0); - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_probs->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(target_probs)); int64_t n_vocab = target_probs->ne[0]; int64_t n_tokens = tokens_input->ne[0]; int64_t n_batch = tokens_input->ne[1]; diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 8155101d0..2dc2988d3 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora( } static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - assert(logits->n_dims == 2); - assert(probs->n_dims == 2); - assert(best_samples->n_dims == 1); + assert(ggml_is_matrix(logits)); + assert(ggml_is_matrix(probs)); + assert(ggml_is_vector(best_samples)); assert(logits->ne[1] == best_samples->ne[0]); assert(logits->ne[0] == probs->ne[0]); assert(logits->ne[1] == probs->ne[1]); @@ -1292,9 +1292,9 @@ static void sample_softmax_batch( struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * 
best_samples ) { - GGML_ASSERT(best_samples->n_dims == 2); - GGML_ASSERT(logits->n_dims == 3); - GGML_ASSERT(probs->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(best_samples)); + GGML_ASSERT(ggml_is_3d(logits)); + GGML_ASSERT(ggml_is_3d(probs)); int n_tokens = best_samples->ne[0]; int n_batch = best_samples->ne[1]; int n_vocab = logits->ne[0]; @@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) { } static void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); + assert(ggml_is_matrix(probs)); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); @@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu static void get_example_targets_batch( struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets ) { - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT( targets->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(targets)); int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; GGML_ASSERT(n_tokens == targets->ne[1]); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index cae3bf3c3..4d41e1779 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) { } static void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); + assert(ggml_is_matrix(probs)); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); @@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, 
const float * karpathy_weights) { int ct; - switch (gg_weights->n_dims){ + switch (ggml_n_dims(gg_weights)) { case 1: ct = 0; for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index af46e44a6..b9849e8c9 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, name = ggml_get_name(tensor); } uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; + uint32_t nd = ggml_n_dims(tensor); uint32_t ne[4] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 9ab63a293..9e24bf24c 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) { struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); + printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 4bb7b93b6..112465968 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ctx_size += padded_size; if (verbosity >= 3) { printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, - cur->n_dims, cur->name, tensor_size, padded_size, offset); + ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset); } } } @@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } // quantize only 2D tensors - quantize &= (cur->n_dims == 2); + 
quantize &= (ggml_n_dims(cur) == 2); if (quantize) { new_type = type; @@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i fout.put(0); } - printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize, + printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } diff --git a/ggml.c b/ggml.c index f0a972690..f6f8b8251 100644 --- a/ggml.c +++ b/ggml.c @@ -2054,24 +2054,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } -static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { +bool ggml_is_scalar(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { +bool ggml_is_vector(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { +bool ggml_is_matrix(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[2] == 1 && tensor->ne[3] == 1; } +bool ggml_is_3d(const struct ggml_tensor * tensor) { + return tensor->ne[3] == 1; +} + +int ggml_n_dims(const struct ggml_tensor * tensor) { + for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) { + if (tensor->ne[i] > 1) { + return i + 1; + } + } + return 1; +} + static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2521,7 +2534,6 
@@ static struct ggml_tensor * ggml_new_tensor_impl( /*.type =*/ type, /*.backend =*/ GGML_BACKEND_CPU, /*.buffer =*/ NULL, - /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, @@ -2628,7 +2640,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); + return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne); } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { @@ -3077,7 +3089,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0); ggml_format_name(result, "%s (view)", src->name); for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -3235,10 +3247,10 @@ static struct ggml_tensor * ggml_add_cast_impl( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->grad = is_node ? 
ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL; result->src[0] = a; result->src[1] = b; @@ -3607,12 +3619,12 @@ struct ggml_tensor * ggml_sum_rows( is_node = true; } - int64_t ne[4] = {1,1,1,1}; - for (int i=1; i<a->n_dims; ++i) { + int64_t ne[GGML_MAX_DIMS] = { 1 }; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { ne[i] = a->ne[i]; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3633,8 +3645,8 @@ struct ggml_tensor * ggml_mean( is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3656,8 +3668,7 @@ struct ggml_tensor * ggml_argmax( is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3680,7 +3691,7 @@ struct ggml_tensor * ggml_repeat( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); result->op = GGML_OP_REPEAT; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -3707,7 +3718,7 @@ struct ggml_tensor * ggml_repeat_back( return a; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4083,7 +4094,7 @@ struct ggml_tensor * ggml_mul_mat( } const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4117,7 +4128,7 @@ struct ggml_tensor * ggml_mul_mat_id( } const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); ggml_set_op_params_i32(result, 0, id); ggml_set_op_params_i32(result, 1, n_as); @@ -4155,7 +4166,7 @@ struct ggml_tensor * ggml_out_prod( // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_OUT_PROD; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -4440,7 +4451,7 @@ struct ggml_tensor * ggml_reshape( //GGML_ASSERT(false); } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -4818,7 +4829,7 @@ struct ggml_tensor * ggml_diag( } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne); result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5465,7 +5476,7 @@ struct ggml_tensor * ggml_pool_1d( is_node = true; } - const int64_t ne[3] = { + const int64_t ne[2] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; @@ -5584,7 +5595,7 @@ struct ggml_tensor * ggml_argsort( enum ggml_sort_order order) { bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); @@ -5631,7 +5642,7 @@ struct ggml_tensor * ggml_flash_attn( } //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne); int32_t t = masked ? 
1 : 0; ggml_set_op_params(result, &t, sizeof(t)); @@ -5664,7 +5675,7 @@ struct ggml_tensor * ggml_flash_ff( } //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne); result->op = GGML_OP_FLASH_FF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5780,7 +5791,6 @@ struct ggml_tensor * ggml_win_part( const int np = npx*npy; const int64_t ne[4] = { a->ne[0], w, w, np, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; @@ -14563,7 +14573,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return replacements->vals[i]; } - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne); // insert clone into replacements GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite @@ -16564,7 +16574,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), - tensor->n_dims, + ggml_n_dims(tensor), ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->data, @@ -16579,7 +16589,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), - tensor->n_dims, + ggml_n_dims(tensor), ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->data, @@ -16669,11 +16679,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t type = tensor->type; const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; fwrite(&type, sizeof(uint32_t), 
1, fout); fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; @@ -16703,11 +16711,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t type = tensor->type; const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; fwrite(&type, sizeof(uint32_t), 1, fout); fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; @@ -16879,12 +16885,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { uint32_t type; uint32_t op; - uint32_t n_dims; for (uint32_t i = 0; i < n_leafs; ++i) { type = *(const uint32_t *) ptr; ptr += sizeof(type); op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; @@ -16900,7 +16904,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * nb[j] = nb_cur; } - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); tensor->op = (enum ggml_op) op; @@ -16917,7 +16921,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * ptr += ggml_nbytes(tensor); - fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } @@ -16927,12 +16931,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { uint32_t type; uint32_t op; - uint32_t n_dims; for (uint32_t i = 0; i < n_nodes; ++i) { type = *(const uint32_t *) ptr; ptr += sizeof(type); op = *(const 
uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); enum ggml_op eop = (enum ggml_op) op; @@ -17003,7 +17005,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * } break; default: { - tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); tensor->op = eop; } break; @@ -17022,7 +17024,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * result->nodes[i] = tensor; - fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } } @@ -17160,7 +17162,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "(%s)|", ggml_type_name(node->type)); } - if (node->n_dims == 2) { + if (ggml_is_matrix(node)) { fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op)); } else { fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op)); @@ -17427,7 +17429,7 @@ static enum ggml_opt_result ggml_opt_adam( int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); - const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched; + const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? 
decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g_ = g[i]*gnorm; @@ -19205,8 +19207,8 @@ void gguf_add_tensor( ctx->infos[idx].ne[i] = 1; } - ctx->infos[idx].n_dims = tensor->n_dims; - for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].n_dims = ggml_n_dims(tensor); + for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) { ctx->infos[idx].ne[i] = tensor->ne[i]; } diff --git a/ggml.h b/ggml.h index ae8101fab..84d6ba8b1 100644 --- a/ggml.h +++ b/ggml.h @@ -502,7 +502,6 @@ extern "C" { struct ggml_backend_buffer * buffer; - int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements size_t nb[GGML_MAX_DIMS]; // stride in bytes: // nb[0] = ggml_type_size(type) @@ -534,7 +533,7 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[12]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -666,6 +665,11 @@ extern "C" { GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); diff --git a/llama.cpp b/llama.cpp index 456807d9d..eddb70859 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8471,7 +8471,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? 
// quantize only 2D tensors - quantize &= (tensor->n_dims == 2); + quantize &= (ggml_n_dims(tensor) == 2); quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= !params->only_copy; From 6744dbe924a317e3e2a5a2a4a2037061b2223449 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 14 Dec 2023 20:05:21 +0100 Subject: [PATCH 04/43] ggml : use ggml_row_size where possible (#4472) * ggml : use ggml_row_size where possible ggml-ci * ggml : move ggml_nbytes_split to ggml-cuda.cu --- ggml-cuda.cu | 12 ++++++++---- ggml.c | 18 ++++++------------ ggml.h | 1 - tests/test-backend-ops.cpp | 9 +++++---- tests/test-quantize-perf.cpp | 10 +++++----- 5 files changed, 24 insertions(+), 26 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 019648bdd..0a63c1ecf 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8898,6 +8898,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg (void) dst; } +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const int64_t nrows = ggml_nrows(tensor); @@ -8947,8 +8953,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } char * buf; @@ -9485,8 +9490,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * 
ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } } diff --git a/ggml.c b/ggml.c index f6f8b8251..1feb7ead3 100644 --- a/ggml.c +++ b/ggml.c @@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); -} - int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } @@ -2491,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( view_src = view_src->view_src; } - size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + size_t data_size = ggml_row_size(type, ne[0]); for (int i = 1; i < n_dims; i++) { data_size *= ne[i]; } @@ -9698,7 +9692,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { char * wdata = params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(params->wsize >= ne11*ne12*ne13*row_size); assert(src1->type == GGML_TYPE_F32); @@ -9721,7 +9715,7 @@ static void ggml_compute_forward_mul_mat( } const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_row_size(vec_dot_type, ne10); const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1*ne12*ne13; // src1 rows @@ -16326,7 +16320,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } else #endif if (node->src[1]->type != vec_dot_type) { - cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } } break; case GGML_OP_MUL_MAT_ID: @@ -16343,7 +16337,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } else #endif if (b->type != vec_dot_type) { - cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type); + cur = ggml_row_size(vec_dot_type, ggml_nelements(b)); } } break; case GGML_OP_OUT_PROD: @@ -18703,7 +18697,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + const size_t size_cur = ggml_row_size(info->type, ne); ctx->size += GGML_PAD(size_cur, ctx->alignment); } diff --git a/ggml.h b/ggml.h index 84d6ba8b1..68f7833b6 100644 --- a/ggml.h +++ b/ggml.h @@ -638,7 +638,6 @@ extern "C" { GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); GGML_API int ggml_blck_size(enum ggml_type type); GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index afca85143..df2c3fb6e 100644 --- 
a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -54,7 +54,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); - std::vector dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type)); + std::vector dataq(ggml_row_size(tensor->type, size)); int64_t hist[16]; ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); @@ -72,6 +72,8 @@ static std::vector tensor_to_float(const ggml_tensor * t) { ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); size_t bs = ggml_blck_size(t->type); + std::vector vq(ggml_blck_size(t->type)); + bool quantized = ggml_is_quantized(t->type); // access elements by index to avoid gaps in views for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { @@ -85,9 +87,8 @@ static std::vector tensor_to_float(const ggml_tensor * t) { tv.push_back(*(float *) &buf[i]); } else if (t->type == GGML_TYPE_I32) { tv.push_back((float)*(int32_t *) &buf[i]); - } else if (ggml_is_quantized(t->type)) { - std::vector vq(ggml_blck_size(t->type)); - tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); + } else if (quantized) { + tt.to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ASSERT(false); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 62d0190f9..09d410b7f 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -286,7 +286,7 @@ int main(int argc, char * argv[]) { qfns.from_float_reference(test_data1, test_q1, size); return test_q1[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, 
quantize_fn); } printf("\n"); @@ -300,7 +300,7 @@ int main(int argc, char * argv[]) { qfns.from_float(test_data1, test_q1, size); return test_q1[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -315,7 +315,7 @@ int main(int argc, char * argv[]) { qfns.to_float(test_q1, test_out, size); return test_out[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -330,7 +330,7 @@ int main(int argc, char * argv[]) { vdot.from_float(test_data1, test_q1, size); return test_q1[0]; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); @@ -347,7 +347,7 @@ int main(int argc, char * argv[]) { qfns.vec_dot(size, &result, test_q1, test_q2); return result; }; - size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); + size_t quantized_size = ggml_row_size(type, size); benchmark_function(size, quantized_size, iterations, quantize_fn); } printf("\n"); From ee4725a686643669a8587142fa068cbf29de3ce2 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 15 Dec 2023 12:45:50 +0100 Subject: [PATCH 05/43] ggml : group mul_mat_id rows by matrix (cpu only) (#4480) * ggml : group mul_mat_id rows by matrix (cpu only) * remove mmid parameters from mm forward * store row groups in wdata and calculate only once in GGML_TASK_INIT ggml-ci --- ggml.c | 237 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 195 insertions(+), 42 deletions(-) diff --git a/ggml.c b/ggml.c index 1feb7ead3..ad546a731 100644 --- a/ggml.c +++ b/ggml.c @@ -9580,16 +9580,11 @@ static bool 
ggml_compute_forward_mul_mat_use_blas( } #endif -// off1 = offset in i11 and i1 -// cne1 = ne11 and ne1 -// in a normal matrix multiplication, off1 = 0 and cne1 = ne1 -// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1 static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - int64_t off1, int64_t cne1) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9657,9 +9652,9 @@ static void ggml_compute_forward_mul_mat( const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3); + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { float * const wdata = params->wdata; @@ -9676,7 +9671,7 @@ static void ggml_compute_forward_mul_mat( } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - cne1, ne01, ne10, + ne1, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); @@ -9717,8 +9712,8 @@ static void ggml_compute_forward_mul_mat( const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1*ne12*ne13; // src1 rows + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne1*ne12*ne13; // src1 rows //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); @@ -9760,9 +9755,9 @@ static void ggml_compute_forward_mul_mat( for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t i13 = (ir1/(ne12*cne1)); - const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; - const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1; + const int64_t i13 = (ir1/(ne12*ne1)); + const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; + const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); // broadcast src0 into src1 const int64_t i03 = i13/r3; @@ -9802,28 +9797,191 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, + const struct ggml_tensor * ids, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type - ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]); - return; - } + const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS - const struct ggml_tensor * ids = src0; + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + 
GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // row groups const int id = ggml_get_op_params_i32(dst, 0); const int n_as = ggml_get_op_params_i32(dst, 1); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + char * wdata_src1_end = (src1->type == vec_dot_type) ? + (char *) params->wdata : + (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); - GGML_ASSERT(row_id >= 0 && row_id < n_as); + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); + #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + + if (params->type == GGML_TASK_INIT) { + char * wdata = params->wdata; + if (src1->type != vec_dot_type) { + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(params->wsize >= ne11*ne12*ne13*row_size); + assert(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + // initialize matrix_row_counts + GGML_ASSERT(wdata == wdata_src1_end); + 
memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); + + // group rows by src0 matrix + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + + return; } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + continue; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix + const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; + const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1); + const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || 
src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } + } + + #undef MMID_MATRIX_ROW } // ggml_compute_forward_out_prod @@ -14191,7 +14349,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT_ID: { @@ -15991,7 +16149,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { - // FIXME: blas n_tasks = n_threads; } break; case GGML_OP_OUT_PROD: @@ -16325,20 +16482,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { - const struct ggml_tensor * a = node->src[2]; - const struct ggml_tensor * b = node->src[1]; - const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) { - if (a->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]); - } - } else -#endif - if (b->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(b)); + const struct ggml_tensor * src0 = node->src[2]; + const struct ggml_tensor * src1 = node->src[1]; + 
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); } + const int n_as = ggml_get_op_params_i32(node, 1); + cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; case GGML_OP_OUT_PROD: { From 88ae8952b65cbf32eb1f5703681ea592e510e570 Mon Sep 17 00:00:00 2001 From: ShadovvBeast Date: Fri, 15 Dec 2023 13:49:01 +0200 Subject: [PATCH 06/43] server : add optional API Key Authentication example (#4441) * Add API key authentication for enhanced server-client security * server : to snake_case --------- Co-authored-by: Georgi Gerganov --- examples/server/public/completion.js | 3 +- examples/server/public/index.html | 7 ++- examples/server/server.cpp | 70 ++++++++++++++++++++++++---- 3 files changed, 70 insertions(+), 10 deletions(-) diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index c281f0fbd..6e2b99565 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) { headers: { 'Connection': 'keep-alive', 'Content-Type': 'application/json', - 'Accept': 'text/event-stream' + 'Accept': 'text/event-stream', + ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {}) }, signal: controller.signal, }); diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 451fd4a3b..07d779d20 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -235,7 +235,8 @@ grammar: '', n_probs: 0, // no completion_probabilities, image_data: [], - cache_prompt: true + cache_prompt: true, + api_key: '' }) /* START: Support for storing prompt templates and parameters in browsers LocalStorage */ @@ -790,6 +791,10 @@
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
+
+ + +
` diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 39d1e83d1..5f93dcb66 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -36,6 +36,7 @@ using json = nlohmann::json; struct server_params { std::string hostname = "127.0.0.1"; + std::string api_key; std::string public_path = "examples/server/public"; int32_t port = 8080; int32_t read_timeout = 600; @@ -1953,6 +1954,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? 
"enabled" : "disabled"); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); @@ -2002,6 +2004,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } sparams.public_path = argv[i]; } + else if (arg == "--api-key") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.api_key = argv[i]; + } else if (arg == "--timeout" || arg == "-to") { if (++i >= argc) @@ -2669,6 +2680,32 @@ int main(int argc, char **argv) httplib::Server svr; + // Middleware for API key validation + auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { + // If API key is not set, skip validation + if (sparams.api_key.empty()) { + return true; + } + + // Check for API key in the header + auto auth_header = req.get_header_value("Authorization"); + std::string prefix = "Bearer "; + if (auth_header.substr(0, prefix.size()) == prefix) { + std::string received_api_key = auth_header.substr(prefix.size()); + if (received_api_key == sparams.api_key) { + return true; // API key is valid + } + } + + // API key is invalid or not provided + res.set_content("Unauthorized: Invalid API Key", "text/plain"); + res.status = 401; // Unauthorized + + LOG_WARNING("Unauthorized: Invalid API Key", {}); + + return false; + }; + svr.set_default_headers({{"Server", "llama.cpp"}, {"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Headers", "content-type"}}); @@ -2711,8 +2748,11 @@ int main(int argc, char **argv) res.set_content(data.dump(), "application/json"); }); - svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) + svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + if (!validate_api_key(req, res)) { + return; + } json data = json::parse(req.body); const int task_id = llama.request_completion(data, false, false, -1); if (!json_value(data, "stream", false)) { @@ -2799,8 
+2839,11 @@ int main(int argc, char **argv) }); // TODO: add mount point without "/v1" prefix -- how? - svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) + svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + if (!validate_api_key(req, res)) { + return; + } json data = oaicompat_completion_params_parse(json::parse(req.body)); const int task_id = llama.request_completion(data, false, false, -1); @@ -2869,8 +2912,11 @@ int main(int argc, char **argv) } }); - svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) + svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + if (!validate_api_key(req, res)) { + return; + } json data = json::parse(req.body); const int task_id = llama.request_completion(data, true, false, -1); if (!json_value(data, "stream", false)) { @@ -3005,11 +3051,15 @@ int main(int argc, char **argv) svr.set_error_handler([](const httplib::Request &, httplib::Response &res) { + if (res.status == 401) + { + res.set_content("Unauthorized", "text/plain"); + } if (res.status == 400) { res.set_content("Invalid request", "text/plain"); } - else if (res.status != 500) + else if (res.status == 404) { res.set_content("File Not Found", "text/plain"); res.status = 404; @@ -3032,11 +3082,15 @@ int main(int argc, char **argv) // to make it ctrl+clickable: LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); - LOG_INFO("HTTP server listening", { - {"hostname", sparams.hostname}, - {"port", sparams.port}, - }); + std::unordered_map log_data; + log_data["hostname"] = sparams.hostname; + log_data["port"] = std::to_string(sparams.port); + if (!sparams.api_key.empty()) { + log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4); + } + + LOG_INFO("HTTP server listening", log_data); // run the HTTP server in a 
thread - see comment below std::thread t([&]() { From 8a5be3bd5885d79ad84aadf32bb8c1a67bd43c19 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Fri, 15 Dec 2023 22:16:15 -0500 Subject: [PATCH 07/43] llama : sanity checks for access to logits (#4274) Co-authored-by: Georgi Gerganov --- llama.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llama.cpp b/llama.cpp index eddb70859..58fe7492e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1505,6 +1505,10 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; +#ifndef NDEBUG + // guard against access to unset logits + std::vector logits_valid; +#endif bool logits_all = false; // input embedding (1-dimensional array: [n_embd]) @@ -6150,6 +6154,14 @@ static int llama_decode_internal( { auto & logits_out = lctx.logits; +#ifndef NDEBUG + auto & logits_valid = lctx.logits_valid; + logits_valid.clear(); + logits_valid.resize(n_tokens); + + logits_out.clear(); +#endif + if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { @@ -6157,13 +6169,22 @@ static int llama_decode_internal( continue; } memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); +#ifndef NDEBUG + logits_valid[i] = true; +#endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); +#ifndef NDEBUG + std::fill(logits_valid.begin(), logits_valid.end(), true); +#endif } else { logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); +#ifndef NDEBUG + logits_valid[n_tokens - 1] = true; +#endif } } @@ -10052,6 +10073,7 @@ float * llama_get_logits(struct llama_context * ctx) { } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + assert(ctx->logits_valid.at(i)); return ctx->logits.data() + 
i*ctx->model.hparams.n_vocab; } From c6c4fc081c1df1c60a9bfe3e6a3fd086f1a29ec7 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 16 Dec 2023 18:58:46 +0100 Subject: [PATCH 08/43] lora : add support for non-llama models (#3333) * lora : add support for non-llama models ggml-ci * avoid leaking ggml_context on failure cleanup ggml-ci * lora : allow 1d tensors * lora : include embd and output layers in size calculation * fix style --- convert-lora-to-ggml.py | 86 +++++++++++++------------- llama.cpp | 133 ++++++++++++++++++++-------------------- llama.h | 1 + 3 files changed, 114 insertions(+), 106 deletions(-) diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index a937410dd..53bb8a3d9 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -3,7 +3,6 @@ from __future__ import annotations import json import os -import re import struct import sys from typing import Any, BinaryIO, Sequence @@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence import numpy as np import torch +from pathlib import Path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} -HF_SUBLAYER_TO_GGML = { - "self_attn.q_proj": "attn_q", - "self_attn.k_proj": "attn_k", - "self_attn.v_proj": "attn_v", - "self_attn.o_proj": "attn_output", - "mlp.gate_proj": "ffn_gate", - "mlp.down_proj": "ffn_down", - "mlp.up_proj": "ffn_up", - "input_layernorm": "attn_norm", - "post_attention_layernorm": "ffn_norm", -} - - -def translate_tensor_name(t: str) -> str: - match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) - if match: - nn = match.group(1) - sub_layer = match.group(2) - lora_type = match.group(3) - - sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) - if sub_layer_renamed is None: - print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") - sys.exit(1) - - output_string = ( - 
f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" - ) - return output_string - else: - print(f"Error: unrecognized tensor {t}") - sys.exit(1) - - def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(struct.pack("i", 1)) # file version @@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(struct.pack("i", int(params["lora_alpha"]))) -def write_tensor_header( - self, name: str, shape: Sequence[int], data_type: np.dtype[Any] -) -> None: +def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None: sname = name.encode("utf-8") fout.write( struct.pack( @@ -78,11 +47,12 @@ def write_tensor_header( fout.seek((fout.tell() + 31) & -32) -if len(sys.argv) != 2: - print(f"Usage: python {sys.argv[0]} ") +if len(sys.argv) < 2: + print(f"Usage: python {sys.argv[0]} [arch]") print( "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" ) + print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") sys.exit(1) input_json = os.path.join(sys.argv[1], "adapter_config.json") @@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") model = torch.load(input_model, map_location="cpu") +arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" + +if arch_name not in gguf.MODEL_ARCH_NAMES.values(): + print(f"Error: unsupported architecture {arch_name}") + sys.exit(1) + +arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] +name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone with open(input_json, "r") as f: params = json.load(f) @@ -117,6 +95,7 @@ with open(output_path, "wb") as fout: write_file_header(fout, params) for k, v in model.items(): + orig_k = k if 
k.endswith(".default.weight"): k = k.replace(".default.weight", ".weight") if k in ["llama_proj.weight", "llama_proj.bias"]: @@ -129,7 +108,32 @@ with open(output_path, "wb") as fout: v = v.float() t = v.detach().numpy() - tname = translate_tensor_name(k) + + prefix = "base_model.model." + if k.startswith(prefix): + k = k[len(prefix) :] + + lora_suffixes = (".lora_A.weight", ".lora_B.weight") + if k.endswith(lora_suffixes): + suffix = k[-len(lora_suffixes[0]):] + k = k[: -len(lora_suffixes[0])] + else: + print(f"Error: unrecognized tensor name {orig_k}") + sys.exit(1) + + tname = name_map.get_name(k) + if tname is None: + print(f"Error: could not map tensor name {orig_k}") + print(" Note: the arch parameter must be specified if the model is not llama") + sys.exit(1) + + if suffix == ".lora_A.weight": + tname += ".weight.loraA" + elif suffix == ".lora_B.weight": + tname += ".weight.loraB" + else: + assert False + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) t.tofile(fout) diff --git a/llama.cpp b/llama.cpp index 58fe7492e..f49214c13 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8647,53 +8647,60 @@ static int llama_apply_lora_from_file_internal( const int64_t t_start_lora_us = ggml_time_us(); - auto fin = std::ifstream(path_lora, std::ios::binary); - if (!fin) { - LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora); - return 1; - } + llama_file fin(path_lora, "rb"); // verify magic and version { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); + uint32_t magic = fin.read_u32(); + if (magic != LLAMA_FILE_MAGIC_GGLA) { + LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); + return 1; + } + uint32_t format_version = fin.read_u32(); if (format_version != 1) { LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); return 1; } } - int32_t lora_r; - int32_t lora_alpha; - 
fin.read((char *) &lora_r, sizeof(lora_r)); - fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + int32_t lora_r = fin.read_u32(); + int32_t lora_alpha = fin.read_u32(); float scaling = scale * (float)lora_alpha / (float)lora_r; LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + // create a name -> tensor map of the model to accelerate lookups + // find the max tensor size to estimate the required temporary buffer size + size_t max_tensor_size = 0; + std::unordered_map model_tensors; + for (const auto & kv : model.tensors_by_name) { + model_tensors.insert(kv); + size_t f32_size = ggml_nelements(kv.second) * sizeof(float); + max_tensor_size = std::max(max_tensor_size, f32_size); + } + // create a temporary ggml context to store the lora tensors - // todo: calculate size from biggest possible tensor - std::vector lora_buf(1024ull * 1024ull * 1024ull); + // TODO: use ggml-alloc + size_t lora_ctx_size = max_tensor_size * 3; + LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); + std::vector lora_buf(lora_ctx_size); + struct ggml_init_params params; params.mem_size = lora_buf.size(); params.mem_buffer = lora_buf.data(); params.no_alloc = false; - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map lora_tensors; + using unique_context = std::unique_ptr; - // create a name -> tensor map of the model to accelerate lookups - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - } + unique_context lora_ctx(nullptr, ggml_free); + lora_ctx.reset(ggml_init(params)); + std::unordered_map lora_tensors; // load base model std::unique_ptr ml; - ggml_context * base_ctx = NULL; + + unique_context base_ctx(nullptr, ggml_free); std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); @@ -8702,6 +8709,7 @@ static int 
llama_apply_lora_from_file_internal( size_t ctx_size; size_t mmapped_size; ml->calc_sizes(ctx_size, mmapped_size); + base_buf.resize(ctx_size); ggml_init_params base_params; @@ -8709,9 +8717,9 @@ static int llama_apply_lora_from_file_internal( base_params.mem_buffer = base_buf.data(); base_params.no_alloc = ml->use_mmap; - base_ctx = ggml_init(base_params); + base_ctx.reset(ggml_init(base_params)); - // maybe this should in llama_model_loader + // maybe this should be in llama_model_loader if (ml->use_mmap) { ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); } @@ -8724,27 +8732,35 @@ static int llama_apply_lora_from_file_internal( std::vector work_buffer; while (true) { + if (fin.tell() == fin.size) { + // eof + break; + } + int32_t n_dims; - int32_t length; + int32_t name_len; int32_t ftype; - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ftype), sizeof(ftype)); - if (fin.eof()) { - break; + fin.read_raw(&n_dims, sizeof(n_dims)); + fin.read_raw(&name_len, sizeof(name_len)); + fin.read_raw(&ftype, sizeof(ftype)); + + if (n_dims != 1 && n_dims != 2) { + LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; } int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + fin.read_raw(&ne[i], sizeof(ne[i])); } std::string name; { + GGML_ASSERT(name_len <= 1024); char buf[1024]; - fin.read(buf, length); - name = std::string(buf, length); + fin.read_raw(buf, name_len); + name = std::string(buf, name_len); } // check for lora suffix and get the type of tensor @@ -8758,7 +8774,7 @@ static int llama_apply_lora_from_file_internal( std::string lora_type = name.substr(pos + lora_suffix.length()); std::string base_name = name; base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + // 
LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); if (model_tensors.find(base_name) == model_tensors.end()) { LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); @@ -8777,22 +8793,15 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor; - if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); - } - else { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - ggml_set_name(lora_tensor, "lora_tensor"); + ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); + ggml_set_name(lora_tensor, name.c_str()); // load tensor data - size_t offset = fin.tellg(); + size_t offset = fin.tell(); size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seekg(offset); - fin.read((char*)lora_tensor->data, tensor_data_size); + fin.seek(offset, SEEK_SET); + fin.read_raw(lora_tensor->data, tensor_data_size); lora_tensors[name] = lora_tensor; @@ -8822,13 +8831,11 @@ static int llama_apply_lora_from_file_internal( // load from base model if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - // TODO: throw LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } - // TODO: not tested!! maybe not working! 
- base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU); ml->load_data_for(base_t); } else { base_t = dest_t; @@ -8857,43 +8864,45 @@ static int llama_apply_lora_from_file_internal( } // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); offload_func(BA); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling); ggml_set_name(scale_tensor, "scale_tensor"); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor); offload_func(BA); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); + r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); offload_func_force_inplace(r); ggml_set_name(r, "r_add_inplace"); } else { - r = ggml_add(lora_ctx, base_t, BA); + r = ggml_add(lora_ctx.get(), base_t, BA); offload_func(r); ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx, r, dest_t); + r = ggml_cpy(lora_ctx.get(), r, dest_t); offload_func(r); ggml_set_name(r, "r_cpy"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); + struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); ggml_build_forward_expand(gf, r); ggml_graph_compute_helper(work_buffer, gf, n_threads); + // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other + GGML_ASSERT(lora_tensors.size() == 2); + // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); + lora_ctx.reset(ggml_init(params)); lora_tensors.clear(); n_tensors++; @@ -8903,12 +8912,6 @@ static int 
llama_apply_lora_from_file_internal( } } - // TODO: this should be in a destructor, it will leak on failure - ggml_free(lora_ctx); - if (base_ctx) { - ggml_free(base_ctx); - } - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); diff --git a/llama.h b/llama.h index 45a65cacb..15ab4f80e 100644 --- a/llama.h +++ b/llama.h @@ -39,6 +39,7 @@ #define LLAMA_MAX_RNG_STATE (64*1024) +#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN From 5daa5f54fdcd2b5228add1a4c43a1897b2168f35 Mon Sep 17 00:00:00 2001 From: Bach Le Date: Sun, 17 Dec 2023 18:57:33 +0800 Subject: [PATCH 09/43] Link to cublas dynamically on Windows even with LLAMA_STATIC (#4506) --- CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57b43c136..e3cd43ab3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -291,7 +291,12 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) if (LLAMA_STATIC) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + if (WIN32) + # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() else() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() From 62bd52b7bf90819e75f427a95a484cd5eee0b3c7 Mon Sep 17 00:00:00 2001 From: mzcu Date: Sun, 17 Dec 2023 15:54:37 +0100 Subject: [PATCH 10/43] server : allow requests larger than 8K (#4500) --- examples/server/server.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index 5f93dcb66..a9f8b3747 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -10,7 +10,8 @@ // crash the server in debug mode, otherwise send an http 500 error #define CPPHTTPLIB_NO_EXCEPTIONS 1 #endif - +// increase max payload length to allow use of larger context size +#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 #include "httplib.h" #include "json.hpp" From eb16dae7e70ca97396190698b29c0f9ee3388e88 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Sun, 17 Dec 2023 14:56:09 +0000 Subject: [PATCH 11/43] server : fix possible ambiguity in content type charset (#4501) --- examples/server/server.cpp | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9f8b3747..be7b5b95e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2699,7 +2699,7 @@ int main(int argc, char **argv) } // API key is invalid or not provided - res.set_content("Unauthorized: Invalid API Key", "text/plain"); + res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8"); res.status = 401; // Unauthorized LOG_WARNING("Unauthorized: Invalid API Key", {}); @@ -2714,28 +2714,28 @@ int main(int argc, char **argv) // this is only called if no index.html is found in the public --path svr.Get("/", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html"); + res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html; charset=utf-8"); return false; }); // this is only called if no index.js is found in the public --path svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript"); + res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript; charset=utf-8"); return false; }); // 
this is only called if no index.html is found in the public --path svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript"); + res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript; charset=utf-8"); return false; }); // this is only called if no index.html is found in the public --path svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res) { - res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript"); + res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8"); return false; }); @@ -2746,7 +2746,7 @@ int main(int argc, char **argv) { "user_name", llama.name_user.c_str() }, { "assistant_name", llama.name_assistant.c_str() } }; - res.set_content(data.dump(), "application/json"); + res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) @@ -2760,12 +2760,12 @@ int main(int argc, char **argv) std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { - res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { res.status = 404; - res.set_content(result.result_json["content"], "text/plain"); + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); return; } } else { @@ -2836,7 +2836,7 @@ int main(int argc, char **argv) }} }; - res.set_content(models.dump(), "application/json"); + res.set_content(models.dump(), "application/json; charset=utf-8"); }); // TODO: add 
mount point without "/v1" prefix -- how? @@ -2858,10 +2858,10 @@ int main(int argc, char **argv) res.set_content(oaicompat_result.dump(-1, ' ', false, json::error_handler_t::replace), - "application/json"); + "application/json; charset=utf-8"); } else { res.status = 500; - res.set_content(result.result_json["content"], "text/plain"); + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); return; } } else { @@ -2925,12 +2925,12 @@ int main(int argc, char **argv) task_result result = llama.next_result(task_id); if (!result.error && result.stop) { - res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); + res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { res.status = 404; - res.set_content(result.result_json["content"], "text/plain"); + res.set_content(result.result_json["content"], "text/plain; charset=utf-8"); return; } } else { @@ -2979,11 +2979,11 @@ int main(int argc, char **argv) svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res) { const json data = llama.get_model_props(); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res) - { return res.set_content("", "application/json"); }); + { return res.set_content("", "application/json; charset=utf-8"); }); svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) { @@ -2994,7 +2994,7 @@ int main(int argc, char **argv) tokens = llama.tokenize(body["content"], false); } const json data = format_tokenizer_response(tokens); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) @@ 
-3008,7 +3008,7 @@ int main(int argc, char **argv) } const json data = format_detokenized_response(content); - return res.set_content(data.dump(), "application/json"); + return res.set_content(data.dump(), "application/json; charset=utf-8"); }); svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) @@ -3025,7 +3025,7 @@ int main(int argc, char **argv) } const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); task_result result = llama.next_result(task_id); - return res.set_content(result.result_json.dump(), "application/json"); + return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); svr.set_logger(log_server_request); @@ -3046,7 +3046,7 @@ int main(int argc, char **argv) { snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); } - res.set_content(buf, "text/plain"); + res.set_content(buf, "text/plain; charset=utf-8"); res.status = 500; }); @@ -3054,15 +3054,15 @@ int main(int argc, char **argv) { if (res.status == 401) { - res.set_content("Unauthorized", "text/plain"); + res.set_content("Unauthorized", "text/plain; charset=utf-8"); } if (res.status == 400) { - res.set_content("Invalid request", "text/plain"); + res.set_content("Invalid request", "text/plain; charset=utf-8"); } else if (res.status == 404) { - res.set_content("File Not Found", "text/plain"); + res.set_content("File Not Found", "text/plain; charset=utf-8"); res.status = 404; } }); From 8edd2b40fdbcafbf630f2cf29306b29d5cb48c42 Mon Sep 17 00:00:00 2001 From: AdithyanI Date: Sun, 17 Dec 2023 15:57:56 +0100 Subject: [PATCH 12/43] server : fix grammar being ignored (#4494) Fix bug in identifying the grammar. 
--- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index be7b5b95e..c97efe97d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2414,7 +2414,7 @@ json oaicompat_completion_params_parse( llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); - if (llama_params.count("grammar") != 0) { + if (body.count("grammar") != 0) { llama_params["grammar"] = json_value(body, "grammar", json::object()); } From 0ffc92d2d23a789625f018840469af045be1e3c0 Mon Sep 17 00:00:00 2001 From: olexiyb Date: Sun, 17 Dec 2023 17:02:16 +0200 Subject: [PATCH 13/43] server : disable llm logs if SERVER_VERBOSE is off (#3792) --- examples/server/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c97efe97d..04038530f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2645,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con int main(int argc, char **argv) { +#if SERVER_VERBOSE != 1 + log_disable(); +#endif // own arguments required by this example gpt_params params; server_params sparams; From 45668633fdb522a925c3dafc1ecf426f539efb27 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 17 Dec 2023 16:05:56 +0100 Subject: [PATCH 14/43] finetune : keep allocs alive until all allocations are done (#4486) --- examples/finetune/finetune.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b9849e8c9..6a668d764 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) { opt->params.adam.gclip = params.common.adam_gclip; opt->params.adam.eps_f = params.common.adam_eps_f; - ggml_allocr * alloc = NULL; 
- printf("%s: init model\n", __func__); bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); @@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) { // allocate input tensors mem_input_data.resize(max_input_size); - alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); - ggml_allocr_alloc(alloc, tokens_input); - ggml_allocr_alloc(alloc, target_probs); - ggml_allocr_free(alloc); + ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); + ggml_allocr_alloc(alloc_inps, tokens_input); + ggml_allocr_alloc(alloc_inps, target_probs); // context for compute tensors without their data const size_t estimated_compute_size_wo_data = ( @@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) { // find best evaluation order for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) { // allocate compute tensors mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); - alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) { params.common.use_checkpointing ); ggml_allocr_free(alloc); + ggml_allocr_free(alloc_inps); + // tokenize data 
std::vector train_tokens; From 919c40660fd27157b391b5832d2a577d5afef4cb Mon Sep 17 00:00:00 2001 From: Matheus Gabriel Alves Silva Date: Sun, 17 Dec 2023 12:23:33 -0300 Subject: [PATCH 15/43] build : Check the ROCm installation location (#4485) * build : Check the ROCm installation location * more generic approach * fixup! It was returning the path instead of the command output * fixup! Trailing whitespace --- Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index fb775ae5b..8273f8400 100644 --- a/Makefile +++ b/Makefile @@ -439,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h endif # LLAMA_CLBLAST ifdef LLAMA_HIPBLAS - ROCM_PATH ?= /opt/rocm - HIPCC ?= $(ROCM_PATH)/bin/hipcc - GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + + ifeq ($(wildcard /opt/rocm),) + ROCM_PATH ?= /usr + GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) + else + ROCM_PATH ?= /opt/rocm + GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) + endif + HIPCC ?= $(ROCM_PATH)/bin/hipcc LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 From f7f468a97dceec2f8fe8b1ed7a2091083446ebc7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sun, 17 Dec 2023 10:45:46 -0500 Subject: [PATCH 16/43] gguf-py : fail fast on nonsensical special token IDs (#4489) --- gguf-py/gguf/vocab.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index de3e5edb5..76924d8f2 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -109,8 +109,10 @@ class SpecialVocab: return True def _set_special_token(self, typ: str, tid: Any) -> None: - if not isinstance(tid, int) or tid < 0: + if not isinstance(tid, int): return + if tid < 0: + raise ValueError(f'invalid value for special token type {typ}: {tid}') if self.n_vocab is None or tid < self.n_vocab: if typ in self.special_token_ids: return From 800a489e4a8be199122259a995b1ee9dd7fae320 Mon Sep 17 
00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Dec 2023 19:38:41 +0200 Subject: [PATCH 17/43] llama.swiftui : add bench functionality (#4483) * llama.swiftui : add bench button * llama.swiftui : initial bench functionality * force to use n_gpu_layers on simulator * add download buttons & expose llamaState.loadModel * update project.pbxproj * comment #Preview & fix editorconfig check * gitignore : xcode stuff * llama.swiftui : UX improvements * llama.swiftui : avoid data copy via "downloadTask" * llama.swiftui : remove model from project * llama : remove "mostly" from model infos * llama.swiftui : improve bench --------- Co-authored-by: jhen --- .editorconfig | 3 + examples/llama.swiftui/.gitignore | 1 + .../llama.cpp.swift/LibLlama.swift | 182 +++- .../llama.swiftui.xcodeproj/project.pbxproj | 898 +++++++++--------- .../llama.swiftui/Models/LlamaState.swift | 52 +- .../llama.swiftui/UI/ContentView.swift | 114 ++- .../llama.swiftui/UI/DownloadButton.swift | 122 +++ llama.cpp | 33 +- 8 files changed, 895 insertions(+), 510 deletions(-) create mode 100644 examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift diff --git a/.editorconfig b/.editorconfig index a56e9ccc8..16d16b3b5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -23,3 +23,6 @@ insert_final_newline = unset [examples/server/public/*] indent_size = 2 + +[examples/llama.swiftui/llama.swiftui.xcodeproj/*] +indent_style = tab diff --git a/examples/llama.swiftui/.gitignore b/examples/llama.swiftui/.gitignore index 9bce6af39..e585a2a4f 100644 --- a/examples/llama.swiftui/.gitignore +++ b/examples/llama.swiftui/.gitignore @@ -1 +1,2 @@ xcuserdata +xcshareddata diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 3754f0551..272e1fd8a 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -6,16 +6,34 @@ enum LlamaError: Error { case 
couldNotInitializeContext } +func llama_batch_clear(_ batch: inout llama_batch) { + batch.n_tokens = 0 +} + +func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) { + batch.token [Int(batch.n_tokens)] = id + batch.pos [Int(batch.n_tokens)] = pos + batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count) + for i in 0.. LlamaContext { + static func create_context(path: String) throws -> LlamaContext { llama_backend_init(false) - let model_params = llama_model_default_params() + var model_params = llama_model_default_params() +#if targetEnvironment(simulator) + model_params.n_gpu_layers = 0 + print("Running on simulator, force use n_gpu_layers = 0") +#endif let model = llama_load_model_from_file(path, model_params) guard let model else { print("Could not load model at \(path)") throw LlamaError.couldNotInitializeContext } + + let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2)) + print("Using \(n_threads) threads") + var ctx_params = llama_context_default_params() - ctx_params.seed = 1234 + ctx_params.seed = 1234 ctx_params.n_ctx = 2048 - ctx_params.n_threads = 8 - ctx_params.n_threads_batch = 8 + ctx_params.n_threads = UInt32(n_threads) + ctx_params.n_threads_batch = UInt32(n_threads) let context = llama_new_context_with_model(model, ctx_params) guard let context else { @@ -56,6 +83,26 @@ actor LlamaContext { return LlamaContext(model: model, context: context) } + func model_info() -> String { + let result = UnsafeMutablePointer.allocate(capacity: 256) + result.initialize(repeating: Int8(0), count: 256) + defer { + result.deallocate() + } + + // TODO: this is probably very stupid way to get the string from C + + let nChars = llama_model_desc(model, result, 256) + let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars)) + + var SwiftString = "" + for char in bufferPointer { + SwiftString.append(Character(UnicodeScalar(UInt8(char)))) + } + + return 
SwiftString + } + func get_n_tokens() -> Int32 { return batch.n_tokens; } @@ -79,16 +126,11 @@ actor LlamaContext { print(String(cString: token_to_piece(token: id) + [0])) } - // batch = llama_batch_init(512, 0) // done in init() - batch.n_tokens = Int32(tokens_list.count) + llama_batch_clear(&batch) - for i1 in 0.. String { + var pp_avg: Double = 0 + var tg_avg: Double = 0 + + var pp_std: Double = 0 + var tg_std: Double = 0 + + for r in 0.. 1 { + pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1)) + tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1)) + } else { + pp_std = 0 + tg_std = 0 + } + + let model_desc = model_info(); + let model_size = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0); + let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9); + let backend = "Metal"; + let pp_avg_str = String(format: "%.2f", pp_avg); + let tg_avg_str = String(format: "%.2f", tg_avg); + let pp_std_str = String(format: "%.2f", pp_std); + let tg_std_str = String(format: "%.2f", tg_std); + + var result = "" + + result += String("| model | size | params | backend | test | t/s |\n") + result += String("| --- | --- | --- | --- | --- | --- |\n") + result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n") + result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n") + + return result; + } + func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() + llama_kv_cache_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count - let n_tokens = utf8Count + (add_bos ? 1 : 0) + let n_tokens = utf8Count + (add_bos ? 
1 : 0) + 1 let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false) diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index bc1fd15ce..2e6159928 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -1,481 +1,483 @@ // !$*UTF8*$! { - archiveVersion = 1; - classes = { - }; - objectVersion = 56; - objects = { + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { /* Begin PBXBuildFile section */ - 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; }; - 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; }; - 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; - 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; }; - 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; }; - 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; - 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; - 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; 
settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; - 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; - 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; - 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; - 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; - 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; - 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; - 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; - 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; + 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; + 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; + 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; }; + 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; 
}; }; + 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; }; + 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; }; + 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; + 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; }; + 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; + 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; + 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; + 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; }; + 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; }; + 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; }; + 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; + 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; + 8A9F7C4D2AC332EE008AE1EA /* 
LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = ""; }; - 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = ""; }; - 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = ""; }; - 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = ""; }; - 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = ""; }; - 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = ""; }; - 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = ""; }; - 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = ""; }; - 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = ""; }; - 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = ""; }; - 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = ""; }; - 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = ""; }; - 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = ""; }; - 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; - 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = ""; }; - 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; - 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; - 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; - 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; - 
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; - 8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = ""; }; - 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; - 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; - 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; + 542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = ""; }; + 542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = ""; }; + 542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = ""; }; + 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = ""; }; + 542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = ""; }; + 542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = 
../../ggml.h; sourceTree = ""; }; + 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = ""; }; + 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = ""; }; + 542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = ""; }; + 542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = ""; }; + 549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = ""; }; + 549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = ""; }; + 549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = ""; }; + 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; + 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = ""; }; + 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = ""; }; + 
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; + 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; + 8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; + 8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; + 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; + 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; + 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ - 8A1C83702AC328BD0096AF73 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, - 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; + 
8A1C83702AC328BD0096AF73 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, + 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXFrameworksBuildPhase section */ /* Begin PBXGroup section */ - 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { - isa = PBXGroup; - children = ( - 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, - 542376092B0D9C40008E6A1C /* ggml-backend.h */, - 542376062B0D9BEA008E6A1C /* ggml-quants.h */, - 542376072B0D9BFB008E6A1C /* ggml-quants.c */, - 549479C82AC9E10B00E0F78B /* ggml-metal.metal */, - 549479C62AC9E0F200E0F78B /* ggml-metal.h */, - 549479C52AC9E0F200E0F78B /* ggml-metal.m */, - 542EA09B2AC8723900A8AEE9 /* ggml.c */, - 542EA09C2AC8723900A8AEE9 /* ggml.h */, - 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, - 542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, - 542EA0A12AC8729100A8AEE9 /* llama.cpp */, - 542EA0A22AC8729100A8AEE9 /* llama.h */, - ); - name = llama.cpp; - sourceTree = ""; - }; - 8A1C836A2AC328BD0096AF73 = { - isa = PBXGroup; - children = ( - 8A08D1F62AC7383900FE6CD4 /* llama.cpp */, - 8A907F312AC7134E006146EA /* llama.cpp.swift */, - 8A3F84232AC4C891005E2EE8 /* models */, - 8A1C83752AC328BD0096AF73 /* llama.swiftui */, - 8A1C83742AC328BD0096AF73 /* Products */, - 8A39BE082AC7601000BFEB40 /* Frameworks */, - ); - sourceTree = ""; - }; - 8A1C83742AC328BD0096AF73 /* Products */ = { - isa = PBXGroup; - children = ( - 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */, - ); - name = Products; - sourceTree = ""; - }; - 8A1C83752AC328BD0096AF73 /* llama.swiftui */ = { - isa = PBXGroup; - children = ( - 8A3F84102AC4BD85005E2EE8 /* Resources */, - 8A9F7C4B2AC332DC008AE1EA /* Models */, - 8A9F7C4A2AC332BF008AE1EA /* UI */, - 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */, - 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, - 8A1C837C2AC328BE0096AF73 /* 
Preview Content */, - ); - path = llama.swiftui; - sourceTree = ""; - }; - 8A1C837C2AC328BE0096AF73 /* Preview Content */ = { - isa = PBXGroup; - children = ( - 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, - ); - path = "Preview Content"; - sourceTree = ""; - }; - 8A39BE082AC7601000BFEB40 /* Frameworks */ = { - isa = PBXGroup; - children = ( - 549479CA2AC9E16000E0F78B /* Metal.framework */, - 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, - ); - name = Frameworks; - sourceTree = ""; - }; - 8A3F84102AC4BD85005E2EE8 /* Resources */ = { - isa = PBXGroup; - children = ( - 8A3F84112AC4BD8C005E2EE8 /* models */, - ); - path = Resources; - sourceTree = ""; - }; - 8A3F84112AC4BD8C005E2EE8 /* models */ = { - isa = PBXGroup; - children = ( - 8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */, - ); - path = models; - sourceTree = ""; - }; - 8A907F312AC7134E006146EA /* llama.cpp.swift */ = { - isa = PBXGroup; - children = ( - 8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */, - 8A907F322AC7134E006146EA /* LibLlama.swift */, - ); - path = llama.cpp.swift; - sourceTree = ""; - }; - 8A9F7C4A2AC332BF008AE1EA /* UI */ = { - isa = PBXGroup; - children = ( - 8A1C83782AC328BD0096AF73 /* ContentView.swift */, - ); - path = UI; - sourceTree = ""; - }; - 8A9F7C4B2AC332DC008AE1EA /* Models */ = { - isa = PBXGroup; - children = ( - 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */, - ); - path = Models; - sourceTree = ""; - }; + 8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = { + isa = PBXGroup; + children = ( + 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */, + 542376092B0D9C40008E6A1C /* ggml-backend.h */, + 542376062B0D9BEA008E6A1C /* ggml-quants.h */, + 542376072B0D9BFB008E6A1C /* ggml-quants.c */, + 549479C82AC9E10B00E0F78B /* ggml-metal.metal */, + 549479C62AC9E0F200E0F78B /* ggml-metal.h */, + 549479C52AC9E0F200E0F78B /* ggml-metal.m */, + 542EA09B2AC8723900A8AEE9 /* ggml.c */, + 542EA09C2AC8723900A8AEE9 /* ggml.h */, + 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */, + 
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */, + 542EA0A12AC8729100A8AEE9 /* llama.cpp */, + 542EA0A22AC8729100A8AEE9 /* llama.h */, + ); + name = llama.cpp; + sourceTree = ""; + }; + 8A1C836A2AC328BD0096AF73 = { + isa = PBXGroup; + children = ( + 8A08D1F62AC7383900FE6CD4 /* llama.cpp */, + 8A907F312AC7134E006146EA /* llama.cpp.swift */, + 8A3F84232AC4C891005E2EE8 /* models */, + 8A1C83752AC328BD0096AF73 /* llama.swiftui */, + 8A1C83742AC328BD0096AF73 /* Products */, + 8A39BE082AC7601000BFEB40 /* Frameworks */, + ); + sourceTree = ""; + }; + 8A1C83742AC328BD0096AF73 /* Products */ = { + isa = PBXGroup; + children = ( + 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */, + ); + name = Products; + sourceTree = ""; + }; + 8A1C83752AC328BD0096AF73 /* llama.swiftui */ = { + isa = PBXGroup; + children = ( + 8A3F84102AC4BD85005E2EE8 /* Resources */, + 8A9F7C4B2AC332DC008AE1EA /* Models */, + 8A9F7C4A2AC332BF008AE1EA /* UI */, + 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */, + 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */, + 8A1C837C2AC328BE0096AF73 /* Preview Content */, + ); + path = llama.swiftui; + sourceTree = ""; + }; + 8A1C837C2AC328BE0096AF73 /* Preview Content */ = { + isa = PBXGroup; + children = ( + 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */, + ); + path = "Preview Content"; + sourceTree = ""; + }; + 8A39BE082AC7601000BFEB40 /* Frameworks */ = { + isa = PBXGroup; + children = ( + 549479CA2AC9E16000E0F78B /* Metal.framework */, + 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; + 8A3F84102AC4BD85005E2EE8 /* Resources */ = { + isa = PBXGroup; + children = ( + 8A3F84112AC4BD8C005E2EE8 /* models */, + ); + path = Resources; + sourceTree = ""; + }; + 8A3F84112AC4BD8C005E2EE8 /* models */ = { + isa = PBXGroup; + children = ( + ); + path = models; + sourceTree = ""; + }; + 8A907F312AC7134E006146EA /* llama.cpp.swift */ = { + isa = PBXGroup; + children = ( + 8A08D20A2AC73B1500FE6CD4 /* 
bridging-header.h */, + 8A907F322AC7134E006146EA /* LibLlama.swift */, + ); + path = llama.cpp.swift; + sourceTree = ""; + }; + 8A9F7C4A2AC332BF008AE1EA /* UI */ = { + isa = PBXGroup; + children = ( + 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */, + 8A1C83782AC328BD0096AF73 /* ContentView.swift */, + ); + path = UI; + sourceTree = ""; + }; + 8A9F7C4B2AC332DC008AE1EA /* Models */ = { + isa = PBXGroup; + children = ( + 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */, + ); + path = Models; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXNativeTarget section */ - 8A1C83722AC328BD0096AF73 /* llama.swiftui */ = { - isa = PBXNativeTarget; - buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */; - buildPhases = ( - 8A1C836F2AC328BD0096AF73 /* Sources */, - 8A1C83702AC328BD0096AF73 /* Frameworks */, - 8A1C83712AC328BD0096AF73 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = llama.swiftui; - packageProductDependencies = ( - ); - productName = llama.swiftui; - productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; - productType = "com.apple.product-type.application"; - }; + 8A1C83722AC328BD0096AF73 /* llama.swiftui */ = { + isa = PBXNativeTarget; + buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */; + buildPhases = ( + 8A1C836F2AC328BD0096AF73 /* Sources */, + 8A1C83702AC328BD0096AF73 /* Frameworks */, + 8A1C83712AC328BD0096AF73 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = llama.swiftui; + packageProductDependencies = ( + ); + productName = llama.swiftui; + productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; + productType = "com.apple.product-type.application"; + }; /* End PBXNativeTarget section */ /* Begin PBXProject section */ - 8A1C836B2AC328BD0096AF73 /* Project object */ = { - isa = PBXProject; - attributes = { - 
BuildIndependentTargetsInParallel = 1; - LastSwiftUpdateCheck = 1500; - LastUpgradeCheck = 1500; - TargetAttributes = { - 8A1C83722AC328BD0096AF73 = { - CreatedOnToolsVersion = 15.0; - LastSwiftMigration = 1500; - }; - }; - }; - buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */; - compatibilityVersion = "Xcode 14.0"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = 8A1C836A2AC328BD0096AF73; - packageReferences = ( - ); - productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */; - projectDirPath = ""; - projectRoot = ""; - targets = ( - 8A1C83722AC328BD0096AF73 /* llama.swiftui */, - ); - }; + 8A1C836B2AC328BD0096AF73 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1500; + LastUpgradeCheck = 1500; + TargetAttributes = { + 8A1C83722AC328BD0096AF73 = { + CreatedOnToolsVersion = 15.0; + LastSwiftMigration = 1500; + }; + }; + }; + buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 8A1C836A2AC328BD0096AF73; + packageReferences = ( + ); + productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8A1C83722AC328BD0096AF73 /* llama.swiftui */, + ); + }; /* End PBXProject section */ /* Begin PBXResourcesBuildPhase section */ - 8A1C83712AC328BD0096AF73 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */, - 8A3F84242AC4C891005E2EE8 /* models in Resources */, - 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, - 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, - ); - 
runOnlyForDeploymentPostprocessing = 0; - }; + 8A1C83712AC328BD0096AF73 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */, + 8A3F84242AC4C891005E2EE8 /* models in Resources */, + 8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */, + 8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXResourcesBuildPhase section */ /* Begin PBXSourcesBuildPhase section */ - 8A1C836F2AC328BD0096AF73 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, - 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, - 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, - 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, - 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, - 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, - 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, - 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, - 542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, - 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; + 8A1C836F2AC328BD0096AF73 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */, + 549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */, + 542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */, + 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */, + 542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */, + 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */, + 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, + 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, + 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */, + 
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */, + 5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; /* End PBXSourcesBuildPhase section */ /* Begin XCBuildConfiguration section */ - 8A1C837F2AC328BE0096AF73 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_DYNAMIC_NO_PIC = NO; - GCC_NO_COMMON_BLOCKS = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - 
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 17.0; - LOCALIZATION_PREFERS_STRING_CATALOGS = YES; - MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; - MTL_FAST_MATH = YES; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - }; - name = Debug; - }; - 8A1C83802AC328BE0096AF73 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; 
- ENABLE_USER_SCRIPT_SANDBOXING = YES; - GCC_C_LANGUAGE_STANDARD = gnu17; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 17.0; - LOCALIZATION_PREFERS_STRING_CATALOGS = YES; - MTL_ENABLE_DEBUG_INFO = NO; - MTL_FAST_MATH = YES; - SDKROOT = iphoneos; - SWIFT_COMPILATION_MODE = wholemodule; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - 8A1C83822AC328BE0096AF73 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CLANG_ENABLE_MODULES = YES; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; - DEVELOPMENT_TEAM = STLSG3FG8Q; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; - SWIFT_OPTIMIZATION_LEVEL = "-Onone"; - SWIFT_VERSION = 5.0; - 
TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - 8A1C83832AC328BE0096AF73 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CLANG_ENABLE_MODULES = YES; - CODE_SIGN_STYLE = Automatic; - CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; - DEVELOPMENT_TEAM = STLSG3FG8Q; - ENABLE_PREVIEWS = YES; - GENERATE_INFOPLIST_FILE = YES; - INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; - INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; - INFOPLIST_KEY_UILaunchScreen_Generation = YES; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; - IPHONEOS_DEPLOYMENT_TARGET = 16.0; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - MARKETING_VERSION = 1.0; - PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SWIFT_EMIT_LOC_STRINGS = YES; - SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; - SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; + 8A1C837F2AC328BE0096AF73 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + 
CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 8A1C83802AC328BE0096AF73 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES; + CLANG_ANALYZER_NONNULL = YES; + 
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_USER_SCRIPT_SANDBOXING = YES; + GCC_C_LANGUAGE_STANDARD = gnu17; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 17.0; + LOCALIZATION_PREFERS_STRING_CATALOGS = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + 8A1C83822AC328BE0096AF73 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + 
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; + DEVELOPMENT_TEAM = STLSG3FG8Q; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 8A1C83832AC328BE0096AF73 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\""; + DEVELOPMENT_TEAM = STLSG3FG8Q; + ENABLE_PREVIEWS = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + 
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; /* End XCBuildConfiguration section */ /* Begin XCConfigurationList section */ - 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 8A1C837F2AC328BE0096AF73 /* Debug */, - 8A1C83802AC328BE0096AF73 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 8A1C83822AC328BE0096AF73 /* Debug */, - 8A1C83832AC328BE0096AF73 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; + 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8A1C837F2AC328BE0096AF73 /* Debug */, + 8A1C83802AC328BE0096AF73 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 
8A1C83822AC328BE0096AF73 /* Debug */, + 8A1C83832AC328BE0096AF73 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; /* End XCConfigurationList section */ - }; - rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; + }; + rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; } diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index babc60cdc..3393eb242 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -3,24 +3,26 @@ import Foundation @MainActor class LlamaState: ObservableObject { @Published var messageLog = "" + @Published var cacheCleared = false private var llamaContext: LlamaContext? - private var modelUrl: URL? { - Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models") + private var defaultModelUrl: URL? { + Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models") // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models") } + init() { do { - try loadModel() + try loadModel(modelUrl: defaultModelUrl) } catch { messageLog += "Error!\n" } } - private func loadModel() throws { + func loadModel(modelUrl: URL?) 
throws { messageLog += "Loading model...\n" if let modelUrl { - llamaContext = try LlamaContext.createContext(path: modelUrl.path()) + llamaContext = try LlamaContext.create_context(path: modelUrl.path()) messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" } else { messageLog += "Could not locate model\n" @@ -31,7 +33,7 @@ class LlamaState: ObservableObject { guard let llamaContext else { return } - messageLog += "Attempting to complete text...\n" + await llamaContext.completion_init(text: text) messageLog += "\(text)" @@ -42,4 +44,42 @@ class LlamaState: ObservableObject { await llamaContext.clear() messageLog += "\n\ndone\n" } + + func bench() async { + guard let llamaContext else { + return + } + + messageLog += "\n" + messageLog += "Running benchmark...\n" + messageLog += "Model info: " + messageLog += await llamaContext.model_info() + "\n" + + let t_start = DispatchTime.now().uptimeNanoseconds + await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up + let t_end = DispatchTime.now().uptimeNanoseconds + + let t_heat = Double(t_end - t_start) / 1_000_000_000.0 + messageLog += "Heat up time: \(t_heat) seconds, please wait...\n" + + // if more than 5 seconds, then we're probably running on a slow device + if t_heat > 5.0 { + messageLog += "Heat up time is too long, aborting benchmark\n" + return + } + + let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3) + + messageLog += "\(result)" + messageLog += "\n" + } + + func clear() async { + guard let llamaContext else { + return + } + + await llamaContext.clear() + messageLog = "" + } } diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 0bd16a806..219bf4dc1 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -5,24 +5,97 @@ struct ContentView: View { @State private var multiLineText = "" + private static func 
cleanupModelCaches() { + // Delete all models (*.gguf) + let fileManager = FileManager.default + let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0] + do { + let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil) + for fileURL in fileURLs { + if fileURL.pathExtension == "gguf" { + try fileManager.removeItem(at: fileURL) + } + } + } catch { + print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)") + } + } + var body: some View { VStack { - ScrollView(.vertical) { + ScrollView(.vertical, showsIndicators: true) { Text(llamaState.messageLog) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) + .padding() + .onTapGesture { + UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) + } } TextEditor(text: $multiLineText) - .frame(height: 200) + .frame(height: 80) .padding() .border(Color.gray, width: 0.5) - Button(action: { - sendText() - }) { - Text("Send") - .padding() - .background(Color.blue) - .foregroundColor(.white) - .cornerRadius(8) + + HStack { + Button("Send") { + sendText() + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + + Button("Bench") { + bench() + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + + Button("Clear") { + clear() + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + + Button("Copy") { + UIPasteboard.general.string = llamaState.messageLog + } + .padding(8) + .background(Color.blue) + .foregroundColor(.white) + .cornerRadius(8) + } + + VStack { + DownloadButton( + llamaState: llamaState, + modelName: "TinyLlama-1.1B (Q4_0)", + modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", + filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" + ) + 
.font(.system(size: 12)) + .padding(.top, 4) + + DownloadButton( + llamaState: llamaState, + modelName: "TinyLlama-1.1B (Q8_0)", + modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true", + filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf" + ) + .font(.system(size: 12)) + + Button("Clear downloaded models") { + ContentView.cleanupModelCaches() + llamaState.cacheCleared = true + } + .padding(8) + .font(.system(size: 12)) } } .padding() @@ -34,9 +107,20 @@ struct ContentView: View { multiLineText = "" } } + + func bench() { + Task { + await llamaState.bench() + } + } + + func clear() { + Task { + await llamaState.clear() + } + } } -/* -#Preview { - ContentView() -} -*/ + +//#Preview { +// ContentView() +//} diff --git a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift new file mode 100644 index 000000000..4bd75cb69 --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift @@ -0,0 +1,122 @@ +import SwiftUI + +struct DownloadButton: View { + @ObservedObject private var llamaState: LlamaState + private var modelName: String + private var modelUrl: String + private var filename: String + + @State private var status: String + + @State private var downloadTask: URLSessionDownloadTask? + @State private var progress = 0.0 + @State private var observation: NSKeyValueObservation? 
+ + private static func getFileURL(filename: String) -> URL { + FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename) + } + + private func checkFileExistenceAndUpdateStatus() { + } + + init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) { + self.llamaState = llamaState + self.modelName = modelName + self.modelUrl = modelUrl + self.filename = filename + + let fileURL = DownloadButton.getFileURL(filename: filename) + status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download" + } + + private func download() { + status = "downloading" + print("Downloading model \(modelName) from \(modelUrl)") + guard let url = URL(string: modelUrl) else { return } + let fileURL = DownloadButton.getFileURL(filename: filename) + + downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in + if let error = error { + print("Error: \(error.localizedDescription)") + return + } + + guard let response = response as? 
HTTPURLResponse, (200...299).contains(response.statusCode) else { + print("Server error!") + return + } + + do { + if let temporaryURL = temporaryURL { + try FileManager.default.copyItem(at: temporaryURL, to: fileURL) + print("Writing to \(filename) completed") + + llamaState.cacheCleared = false + + status = "downloaded" + } + } catch let err { + print("Error: \(err.localizedDescription)") + } + } + + observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in + self.progress = progress.fractionCompleted + } + + downloadTask?.resume() + } + + var body: some View { + VStack { + if status == "download" { + Button(action: download) { + Text("Download " + modelName) + } + } else if status == "downloading" { + Button(action: { + downloadTask?.cancel() + status = "download" + }) { + Text("\(modelName) (Downloading \(Int(progress * 100))%)") + } + } else if status == "downloaded" { + Button(action: { + let fileURL = DownloadButton.getFileURL(filename: filename) + if !FileManager.default.fileExists(atPath: fileURL.path) { + download() + return + } + do { + try llamaState.loadModel(modelUrl: fileURL) + } catch let err { + print("Error: \(err.localizedDescription)") + } + }) { + Text("\(modelName) (Downloaded)") + } + } else { + Text("Unknown status") + } + } + .onDisappear() { + downloadTask?.cancel() + } + .onChange(of: llamaState.cacheCleared) { newValue in + if newValue { + downloadTask?.cancel() + let fileURL = DownloadButton.getFileURL(filename: filename) + status = FileManager.default.fileExists(atPath: fileURL.path) ? 
"downloaded" : "download" + } + } + } +} + +// #Preview { +// DownloadButton( +// llamaState: LlamaState(), +// modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)", +// modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", +// filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" +// ) +// } diff --git a/llama.cpp b/llama.cpp index f49214c13..fd9fd6ed9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2397,25 +2397,25 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case LLAMA_FTYPE_MOSTLY_F16: return "F16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + return "Q4_1, some F16"; + case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; + case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K"; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large"; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: return 
"Q3_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; default: return "unknown, may not work"; } @@ -2533,6 +2533,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 22: model.type = e_model::MODEL_1B; break; case 26: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_13B; break; From b1306c439490c7fa4ec33594500d980d1e9e15e6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Dec 2023 20:16:23 +0200 Subject: [PATCH 18/43] readme : update hot topics --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index edbe6ba57..01aef2afc 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics +- Collecting Apple Silicon performance stats: + - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 + - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406 -- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309** -- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 -- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167 ---- From 
2994f0c5a2e8c96955b422dedc93ec2595d16b82 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sun, 17 Dec 2023 19:39:02 -0500 Subject: [PATCH 19/43] decode : fix logits_valid for legacy API (#4516) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index fd9fd6ed9..d6d575f9e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6184,7 +6184,7 @@ static int llama_decode_internal( logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); #ifndef NDEBUG - logits_valid[n_tokens - 1] = true; + logits_valid[0] = true; #endif } } From 3c04bf6da89eaf4c7d317e0518f0687dfcbf2de7 Mon Sep 17 00:00:00 2001 From: hankcs Date: Mon, 18 Dec 2023 05:14:58 -0800 Subject: [PATCH 20/43] llama : fix try_override for bool_value which always return true (#4519) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index d6d575f9e..99facbf77 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1937,7 +1937,7 @@ namespace GGUFMeta { target = override->bool_value; return true; } - return true; + return false; } template From b9e74f9bca5fdf7d0a22ed25e7a9626335fdfa48 Mon Sep 17 00:00:00 2001 From: Ebey Abraham Date: Mon, 18 Dec 2023 17:27:47 +0000 Subject: [PATCH 21/43] llama : add phi-2 + fix NeoX rope + ggml_mul_mat_set_prec (#4490) * phi2 implementation * fix breaking change * phi-2 : various fixes * phi-2 : use layer norm eps * py : whitespaces * llama : fix meta KV override bug * convert : phi don't add BOS token * convert : revert "added_tokens_decoder" change * phi-2 : scale Q instead of KQ for better precision * ggml : fix NeoX rope to rotate just first n_dims * cuda : less diff in the rope_neox kernel * ggml : add ggml_mul_mat_set_prec ggml-ci * Update ggml-cuda.cu Co-authored-by: slaren * Update ggml-cuda.cu Co-authored-by: slaren * cuda : ggml_cuda_op_mul_mat_cublas support F32 precision * cuda : remove oboslete comment --------- 
Co-authored-by: Ebey Abraham Co-authored-by: Georgi Gerganov Co-authored-by: slaren --- convert-hf-to-gguf.py | 22 +++ ggml-cuda.cu | 117 +++++++++---- ggml-metal.metal | 13 +- ggml.c | 46 ++++- ggml.h | 12 ++ gguf-py/gguf/constants.py | 13 ++ gguf-py/gguf/tensor_mapping.py | 8 + llama.cpp | 307 +++++++++++++++++++++++++++++---- tests/test-backend-ops.cpp | 1 + 9 files changed, 463 insertions(+), 76 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e46a7813a..e71a96c48 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -182,6 +182,8 @@ class Model: return QwenModel if model_architecture == "MixtralForCausalLM": return MixtralModel + if model_architecture == "PhiForCausalLM": + return Phi2Model return Model def _is_model_safetensors(self) -> bool: @@ -221,6 +223,8 @@ class Model: return gguf.MODEL_ARCH.QWEN if arch == "MixtralForCausalLM": return gguf.MODEL_ARCH.LLAMA + if arch == "PhiForCausalLM": + return gguf.MODEL_ARCH.PHI2 raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -980,6 +984,24 @@ class QwenModel(Model): print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) + +class Phi2Model(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Phi2") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_add_bos_token(False) + + ###### CONVERSION LOGIC ###### diff --git 
a/ggml-cuda.cu b/ggml-cuda.cu index 0a63c1ecf..d0f3d8034 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4998,7 +4998,16 @@ static __global__ void rope_neox( const int ib = col / n_dims; const int ic = col % n_dims; - const int i = row*ncols + ib*n_dims + ic/2; + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; const int i2 = row/p_delta_rows; float cur_rot = inv_ndims * ic - ib; @@ -7057,6 +7066,7 @@ inline void ggml_cuda_op_upscale( (void) src1; (void) dst; + (void) src1_dd; } inline void ggml_cuda_op_pad( @@ -7073,6 +7083,7 @@ inline void ggml_cuda_op_pad( (void) src1; (void) dst; + (void) src1_dd; } inline void ggml_cuda_op_rms_norm( @@ -7376,7 +7387,7 @@ inline void ggml_cuda_op_mul_mat_cublas( const int compute_capability = g_compute_capabilities[id]; - if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 half * src0_as_f16 = nullptr; size_t src0_as = 0; @@ -8300,27 +8311,27 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor } static __global__ void k_compute_batched_ptrs( - const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, + const half * src0_as_f16, const half * src1_as_f16, char * dst, const void ** ptrs_src, void ** ptrs_dst, - int ne12, int ne13, - int ne23, - int nb02, int nb03, - int nb12, int nb13, - int nb2, int nb3, - int r2, int r3) { - int i13 = blockIdx.x * blockDim.x + threadIdx.x; - int i12 = blockIdx.y * blockDim.y + threadIdx.y; + int64_t ne12, int64_t ne13, + int64_t ne23, + size_t nb02, size_t 
nb03, + size_t nb12, size_t nb13, + size_t nbd2, size_t nbd3, + int64_t r2, int64_t r3) { + int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x; + int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y; if (i13 >= ne13 || i12 >= ne12) { return; } - int i03 = i13 / r3; - int i02 = i12 / r2; + int64_t i03 = i13 / r3; + int64_t i02 = i12 / r2; ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -8376,7 +8387,41 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); size_t dst_as = 0; - half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + half * dst_f16 = nullptr; + char * dst_t = nullptr; + + cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; + cudaDataType_t cu_data_type = CUDA_R_16F; + + // dst strides + size_t nbd2 = dst->nb[2]; + size_t nbd3 = dst->nb[3]; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + const void * alpha = &alpha_f16; + const void * beta = &beta_f16; + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + dst_t = (char *) dst_f16; + + nbd2 /= sizeof(float) / sizeof(half); + nbd3 /= sizeof(float) / sizeof(half); + } else { + dst_t = (char *) dst_ddf; + + cu_compute_type = CUBLAS_COMPUTE_32F; + cu_data_type = CUDA_R_32F; + + alpha = &alpha_f32; + beta = &beta_f32; + } GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -8385,9 +8430,6 @@ static void 
ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; - const half alpha_f16 = 1.0f; - const half beta_f16 = 0.0f; - #if 0 // use cublasGemmEx { @@ -8397,12 +8439,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const int i02 = i12 / r2; CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), - (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, - CUBLAS_COMPUTE_16F, + alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } } @@ -8414,11 +8456,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const CUBLAS_CHECK( cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA - (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB - &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC + alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA + (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB + beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC ne12*ne13, - 
CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { // use cublasGemmBatchedEx @@ -8435,24 +8477,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( - src0_as_f16, src1_as_f16, dst_f16, + src0_as_f16, src1_as_f16, dst_t, ptrs_src, ptrs_dst, ne12, ne13, ne23, nb02, nb03, nb12, nb13, - dst->nb[2], dst->nb[3], + nbd2, nbd3, r2, r3); CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), - (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), - &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01, ne23, - CUBLAS_COMPUTE_16F, + cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); if (ptrs_src_s != 0) { @@ -8464,11 +8506,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const } #endif - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(dst_f16, dst_as); + } ggml_cuda_pool_free(src1_as_f16, src1_as); - ggml_cuda_pool_free(dst_f16, dst_as); } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { diff --git a/ggml-metal.metal b/ggml-metal.metal index fe0ada445..d5b54e112 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1702,8 +1702,9 @@ kernel void kernel_rope( dst_data[1] = 
x0*sin_theta + x1*cos_theta; } } else { - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { + for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) { + if (ic < n_dims) { + const int64_t ib = 0; // simplified from `(ib * n_dims + ic) * inv_ndims` const float cur_rot = inv_ndims*ic - ib; @@ -1722,6 +1723,14 @@ kernel void kernel_rope( dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } diff --git a/ggml.c b/ggml.c index ad546a731..6da65bd92 100644 --- a/ggml.c +++ b/ggml.c @@ -4098,6 +4098,14 @@ struct ggml_tensor * ggml_mul_mat( return result; } +void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec) { + const int32_t prec_i32 = (int32_t) prec; + + ggml_set_op_params_i32(a, 0, prec_i32); +} + // ggml_mul_mat_id struct ggml_tensor * ggml_mul_mat_id( @@ -9168,6 +9176,8 @@ static void ggml_compute_forward_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps > 0.0f); + // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -9237,6 +9247,8 @@ static void ggml_compute_forward_rms_norm_f32( float eps; memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps > 0.0f); + // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -11562,10 +11574,13 @@ static void ggml_compute_forward_rope_f32( } } else { // TODO: this might be wrong for ne0 != n_dims - need double check - // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + // it seems we have to rope 
just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 theta_base *= freq_scale; - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; @@ -11588,6 +11603,14 @@ static void ggml_compute_forward_rope_f32( dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + const int64_t i0 = ic; + + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } @@ -11715,10 +11738,13 @@ static void ggml_compute_forward_rope_f16( } } else { // TODO: this might be wrong for ne0 != n_dims - need double check - // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + // it seems we have to rope just the first n_dims elements and do nothing with the rest + // ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26 theta_base *= freq_scale; - for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { - for (int64_t ic = 0; ic < n_dims; ic += 2) { + for (int64_t ic = 0; ic < ne0; ic += 2) { + if (ic < n_dims) { + const int64_t ib = 0; + // simplified from `(ib * n_dims + ic) * inv_ndims` float cur_rot = inv_ndims * ic - ib; @@ -11741,6 +11767,14 @@ static void ggml_compute_forward_rope_f16( dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + } else { + const int64_t i0 = ic; + + const 
ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } diff --git a/ggml.h b/ggml.h index 68f7833b6..f1003984f 100644 --- a/ggml.h +++ b/ggml.h @@ -343,6 +343,12 @@ extern "C" { GGML_TYPE_COUNT, }; + // precision + enum ggml_prec { + GGML_PREC_DEFAULT, + GGML_PREC_F32, + }; + enum ggml_backend_type { GGML_BACKEND_CPU = 0, GGML_BACKEND_GPU = 10, @@ -1057,6 +1063,12 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // change the precision of a matrix multiplication + // set to GGML_PREC_F32 for higher precision (useful for phi-2) + GGML_API void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + // indirect matrix multiplication // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) GGML_API struct ggml_tensor * ggml_mul_mat_id( diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 12133882b..390dca049 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -95,6 +95,7 @@ class MODEL_ARCH(IntEnum): BLOOM = auto() STABLELM = auto() QWEN = auto() + PHI2 = auto() class MODEL_TENSOR(IntEnum): @@ -140,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.PHI2: "phi2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -350,6 +352,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.GPT2: [ # TODO ], + MODEL_ARCH.PHI2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ] # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0115ea1c6..6fcbdbc1c 100644 
--- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -17,6 +17,7 @@ class TensorNameMap: "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert "language_model.embedding.word_embeddings", # persimmon + "transformer.embd.wte", # phi2 ), # Token type embeddings @@ -41,6 +42,7 @@ class TensorNameMap: "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen "output", # llama-pth bloom "word_embeddings_for_head", # persimmon + "lm_head.linear", # phi2 ), # Output norm @@ -53,6 +55,7 @@ class TensorNameMap: "transformer.norm_f", # mpt "ln_f", # refact bloom qwen "language_model.encoder.final_layernorm", # persimmon + "lm_head.ln", # phi2 ), # Rope frequencies @@ -75,6 +78,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.LayerNorm", # bert "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi + "transformer.h.{bid}.ln", # phi2 ), # Attention norm 2 @@ -90,6 +94,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "transformer.h.{bid}.mixer.Wqkv", # phi2 ), # Attention query @@ -128,6 +133,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "transformer.h.{bid}.mixer.out_proj", # phi2 ), # Rotary embeddings @@ -167,6 +173,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc_in", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon "transformer.h.{bid}.mlp.w1", # qwen + "transformer.h.{bid}.mlp.fc1", # phi2 ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -198,6 +205,7 @@ class TensorNameMap: "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + 
"transformer.h.{bid}.mlp.fc2", # phi2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/llama.cpp b/llama.cpp index 99facbf77..edd2910b3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -195,6 +195,7 @@ enum llm_arch { LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, + LLM_ARCH_PHI2, LLM_ARCH_UNKNOWN, }; @@ -212,6 +213,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_PHI2, "phi2" }, }; enum llm_kv { @@ -550,6 +552,19 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_PHI2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, @@ -1420,6 +1435,7 @@ struct llama_model { struct ggml_tensor * output_norm; struct ggml_tensor * output_norm_b; struct ggml_tensor * output; + struct ggml_tensor * output_b; std::vector layers; @@ -2635,6 +2651,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_PHI2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -2987,7 +3012,7 @@ static void llm_load_tensors( (void) main_gpu; - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; + enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; #ifdef GGML_USE_CUBLAS @@ -3630,7 +3655,73 @@ static void llm_load_tensors( } } } break; + case LLM_ARCH_PHI2: + { + model.tok_embd = ml.create_tensor(ctx, 
tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + backend_norm = llama_backend_offload; + backend_output = llama_backend_offload; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + vram_weights += ggml_nbytes(model.output); + vram_weights += ggml_nbytes(model.output_b); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); + } + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -3991,6 +4082,7 @@ static struct ggml_tensor * llm_build_ffn( // if max_alibi_bias > 0 then apply ALiBi static struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, + const llama_model & model, const llama_hparams & hparams, const llama_kv_cache & kv, struct ggml_tensor * wo, @@ -4002,6 +4094,7 @@ static struct ggml_tensor * llm_build_kqv( int32_t n_tokens, int32_t n_kv, float max_alibi_bias, + float scale, 
const llm_build_cb & cb, int il) { const int64_t n_embd = hparams.n_embd; @@ -4024,6 +4117,12 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); cb(kq, "kq", il); + if (model.arch == LLM_ARCH_PHI2) { + // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs + // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + } + if (max_alibi_bias > 0.0f) { // temporary branch until we figure out how to handle ggml_alibi through ggml_add kq = ggml_scale(ctx, kq, kq_scale); @@ -4043,7 +4142,7 @@ static struct ggml_tensor * llm_build_kqv( kq = ggml_soft_max(ctx, kq); cb(kq, "kq_soft_max", il); } else { - kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head))); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale); cb(kq, "kq_soft_max_ext", il); } @@ -4250,9 +4349,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4433,9 +4532,9 @@ struct llm_build_context { // apply ALiBi for 13B model const float max_alibi_bias = model.type == MODEL_13B ? 
8.0f : -1.0f; - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4557,9 +4656,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4657,9 +4756,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4866,9 +4965,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); // TODO: not tested, could be broken - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4957,9 +5056,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = 
llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5054,9 +5153,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5148,9 +5247,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5261,9 +5360,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5320,15 +5419,15 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); // inp_pos - contains the positions - struct ggml_tensor * inp_pos= ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); // 
KQ_scale - struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed @@ -5378,9 +5477,9 @@ struct llm_build_context { llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); - cur = llm_build_kqv(ctx0, hparams, kv_self, + cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il); + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5422,6 +5521,122 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); + return gf; + } + struct ggml_cgraph * build_phi2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + struct ggml_tensor * cur; + struct ggml_tensor * attn_norm_output; + struct ggml_tensor * ffn_output; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // Q_scale + struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(Q_scale, "Q_scale", -1); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + 
cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(attn_norm_output, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, Q_scale); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, model, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); + cb(cur, "kqv_out", il); + } + 
+ // FF + { + ffn_output = llm_build_ffn(ctx0, attn_norm_output, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(ffn_output, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_output); + cb(cur, "l_out", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output_no_bias", -1); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; } }; @@ -5437,7 +5652,7 @@ enum llm_offload_func_e { OFFLOAD_FUNC_FRC, // force offload OFFLOAD_FUNC_KQV, OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_EMB, // embeddings OFFLOAD_FUNC_OUT, }; @@ -5522,6 +5737,7 @@ static const std::unordered_map k_offload_map { "pos_embd", OFFLOAD_FUNC_NR }, { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) + { "Q_scale", OFFLOAD_FUNC_FRC }, { "KQ_scale", OFFLOAD_FUNC_FRC }, { "KQ_mask", OFFLOAD_FUNC_FRC }, { "K_shift", OFFLOAD_FUNC_FRC }, @@ -5606,6 +5822,7 @@ static const std::unordered_map k_offload_map { "l_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output_no_bias", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, }; @@ -5623,6 +5840,7 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_tokens = false; bool alloc_inp_embd = false; bool alloc_inp_pos = false; + bool alloc_inp_Q_scale = false; bool alloc_inp_KQ_scale = false; bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; @@ -5690,7 +5908,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_pos = true; } - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) { ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5698,6 +5916,23 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); } + alloc_inp_Q_scale = true; + } + + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + if (model.arch == LLM_ARCH_PHI2) { + // with phi2, we scale the Q to avoid precision issues + // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 + ggml_set_f32(cur, 1.0f); + } else { + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + } + alloc_inp_KQ_scale = true; } @@ -5922,6 +6157,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_qwen(); } break; + case LLM_ARCH_PHI2: + { + result = llm.build_phi2(); + } break; default: GGML_ASSERT(false); } @@ -6055,12 +6294,16 @@ static int llama_decode_internal( ggml_allocr_alloc_graph(lctx.alloc, gf); - struct ggml_tensor * res = 
gf->nodes[gf->n_nodes - 1]; + // the output is always the last tensor in the graph + struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; + GGML_ASSERT(strcmp(res->name, "result_output") == 0); + + // the embeddings could be the second to last tensor, or the third to last tensor struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - + if (strcmp(embeddings->name, "result_norm") != 0) { + embeddings = gf->nodes[gf->n_nodes - 3]; + GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); + } #ifdef GGML_USE_CUBLAS for (int i = 0; i < gf->n_leafs; i++) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index df2c3fb6e..f04b9438a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1555,6 +1555,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B) test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2) } test_cases.emplace_back(new test_alibi()); From 6ff39b129d0281d045f83d515e51b7197b44b253 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Dec 2023 20:05:12 +0200 Subject: [PATCH 22/43] llama.swiftui : add more models --- .../llama.cpp.swift/LibLlama.swift | 2 +- .../llama.swiftui/UI/ContentView.swift | 31 +++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 272e1fd8a..464fb3277 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ 
b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -203,7 +203,7 @@ actor LlamaContext { var pp_std: Double = 0 var tg_std: Double = 0 - for r in 0.. Date: Mon, 18 Dec 2023 20:17:43 +0200 Subject: [PATCH 23/43] llama.swiftui : add tinyllama 1.1B F16 --- .../llama.swiftui/llama.swiftui/UI/ContentView.swift | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 9cbe8efd6..c78f107b3 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -91,6 +91,15 @@ struct ContentView: View { ) .font(.system(size: 12)) + DownloadButton( + llamaState: llamaState, + modelName: "TinyLlama-1.1B (F16, 2.2 GiB)", + modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", + filename: "tinyllama-1.1b-f16.gguf" + ) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) + DownloadButton( llamaState: llamaState, modelName: "Phi-2.7B (Q4_0, 1.6 GiB)", @@ -98,7 +107,6 @@ struct ContentView: View { filename: "phi-2-q4_0.gguf" ) .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -107,6 +115,7 @@ struct ContentView: View { filename: "phi-2-q8_0.gguf" ) .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) DownloadButton( llamaState: llamaState, @@ -115,7 +124,6 @@ struct ContentView: View { filename: "mistral-7b-v0.1.Q4_0.gguf" ) .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) Button("Clear downloaded models") { ContentView.cleanupModelCaches() From a7aee47b98e45539d491071b25778b833b77e387 Mon Sep 17 00:00:00 2001 From: arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> Date: Mon, 18 Dec 2023 22:33:45 +0100 Subject: [PATCH 24/43] ggml-cuda: Fix HIP build 
(#4528) regression of #4490 Adds defines for two new datatypes cublasComputeType_t, cudaDataType_t. Currently using deprecated hipblasDatatype_t since newer ones very recent. --- ggml-cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d0f3d8034..f20846fef 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -31,6 +31,7 @@ #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx #define cublasGemmBatchedEx hipblasGemmBatchedEx @@ -40,6 +41,7 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess From 328b83de23b33240e28f4e74900d1d06726f5eb1 Mon Sep 17 00:00:00 2001 From: Eric Sommerlade Date: Tue, 19 Dec 2023 16:17:01 +0000 Subject: [PATCH 25/43] ggml : fixed check for _MSC_VER (#4535) Co-authored-by: Eric Sommerlade --- ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.h b/ggml.h index f1003984f..beacdc8be 100644 --- a/ggml.h +++ b/ggml.h @@ -303,7 +303,7 @@ extern "C" { #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half ggml_fp16_t; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && !defined(_MSC_VER) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; From 799fc2268989482054944c902874cca76337580f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 20 Dec 2023 15:41:22 +0100 Subject: [PATCH 26/43] CUDA: Faster Mixtral prompt processing (#4538) * CUDA: make MoE 
tensors contiguous for batch size>1 * Update ggml-cuda.cu Co-authored-by: slaren --------- Co-authored-by: slaren --- ggml-cuda.cu | 118 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f20846fef..9f4b188cb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7830,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { } #ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaDeviceSynchronize()); + } + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); @@ -7881,8 +7886,6 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - ggml_cuda_set_peer_access(ne11); - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); @@ -8781,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + const int64_t nb11 = src1->nb[1]; + const int64_t nb1 = dst->nb[1]; + const struct ggml_tensor * ids = src0; const int32_t id = ((int32_t *) dst->op_params)[0]; const int32_t n_as = ((int32_t *) dst->op_params)[1]; std::vector ids_host(ggml_nbytes(ids)); + const cudaStream_t stream = g_cudaStreams[g_main_device][0]; + if (ids->backend == GGML_BACKEND_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } else { memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); } @@ -8804,37 +8812,93 @@ static 
void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.ne[1] = 1; - dst_row.ne[1] = 1; - - src1_row.nb[2] = src1_row.nb[1]; - dst_row.nb[2] = dst_row.nb[1]; - - src1_row.nb[3] = src1_row.nb[1]; - dst_row.nb[3] = dst_row.nb[1]; - src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; + char * src1_original = (char *) src1_extra->data_device[g_main_device]; + char * dst_original = (char *) dst_extra->data_device[g_main_device]; - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + if (src1->ne[1] == 1) { + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); - GGML_ASSERT(row_id >= 0 && row_id < n_as); + GGML_ASSERT(row_id >= 0 && row_id < n_as); - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1]; - src1_row.data = (char *) src1->data + i01*src1->nb[1]; + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? 
- dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1]; - dst_row.data = (char *) dst->data + i01*dst->nb[1]; + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? - ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } + } else { + size_t as_src1, as_dst; + char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1); + char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst); + + src1_row_extra.data_device[g_main_device] = src1_contiguous; + dst_row_extra.data_device[g_main_device] = dst_contiguous; + + for (int32_t row_id = 0; row_id < n_as; ++row_id) { + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + int64_t num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11, + nb11, cudaMemcpyDeviceToDevice, stream)); + num_src1_rows++; + } + + if (num_src1_rows == 0) { + continue; + } + + src1_row.ne[1] = num_src1_rows; + dst_row.ne[1] = num_src1_rows; + + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; + + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + + num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + 
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1, + nb1, cudaMemcpyDeviceToDevice, stream)); + num_src1_rows++; + } + } + + ggml_cuda_pool_free(src1_contiguous, as_src1); + ggml_cuda_pool_free(dst_contiguous, as_dst); } } @@ -9370,6 +9434,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ return false; } + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); + } + if (params->ith != 0) { return true; } From 1d7a1912cea2227f9a1a449758ed622c560542f9 Mon Sep 17 00:00:00 2001 From: LoganDark Date: Thu, 21 Dec 2023 01:59:27 -0800 Subject: [PATCH 27/43] Fix access violation in ggml_cuda_free_data if tensor->extra is NULL (#4554) --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9f4b188cb..28d378784 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9091,7 +9091,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { } void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { return; } From d3223afdad0ed2821a8ddf739c291cd410c92a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 21 Dec 2023 17:34:17 +0100 Subject: [PATCH 28/43] llama : disable per-tensor info prints on model load (#4562) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index edd2910b3..90d860eb9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2083,7 +2083,7 @@ struct llama_model_loader { type_max = meta->type; } - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); + // 
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); } switch (type_max) { From 139882392258671ffe5acdfcadc0bc08572d6eef Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Dec 2023 18:02:30 +0100 Subject: [PATCH 29/43] cuda : replace asserts in wrong architecture checks with __trap (#4556) * cuda : replace asserts in wrong architecture checks with __trap * make bad_arch noreturn, remove returns --- ggml-cuda.cu | 82 +++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 28d378784..e7c9dee45 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -512,6 +512,14 @@ static size_t g_scratch_offset = 0; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; +[[noreturn]] +static __device__ void bad_arch() { + printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"); + __trap(); + + (void) bad_arch; // suppress unused function warning +} + static __device__ __forceinline__ float warp_reduce_sum(float x) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { @@ -1972,8 +1980,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2010,8 +2017,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2046,8 +2052,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp 
// second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2092,8 +2097,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp return sumi*d5d8 + m5s8 / (QI5_1 / vdr); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2114,8 +2118,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp return d8_0*d8_1 * sumi; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2145,8 +2148,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2181,8 +2183,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( return dm2f.x*sumf_d - dm2f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2219,8 +2220,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2260,8 +2260,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( return d3 * sumf; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2286,8 +2285,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( return d3*d8 * sumi; #else - assert(false); - return 0.0f; // only to 
satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2320,8 +2318,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2354,8 +2351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2395,8 +2391,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( return dm5f.x*sumf_d - dm5f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2429,8 +2424,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2460,8 +2454,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( return d*sumf; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2492,8 +2485,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( return d6 * sumf_d; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -3359,8 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( return dall * sumf_d - dmin * sumf_m; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif @@ -3543,8 +3534,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( return d * sumf_d; #else - assert(false); - return 0.0f; // only to satisfy the compiler + bad_arch(); #endif // 
__CUDA_ARCH__ >= MIN_CC_DP4A #endif @@ -3954,7 +3944,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_0_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4023,7 +4013,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_1_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4090,7 +4080,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_0_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4157,7 +4147,7 @@ mul_mat_q5_1( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_1_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4224,7 +4214,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q8_0_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4291,7 +4281,7 @@ mul_mat_q2_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q2_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4360,7 +4350,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q3_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4429,7 +4419,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4496,7 +4486,7 @@ mul_mat_q5_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= 
CC_VOLTA } @@ -4565,7 +4555,7 @@ template static __global__ void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q6_K_q8_1_mul_mat; - assert(false); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } From 66f35a2f48e1965a13835a523e677223dbf148be Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Thu, 21 Dec 2023 17:06:44 +0000 Subject: [PATCH 30/43] cuda : better error message for ggml_get_rows (#4561) * Update ggml-cuda.cu * Update ggml-cuda.cu * Update ggml-cuda.cu --------- Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e7c9dee45..1ca071d90 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6815,6 +6815,7 @@ static void ggml_cuda_op_get_rows( break; default: // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); GGML_ASSERT(false); break; } From 880e352277fc017df4d5794f0c21c44e1eae2b84 Mon Sep 17 00:00:00 2001 From: howlger Date: Thu, 21 Dec 2023 18:07:34 +0100 Subject: [PATCH 31/43] py : open merges file as 'utf-8' (#4566) Otherwise, on Windows converting bling-phi-2-v0 () via convert-hf-to-gguf.py will fail with the following error: ``` Traceback (most recent call last): File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 1061, in model_instance.set_vocab() File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 52, in set_vocab self._set_vocab_gpt2() File "C:\Users\User\git\gguf\convert-hf-to-gguf.py", line 264, in _set_vocab_gpt2 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) File "C:\Users\User\git\gguf\gguf\vocab.py", line 33, in __init__ self._load(Path(path)) File "C:\Users\User\git\gguf\gguf\vocab.py", line 81, in _load self._try_load_merges_txt(path) File "C:\Users\User\git\gguf\gguf\vocab.py", line 95, in _try_load_merges_txt for line in fp: File "C:\Users\User\miniconda3\envs\gguf\lib\encodings\cp1252.py", line 23, in 
decode return codecs.charmap_decode(input,self.errors,decoding_table)[0] UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 1415: character maps to ``` --- gguf-py/gguf/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 76924d8f2..cd1942975 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -84,7 +84,7 @@ class SpecialVocab: merges_file = path / 'merges.txt' if not merges_file.is_file(): return False - with open(merges_file, 'r') as fp: + with open(merges_file, 'r', encoding = 'utf-8') as fp: first_line = next(fp, '').strip() if not first_line.startswith('#'): fp.seek(0) From c083718c895b7c8c7fb2a4660643fb78d0c64dfd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Dec 2023 19:27:14 +0200 Subject: [PATCH 32/43] readme : update coding guidelines --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 01aef2afc..80ce194ca 100644 --- a/README.md +++ b/README.md @@ -982,6 +982,8 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +- Tensors store data in row-major order. 
We refer to dimension 0 as columns, 1 as rows, 2 as matrices +- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT` ### Docs From 9154494808dc865475c59022c29060b4947a803b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 21 Dec 2023 18:42:59 +0100 Subject: [PATCH 33/43] CUDA: mul_mat_id always on GPU for batches >= 32 (#4553) --- ggml-cuda.cu | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 1ca071d90..036668bfd 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8773,8 +8773,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s // TODO: mmq/mmv support #endif - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); - const int64_t nb11 = src1->nb[1]; const int64_t nb1 = dst->nb[1]; @@ -8803,13 +8801,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; + src1_row.backend = GGML_BACKEND_GPU; + dst_row.backend = GGML_BACKEND_GPU; + src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = (char *) src1_extra->data_device[g_main_device]; - char * dst_original = (char *) dst_extra->data_device[g_main_device]; + char * src1_original = src1->backend == GGML_BACKEND_CPU ? + (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; + char * dst_original = dst->backend == GGML_BACKEND_CPU ? 
+ (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); @@ -8837,6 +8843,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous; dst_row_extra.data_device[g_main_device] = dst_contiguous; + const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? + cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; + const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? + cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; + for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -8851,7 +8862,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11, - nb11, cudaMemcpyDeviceToDevice, stream)); + nb11, src1_kind, stream)); num_src1_rows++; } @@ -8883,7 +8894,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1, - nb1, cudaMemcpyDeviceToDevice, stream)); + nb1, dst_kind, stream)); num_src1_rows++; } } @@ -8891,6 +8902,10 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_cuda_pool_free(src1_contiguous, as_src1); ggml_cuda_pool_free(dst_contiguous, as_dst); } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } } static void ggml_cuda_scale(const ggml_tensor * 
src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9289,7 +9304,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); - if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; } From 8fe03ffddaaa0ab5d48feaafe398151c9f22d4f6 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 21 Dec 2023 12:55:34 -0500 Subject: [PATCH 34/43] common : remove incorrect --model-draft default (#4568) --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 93d5483e4..b3425ab09 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -920,7 +920,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); + printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); From 562cf222b5129e40b312877e928eac3a02e4ec33 Mon Sep 17 00:00:00 2001 From: arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> Date: Thu, 21 Dec 2023 20:13:25 +0100 Subject: [PATCH 35/43] ggml-cuda: Fix HIP build by adding define for __trap (#4569) Regression of 139882392258671ffe5acdfcadc0bc08572d6eef HIP doesn't have trap, only abort --- ggml-cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 036668bfd..61d92d7ef 100644 --- a/ggml-cuda.cu +++ 
b/ggml-cuda.cu @@ -80,6 +80,7 @@ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess +#define __trap abort #else #include #include From 0f630fbc924aaabeea6eaf466bb4b47d13015c3e Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 21 Dec 2023 13:45:32 -0600 Subject: [PATCH 36/43] cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449) * AMD ROCm: handle UMA memory VRAM expansions This resolves #2797 by allowing ROCm AMD GPU users with a UMA to dynamically expand the VRAM allocated to the GPU. Without this, AMD ROCm users with shared CPU/GPU memory usually are stuck with the BIOS-set (or fixed) framebuffer VRAM, making it impossible to load more than 1-2 layers. Note that the model is duplicated in RAM because it's loaded once for the CPU and then copied into a second set of allocations that are managed by the HIP UMA system. We can fix this later. * clarify build process for ROCm on linux with cmake * avoid using deprecated ROCm hipMallocHost * keep simplifying the change required for UMA * cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON --- CMakeLists.txt | 4 ++++ README.md | 16 +++++++++------- ggml-cuda.cu | 5 +++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e3cd43ab3..6fc6508c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. 
batch size for using peer access") option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) +option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) @@ -377,6 +378,9 @@ if (LLAMA_HIPBLAS) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + if (LLAMA_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) + endif() add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) if (BUILD_SHARED_LIBS) set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/README.md b/README.md index 80ce194ca..73fe59bb4 100644 --- a/README.md +++ b/README.md @@ -432,14 +432,15 @@ Building the program with BLAS support may lead to some performance improvements ```bash make LLAMA_HIPBLAS=1 ``` - - Using `CMake` for Linux: + - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash - mkdir build - cd build - CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON - cmake --build . + CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ + cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + && cmake --build build -- -j 16 ``` - - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS): + On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`. + However, this hurts performance for non-integrated GPUs. + - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% mkdir build @@ -448,10 +449,11 @@ Building the program with BLAS support may lead to some performance improvements cmake --build .
``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) + Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`. The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used. - If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3. + If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. 
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): | Option | Legal values | Default | Description | diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 61d92d7ef..32603a8d1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -60,8 +60,13 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#ifdef GGML_HIP_UMA +#define cudaMalloc hipMallocManaged +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else #define cudaMalloc hipMalloc #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif #define cudaMemcpy hipMemcpy #define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpyAsync hipMemcpyAsync From 56fa50819f7a3ca2128f63b81c17c08a4454479e Mon Sep 17 00:00:00 2001 From: Finn Voorhees Date: Thu, 21 Dec 2023 14:55:02 -0500 Subject: [PATCH 37/43] metal : fix `ggml_metal_log` vargs (#4373) From 31f27758faf4a4bd08101a57c7ec3a473f771f86 Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Thu, 21 Dec 2023 11:57:48 -0800 Subject: [PATCH 38/43] llama : allow getting n_batch from llama_context in c api (#4540) * allowed getting n_batch from llama_context in c api * changed to use `uint32_t` instead of `int` * changed to use `uint32_t` instead of `int` in `llama_n_ctx` * Update llama.h --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 6 +++++- llama.h | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 90d860eb9..63ebe581b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9532,10 +9532,14 @@ const llama_model * llama_get_model(const struct llama_context * ctx) { return &ctx->model; } -int llama_n_ctx(const struct llama_context * ctx) { +uint32_t llama_n_ctx(const struct llama_context * ctx) { return ctx->cparams.n_ctx; } +uint32_t 
llama_n_batch(const struct llama_context * ctx) { + return ctx->cparams.n_batch; +} + enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } diff --git a/llama.h b/llama.h index 15ab4f80e..0be4b1337 100644 --- a/llama.h +++ b/llama.h @@ -314,7 +314,9 @@ extern "C" { LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + // TODO: become more consistent with returned int types across the API + LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); From d232aca5a73b290e218a2e48b91023d5e994203f Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Dec 2023 21:07:46 +0100 Subject: [PATCH 39/43] llama : initial ggml-backend integration (#4520) * llama : initial ggml-backend integration * add ggml-metal * cuda backend can be used through ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST access all tensor data with ggml_backend_tensor_get/set * add ggml_backend_buffer_clear zero-init KV cache buffer * add ggml_backend_buffer_is_host, used to avoid copies if possible when accessing tensor data * disable gpu backends with ngl 0 * more accurate mlock * unmap offloaded part of the model * use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap * update quantize and lora * update session copy/set to use ggml-backend ggml-ci * use posix_fadvise instead of posix_fadvise64 * ggml_backend_alloc_ctx_tensors_from_buft : remove old print * llama_mmap::align_offset : use pointers instead of references for out parameters * restore progress_callback behavior * move final progress_callback call to load_all_data * cuda : fix fprintf format string (minor) * do not offload scales * llama_mmap : avoid unmapping the same fragments again in the destructor * remove
unnecessary unmap * metal : add default log function that prints to stderr, cleanup code ggml-ci --------- Co-authored-by: Georgi Gerganov --- Makefile | 2 +- ggml-alloc.c | 16 +- ggml-backend-impl.h | 20 +- ggml-backend.c | 80 ++- ggml-backend.h | 7 + ggml-cuda.cu | 89 ++-- ggml-metal.h | 3 + ggml-metal.m | 228 +++++++-- ggml.c | 24 +- ggml.h | 13 +- llama.cpp | 1196 ++++++++++++++++++++----------------------- 11 files changed, 926 insertions(+), 752 deletions(-) diff --git a/Makefile b/Makefile index 8273f8400..512407a1d 100644 --- a/Makefile +++ b/Makefile @@ -65,7 +65,7 @@ test: $(TEST_TARGETS) ./$$test_target; \ fi; \ if [ $$? -ne 0 ]; then \ - printf 'Test $$test_target FAILED!\n\n' $$test_target; \ + printf 'Test %s FAILED!\n\n' $$test_target; \ failures=$$(( failures + 1 )); \ else \ printf 'Test %s passed.\n\n' $$test_target; \ diff --git a/ggml-alloc.c b/ggml-alloc.c index d3049efb4..a97436b17 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd if (update_backend) { view->backend = view->view_src->backend; } - view->buffer = view->view_src->buffer; + // views are initialized in the alloc buffer rather than the view_src buffer + view->buffer = alloc->buffer; view->data = (char *)view->view_src->data + view->view_offs; - // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend - // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); if (!alloc->measure) { @@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { } void ggml_allocr_free(ggml_allocr_t alloc) { + if (alloc == NULL) { + return; + } + ggml_gallocr_free(alloc->galloc); ggml_tallocr_free(alloc->talloc); free(alloc); @@ -775,7 +778,7 @@ ggml_backend_buffer_t 
ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (nbytes == 0) { - fprintf(stderr, "%s: no tensors to allocate\n", __func__); + // all the tensors in the context are already allocated return NULL; } @@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { ggml_backend_view_init(buffer, t); } + } else { + if (t->view_src != NULL) { + // view of a pre-allocated tensor + ggml_backend_view_init(buffer, t); + } } } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f588af602..05859935a 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -20,6 +20,9 @@ extern "C" { size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // check if tensor data is in host memory + // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) + bool (*is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { @@ -31,15 +34,16 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void (*free_buffer)(ggml_backend_buffer_t buffer); + void (*free_buffer) (ggml_backend_buffer_t buffer); //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void * (*get_base) 
(ggml_backend_buffer_t buffer); + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); }; struct ggml_backend_buffer { @@ -78,7 +82,7 @@ extern "C" { void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*synchronize) (ggml_backend_t backend); + void (*synchronize)(ggml_backend_t backend); // compute graph with a plan ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); diff --git a/ggml-backend.c b/ggml-backend.c index 3a22cd085..0c8c9ec43 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba return buft->iface.supports_backend(buft, backend); } +bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { + if (buft->iface.is_host) { + return buft->iface.is_host(buft); + } + return false; +} + // backend buffer ggml_backend_buffer_t ggml_backend_buffer_init( @@ -94,6 +101,14 @@ size_t 
ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); } +void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + buffer->iface.clear(buffer, value); +} + +bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { + return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); +} + ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { return buffer->buft; } @@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); - GGML_UNUSED(buffer); } static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, GGML_UNUSED(buffer); } +static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, @@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .clear = */ ggml_backend_cpu_buffer_clear, }; // for buffers from ptr, free is not called @@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .clear = */ 
ggml_backend_cpu_buffer_clear, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 @@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty GGML_UNUSED(buft); } +static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return true; + + GGML_UNUSED(buft); +} + ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, }; - return &ggml_backend_buffer_type_cpu; + return &ggml_backend_cpu_buffer_type; } +#ifdef GGML_USE_CPU_HBM + +// buffer type HBM + +#include + +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + //void * ptr = hbw_malloc(size); + void * ptr; + int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + // FIXME: this is a hack to avoid having to implement a new buffer type + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { + 
/* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .context = */ NULL, + }; + + return &ggml_backend_cpu_buffer_type_hbm; +} +#endif + struct ggml_backend_cpu_context { int n_threads; void * work_data; @@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); - cpu_plan->cgraph = *cgraph; + cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); @@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml // utils void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - GGML_ASSERT(tensor->data == NULL); + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); diff --git a/ggml-backend.h b/ggml-backend.h index 58d5ccae6..a9d2fddd7 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -21,6 +21,7 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t 
buft); // buffer GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); @@ -29,6 +30,8 @@ extern "C" { GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); // @@ -76,6 +79,10 @@ extern "C" { GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); +#ifdef GGML_USE_CPU_HBM + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif + // // Backend registry // diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 32603a8d1..f5e060d32 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9081,7 +9081,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { char * buf; CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char*)data + offset_split; + char * buf_host = (char *)data + offset_split; // set padding to 0 to avoid possible NaN values if (size > original_size) { @@ -9226,11 +9226,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW; + const bool inplace = tensor->view_src != nullptr; - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || 
tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t view_offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -9317,7 +9316,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG - fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); #endif return false; } @@ -9523,7 +9522,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { - assert(tensor->view_src->buffer->buft == buffer->buft); // TODO + assert(tensor->view_src->buffer->buft == buffer->buft); tensor->backend = tensor->view_src->backend; tensor->extra = tensor->view_src->extra; return; @@ -9554,23 +9553,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - 
UNUSED(buffer); + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - UNUSED(buffer); + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); +} + +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + + CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); } static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { @@ -9581,6 +9591,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_buffer_clear, }; // cuda buffer type @@ -9632,35 +9643,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t UNUSED(buft); } -static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = { +static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ 
ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, + /* .is_host = */ nullptr, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES]; - static bool ggml_backend_buffer_type_cuda_initialized = false; - if (!ggml_backend_buffer_type_cuda_initialized) { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + + static bool ggml_backend_cuda_buffer_type_initialized = false; + + if (!ggml_backend_cuda_buffer_type_initialized) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { - ggml_backend_buffer_type_cuda[i] = { - /* .iface = */ cuda_backend_buffer_type_interface, + ggml_backend_cuda_buffer_types[i] = { + /* .iface = */ ggml_backend_cuda_buffer_type_interface, /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, }; } - ggml_backend_buffer_type_cuda_initialized = true; + ggml_backend_cuda_buffer_type_initialized = true; } - return &ggml_backend_buffer_type_cuda[device]; + return &ggml_backend_cuda_buffer_types[device]; } // host buffer type static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; - CUDA_CHECK(cudaFreeHost(ctx->dev_ptr)); - delete ctx; + CUDA_CHECK(cudaFreeHost(buffer->context)); } static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -9673,24 +9685,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; - - UNUSED(buft); } -struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = { - /* .alloc_buffer = */ 
ggml_backend_cuda_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, -}; - ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = { - /* .iface = */ cuda_backend_host_buffer_type_interface, + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, /* .context = */ nullptr, }; - return &ggml_backend_buffer_type_cuda_host; + return &ggml_backend_cuda_buffer_type_host; } // backend @@ -9722,8 +9731,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); @@ -9733,8 +9740,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == 
ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); diff --git a/ggml-metal.h b/ggml-metal.h index bf52d9cd3..b5e02b668 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); +GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); + GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); + GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/ggml-metal.m b/ggml-metal.m index 465679a6b..e60b93b36 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -180,7 +180,15 @@ struct ggml_metal_context { @implementation GGMLMetalClass @end -ggml_log_callback ggml_metal_log_callback = NULL; + +static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { + fprintf(stderr, "%s", msg); + + UNUSED(level); + UNUSED(user_data); +} + +ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback; void * ggml_metal_log_user_data = NULL; void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) { @@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { } // temporarily defined here for compatibility between ggml-backend and the old API -struct ggml_backend_metal_buffer_context { - void * data; + +struct ggml_backend_metal_buffer { + void * data; + size_t size; id metal; }; +struct ggml_backend_metal_buffer_context { + void * 
all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; +}; + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -622,17 +642,29 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru const int64_t tsize = ggml_nbytes(t); + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + // compatibility with ggml-backend - if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) { - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context; + if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) { + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; - const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; - *offs = (size_t) ioffs; + //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - return buf_ctx->metal; + return buf_ctx->buffers[i].metal; + } + } + + GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); + + return nil; } // find the view that contains 
the tensor fully @@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute( // backend interface +// default buffer static id g_backend_device = nil; static int g_backend_device_ref_count = 0; @@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) { static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - return ctx->data; + return ctx->all_data; } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - [ctx->metal release]; + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->buffers[i].metal release]; + } ggml_backend_metal_free_device(); - free(ctx->data); - free(ctx); + if (ctx->owned) { + free(ctx->all_data); + } - UNUSED(buffer); + free(ctx); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); @@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer UNUSED(buffer); } -static struct ggml_backend_buffer_i metal_backend_buffer_i = { +static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + struct 
ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + memset(ctx->all_data, value, ctx->all_size); +} + +static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, @@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = { /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, + /* .clear = */ ggml_backend_metal_buffer_clear, }; +// default buffer type + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba size_aligned += (size_page - (size_aligned % size_page)); } - ctx->data = ggml_metal_host_malloc(size); - ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + id device = ggml_backend_metal_get_device(); + + ctx->all_data = ggml_metal_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); + if (ctx->buffers[0].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_metal_free_device(); + return NULL; + } + + GGML_METAL_LOG_INFO("%s: allocated buffer, 
size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); + + +#if TARGET_OS_OSX + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } +#else + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + + return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - GGML_UNUSED(buft); + UNUSED(buft); +} + +static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return true; + + UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { @@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, }; @@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return &ggml_backend_buffer_type_metal; } +// buffer from ptr + +ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { + struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct 
ggml_backend_metal_buffer_context)); + + ctx->all_data = data; + ctx->all_size = size; + ctx->owned = false; + ctx->n_buffers = 0; + + const size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + id device = ggml_backend_metal_get_device(); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = device.maxBufferLength - size_ovlp; + const size_t size_view = device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? 
size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + GGML_METAL_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + +#if TARGET_OS_OSX + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } +#else + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); +} + +// backend + static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } -static void ggml_backend_metal_synchronize(ggml_backend_t backend) { - UNUSED(backend); -} - static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); @@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = { /* .get_tensor_async = */ NULL, /* .cpy_tensor_from_async = */ NULL, /* 
.cpy_tensor_to_async = */ NULL, - /* .synchronize = */ ggml_backend_metal_synchronize, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, }; -// TODO: make a common log callback for all backends in ggml-backend -static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) { - fprintf(stderr, "%s", msg); - - UNUSED(level); - UNUSED(user_data); -} - ggml_backend_t ggml_backend_metal_init(void) { - ggml_metal_log_set_callback(ggml_backend_log_callback, NULL); - struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); if (ctx == NULL) { diff --git a/ggml.c b/ggml.c index 6da65bd92..236148514 100644 --- a/ggml.c +++ b/ggml.c @@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) { size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t max_size = 0; - struct ggml_object * obj = ctx->objects_begin; - - while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { - struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); - - const size_t size = ggml_nbytes(tensor); - - if (max_size < size) { - max_size = size; - } - } - - obj = obj->next; + for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) { + max_size = MAX(max_size, ggml_nbytes(tensor)); } return max_size; @@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor( return result; } -struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { +struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) { struct ggml_object * obj = ctx->objects_begin; char * const mem_buffer = ctx->mem_buffer; @@ 
-3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) { return NULL; } -struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) { +struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) { struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE); obj = obj->next; @@ -19213,6 +19201,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { return ctx->infos[i].name.data; } +enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) { + return ctx->infos[i].type; +} + // returns the index static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int idx = gguf_find_key(ctx, key); diff --git a/ggml.h b/ggml.h index beacdc8be..b17314897 100644 --- a/ggml.h +++ b/ggml.h @@ -735,8 +735,8 @@ extern "C" { GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); // Context tensor enumeration and lookup - GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); - GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); @@ -2135,10 +2135,11 @@ extern "C" { GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); - GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); - GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); 
- GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); - GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i); // overrides existing values or adds a new one GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); diff --git a/llama.cpp b/llama.cpp index 63ebe581b..ba970ce8d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,11 +1,12 @@ #define LLAMA_API_INTERNAL +//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading #include "llama.h" #include "unicode.h" #include "ggml.h" - #include "ggml-alloc.h" +#include "ggml-backend.h" #ifdef GGML_USE_CUBLAS # include "ggml-cuda.h" @@ -32,6 +33,7 @@ #include #if defined(_POSIX_MAPPED_FILES) #include + #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -712,38 +714,6 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * // llama helpers // -inline void * llama_host_malloc(size_t n) { -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - return ggml_cuda_host_malloc(n); - } else { - return malloc(n); - } -#elif GGML_USE_METAL - return ggml_metal_host_malloc(n); -#elif GGML_USE_CPU_HBM - return hbw_malloc(n); -#else - return malloc(n); -#endif -} - -inline void llama_host_free(void * ptr) { -#ifdef GGML_USE_CUBLAS - if (ggml_cublas_loaded()) { - return ggml_cuda_host_free(ptr); - } else { - return free(ptr); - } -#elif GGML_USE_METAL - return ggml_metal_host_free(ptr); -#elif GGML_USE_CPU_HBM - return hbw_free(ptr); -#else - return free(ptr); 
-#endif -} - #if defined(_WIN32) static std::string llama_format_win_err(DWORD err) { LPSTR buf; @@ -758,40 +728,10 @@ static std::string llama_format_win_err(DWORD err) { } #endif -struct llama_buffer { - void * data = NULL; - size_t size = 0; - - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { - llama_host_free(data); - - data = llama_host_malloc(n); - if (!data) { - fallback = true; - data = malloc(n); - } else { - fallback = false; - } - - GGML_ASSERT(data); - size = n; - } - - ~llama_buffer() { - if (data) { - if (fallback) { // NOLINT - free(data); - } else { - llama_host_free(data); - } - } - - data = NULL; - } +template +struct no_init { + T value; + no_init() { /* do nothing */ } }; struct llama_file { @@ -879,6 +819,9 @@ struct llama_mmap { #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; + // list of mapped fragments (first_offset, last_offset) + std::vector> mapped_fragments; + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { size = file->size; int fd = fileno(file->fp); @@ -886,17 +829,22 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ + // advise the kernel to read the file sequentially (increases readahead) + if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { + LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + strerror(errno)); + } if (prefetch) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { + if (addr == MAP_FAILED) { // NOLINT throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } if (prefetch > 0) { - // Advise the kernel to preload the mapped memory + // advise the kernel to preload the mapped memory if (posix_madvise(addr, std::min(file->size, prefetch), 
POSIX_MADV_WILLNEED)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", strerror(errno)); } } @@ -904,14 +852,81 @@ struct llama_mmap { // advise the kernel not to use readahead // (because the next page might not belong on the same node) if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { - fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", strerror(errno)); } } + + // initialize list of mapped_fragments + mapped_fragments.emplace_back(0, file->size); + } + + static void align_range(size_t * first, size_t * last, size_t page_size) { + // align first to the next page + size_t offset_in_page = *first & (page_size - 1); + size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page; + *first += offset_to_page; + + // align last to the previous page + *last = *last & ~(page_size - 1); + + if (*last <= *first) { + *last = *first; + } + } + + // partially unmap the file in the range [first, last) + void unmap_fragment(size_t first, size_t last) { + // note: this function must not be called multiple times with overlapping ranges + // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings + int page_size = sysconf(_SC_PAGESIZE); + align_range(&first, &last, page_size); + size_t len = last - first; + + if (len == 0) { + return; + } + + GGML_ASSERT(first % page_size == 0); + GGML_ASSERT(last % page_size == 0); + GGML_ASSERT(last > first); + + void * next_page_start = (uint8_t *) addr + first; + + // unmap the range + if (munmap(next_page_start, len)) { + LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + + // update the list of mapped fragments to avoid unmapping the same range again in the destructor + std::vector> new_mapped_fragments; + for (const auto & frag : 
mapped_fragments) { + if (frag.first < first && frag.second > last) { + // the range is in the middle of the fragment, split it + new_mapped_fragments.emplace_back(frag.first, first); + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first < first && frag.second > first) { + // the range starts in the middle of the fragment + new_mapped_fragments.emplace_back(frag.first, first); + } else if (frag.first < last && frag.second > last) { + // the range ends in the middle of the fragment + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first >= first && frag.second <= last) { + // the range covers the entire fragment + } else { + // the range is outside the fragment + new_mapped_fragments.push_back(frag); + } + } + mapped_fragments = std::move(new_mapped_fragments); } ~llama_mmap() { - munmap(addr, size); + for (const auto & frag : mapped_fragments) { + if (munmap((char *) addr + frag.first, frag.second - frag.first)) { + LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + } } #elif defined(_WIN32) static constexpr bool SUPPORTED = true; @@ -959,6 +974,12 @@ struct llama_mmap { } } + void unmap_fragment(size_t first, size_t last) { + // not supported + GGML_UNUSED(first); + GGML_UNUSED(last); + } + ~llama_mmap() { if (!UnmapViewOfFile(addr)) { fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", @@ -975,6 +996,13 @@ struct llama_mmap { throw std::runtime_error(std::string("mmap not supported")); } + + void unmap(size_t offset, size_t len) { + (void) offset; + (void) len; + + throw std::runtime_error(std::string("mmap not supported")); + } #endif }; @@ -1148,6 +1176,26 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } +static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + return ggml_backend_metal_buffer_type(); + } +#elif 
defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + if (n_gpu_layers > 0) { + return ggml_backend_cuda_buffer_type(0); + } +#elif defined(GGML_USE_CUBLAS) + return ggml_backend_cuda_host_buffer_type(); +#elif defined(GGML_USE_CPU_HBM) + return ggml_backend_cpu_hbm_buffer_type(); +#endif + + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(n_gpu_layers); +} + // // globals // @@ -1348,14 +1396,10 @@ struct llama_kv_cache { struct ggml_context * ctx = NULL; - llama_buffer buf; + ggml_backend_buffer_t buf = NULL; ~llama_kv_cache() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (ggml_cublas_loaded()) { for (size_t i = 0; i < k_l.size(); ++i) { ggml_cuda_free_data(k_l[i]); @@ -1363,6 +1407,11 @@ struct llama_kv_cache { } } #endif + if (ctx) { + ggml_free(ctx); + } + + ggml_backend_buffer_free(buf); } }; @@ -1402,11 +1451,11 @@ struct llama_vocab { id special_suffix_id = 32008; id special_eot_id = 32010; - int find_bpe_rank(std::string token_left, std::string token_right) const { - GGML_ASSERT(token_left.find(" ") == std::string::npos); - GGML_ASSERT(token_left.find("\n") == std::string::npos); - GGML_ASSERT(token_right.find(" ") == std::string::npos); - GGML_ASSERT(token_right.find("\n") == std::string::npos); + int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { + GGML_ASSERT(token_left.find(' ') == std::string::npos); + GGML_ASSERT(token_left.find('\n') == std::string::npos); + GGML_ASSERT(token_right.find(' ') == std::string::npos); + GGML_ASSERT(token_right.find('\n') == std::string::npos); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1448,7 +1497,7 @@ struct llama_model { struct ggml_context * ctx = NULL; // the model memory buffer - llama_buffer buf; + ggml_backend_buffer_t buf = NULL; // model memory mapped file std::unique_ptr mapping; @@ -1464,11 +1513,7 @@ 
struct llama_model { int64_t t_start_us = 0; ~llama_model() { - if (ctx) { - ggml_free(ctx); - } - -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (ggml_cublas_loaded()) { for (size_t i = 0; i < tensors_by_name.size(); ++i) { ggml_cuda_free_data(tensors_by_name[i].second); @@ -1482,24 +1527,26 @@ struct llama_model { ggml_cl_free_data(tensors_by_name[i].second); } #endif + if (ctx) { + ggml_free(ctx); + } + + ggml_backend_buffer_free(buf); } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { -#ifdef GGML_USE_METAL - if (ctx_metal) { - ggml_metal_free(ctx_metal); - } -#endif - if (alloc) { - ggml_allocr_free(alloc); - } + ggml_allocr_free(alloc); + ggml_backend_buffer_free(buf_alloc); + ggml_backend_free(backend); } llama_cparams cparams; + ggml_backend_t backend = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -1530,18 +1577,13 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; - // reusable buffer for `struct ggml_graph_plan.work_data` - std::vector work_buffer; - // memory buffers used to evaluate the model - llama_buffer buf_compute; - - llama_buffer buf_alloc; + std::vector buf_compute_meta; + ggml_backend_buffer_t buf_alloc = NULL; ggml_allocr * alloc = NULL; -#ifdef GGML_USE_METAL - ggml_metal_context * ctx_metal = NULL; -#endif + // temporary buffer for copying data to/from the backend + std::vector> buf_copy; #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -1563,9 +1605,6 @@ static bool llama_kv_cache_init( const uint32_t n_embd = hparams.n_embd_gqa(); const uint32_t n_layer = hparams.n_layer; - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; - cache.has_shift = false; cache.head = 0; @@ -1575,13 +1614,10 @@ static bool llama_kv_cache_init( cache.cells.clear(); 
cache.cells.resize(n_ctx); - cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead()); - memset(cache.buf.data, 0, cache.buf.size); - struct ggml_init_params params; - params.mem_size = cache.buf.size; - params.mem_buffer = cache.buf.data; - params.no_alloc = false; + params.mem_size = 2u*n_layer*ggml_tensor_overhead(); + params.mem_buffer = NULL; + params.no_alloc = true; cache.ctx = ggml_init(params); @@ -1595,9 +1631,7 @@ static bool llama_kv_cache_init( cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start); - - GGML_UNUSED(offload); + const int i_gpu_start = (int) n_layer - n_gpu_layers; for (int i = 0; i < (int) n_layer; i++) { ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); @@ -1606,23 +1640,35 @@ static bool llama_kv_cache_init( ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (i >= i_gpu_start) { if (offload) { ggml_cuda_assign_buffers_no_scratch(k); - vram_kv_cache += ggml_nbytes(k); ggml_cuda_assign_buffers_no_scratch(v); + vram_kv_cache += ggml_nbytes(k); vram_kv_cache += ggml_nbytes(v); + // HACK: mark tensor as allocated + k->data = v->data = (void *)(uintptr_t)1; } } #endif // GGML_USE_CUBLAS } + // allocate tensors + cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers)); + + // buf may be NULL with full offload + if (cache.buf) { + // initialize the buffer to avoid NaNs in the padding + ggml_backend_buffer_clear(cache.buf, 0); + } + if (vram_kv_cache > 0) { LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); } - GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(i_gpu_start); + GGML_UNUSED(offload); return true; } @@ -2073,14 +2119,13 @@ struct llama_model_loader { enum 
ggml_type type_max = GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name); + enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i); - n_type[meta->type]++; + n_type[type]++; - if (n_type_max < n_type[meta->type]) { - n_type_max = n_type[meta->type]; - type_max = meta->type; + if (n_type_max < n_type[type]) { + n_type_max = n_type[type]; + type_max = type; } // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); @@ -2221,34 +2266,19 @@ struct llama_model_loader { return gguf_get_tensor_name(ctx_gguf, i); } - struct ggml_tensor * get_tensor_meta(int i) const { - return ggml_get_tensor(ctx_meta, get_tensor_name(i)); + struct ggml_tensor * get_tensor_meta(const char * name) const { + return ggml_get_tensor(ctx_meta, name); } - void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { - ctx_size_p = 0; - mmapped_size_p = 0; - - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * meta = get_tensor_meta(i); - ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - (use_mmap ? 
mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta); - } + struct ggml_tensor * get_tensor_meta(int i) const { + return get_tensor_meta(get_tensor_name(i)); } struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, true); - } - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); tensor->backend = backend; // TODO: ggml_set_backend ggml_set_name(tensor, ggml_get_name(meta)); - if (backend != GGML_BACKEND_CPU) { - ggml_set_no_alloc(ctx, use_mmap); - } - n_created++; return tensor; @@ -2306,90 +2336,137 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } + void init_mapping(bool prefetch = true) { + /* + // prefetch only CPU tensors + if (use_mmap) { + size_t size_pref = 0; // prefetch + + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); + if (cur->backend == GGML_BACKEND_CPU) { + size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur); + size_pref = std::max(size_pref, tensor_end); + } + } + mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); + } + */ + // prefetch the whole file - all the data is needed anyway + if (use_mmap) { + mapping.reset(new llama_mmap(&file, prefetch ? 
-1 : 0, ggml_is_numa())); + } + } + + // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { const size_t offs = file_offset(ggml_get_name(cur)); - if (use_mmap) { - cur->data = (uint8_t *) mapping->addr + offs; + if (use_mmap && mapping) { + GGML_ASSERT(cur->data == nullptr); + cur->data = (uint8_t *)mapping->addr + offs; } else { + GGML_ASSERT(cur->data != nullptr); file.seek(offs, SEEK_SET); file.read_raw(cur->data, ggml_nbytes(cur)); } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; - size_t size_lock = 0; - size_t size_pref = 0; // prefetch for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); size_data += ggml_nbytes(cur); - if (cur->backend == GGML_BACKEND_CPU) { - size_pref += ggml_nbytes(cur); - } } - if (use_mmap) { - mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa())); + if (use_mmap && buf_mmap) { if (lmlock) { lmlock->init(mapping->addr); } } - size_t done_size = 0; +#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) + const bool legacy_offload = true; +#else + const bool legacy_offload = false; +#endif + + std::vector> read_buf; + + size_t size_done = 0; + + size_t mmap_first = -1; + size_t mmap_last = 0; + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); + 
progress_callback((float) size_done / size_data, progress_callback_user_data); } - // allocate temp buffer if not using mmap - if (!use_mmap && cur->data == NULL) { - GGML_ASSERT(cur->backend != GGML_BACKEND_CPU); - #ifdef GGML_USE_CPU_HBM - cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur)); - #else - cur->data = (uint8_t*)malloc(ggml_nbytes(cur)); - #endif - } + const size_t offs = file_offset(ggml_get_name(cur)); - load_data_for(cur); - - switch (cur->backend) { - case GGML_BACKEND_CPU: - if (use_mmap && lmlock) { - size_lock += ggml_nbytes(cur); - lmlock->grow_to(size_lock); + if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { + if (use_mmap && mapping) { + if (buf_mmap) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + ggml_nbytes(cur)); + } + mmap_first = std::min(mmap_first, offs); + mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur)); + } else { + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } - break; -#ifdef GGML_USE_CUBLAS - case GGML_BACKEND_GPU: - case GGML_BACKEND_GPU_SPLIT: - // old code: - //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor); - - // TODO: test if this works !! 
- ggml_cuda_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); + } else { + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); + } else { + read_buf.resize(ggml_nbytes(cur)); + file.seek(offs, SEEK_SET); + file.read_raw(read_buf.data(), ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } - break; + } + } else { + // HACK: mark tensor as allocated + cur->data = (void *)(uintptr_t)1; + void * data; + if (use_mmap && mapping) { + data = (uint8_t *) mapping->addr + offs; + } else { + read_buf.resize(ggml_nbytes(cur)); + file.seek(offs, SEEK_SET); + file.read_raw(read_buf.data(), ggml_nbytes(cur)); + data = read_buf.data(); + } + +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + ggml_cuda_transform_tensor(data, cur); #elif defined(GGML_USE_CLBLAST) - case GGML_BACKEND_GPU: - ggml_cl_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; + GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); + ggml_cl_transform_tensor(data, cur); +#else + GGML_ASSERT(!"GPU tensor without a GPU backend"); + GGML_UNUSED(data); #endif - default: - continue; } - done_size += ggml_nbytes(cur); + size_done += ggml_nbytes(cur); + } + + // unmap offloaded tensors and metadata + if (use_mmap && mapping) { + mapping->unmap_fragment(0, mmap_first); + mapping->unmap_fragment(mmap_last, mapping->size); + } + + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); } } }; @@ -2983,25 +3060,16 @@ static void llm_load_tensors( model.n_gpu_layers = n_gpu_layers; - size_t ctx_size; - size_t mmapped_size; + size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors; - ml.calc_sizes(ctx_size, mmapped_size); - - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { 
- model.buf.resize(ctx_size); - if (use_mlock) { - model.mlock_buf.init (model.buf.data); - model.mlock_buf.grow_to(model.buf.size); - } - struct ggml_init_params params = { - /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.data, - /*.no_alloc =*/ ml.use_mmap, + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, }; model.ctx = ggml_init(params); @@ -3015,22 +3083,21 @@ static void llm_load_tensors( enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (ggml_cublas_loaded()) { LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); ggml_cuda_set_main_device(main_gpu); - llama_backend_offload = GGML_BACKEND_GPU; + llama_backend_offload = GGML_BACKEND_GPU; llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; } #elif defined(GGML_USE_CLBLAST) LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; + llama_backend_offload = GGML_BACKEND_GPU; llama_backend_offload_split = GGML_BACKEND_GPU; #endif - // prepare memory for the weights - size_t vram_weights = 0; + // create tensors for the weights { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -3059,13 +3126,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3115,28 +3175,6 @@ static void llm_load_tensors( layer.ffn_up_exp[x] = ml.create_tensor(ctx, 
tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); } } - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + - (layer.bq ? ggml_nbytes(layer.bq) : 0) + - (layer.bk ? ggml_nbytes(layer.bk) : 0) + - (layer.bv ? ggml_nbytes(layer.bv) : 0) + - (layer.bo ? ggml_nbytes(layer.bo) : 0) + - ggml_nbytes(layer.ffn_norm); - - if (layer.ffn_gate_inp == nullptr) { - vram_weights += - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } else { - vram_weights += ggml_nbytes(layer.ffn_gate_inp); - for (uint32_t x = 0; x < hparams.n_expert; ++x) { - vram_weights += - ggml_nbytes(layer.ffn_gate_exp[x]) + ggml_nbytes(layer.ffn_down_exp[x]) + ggml_nbytes(layer.ffn_up_exp[x]); - } - } - } } } break; case LLM_ARCH_BAICHUAN: @@ -3156,13 +3194,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3189,19 +3220,10 @@ static void llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) 
+ - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_FALCON: { - // TODO: CPU-only for now - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output @@ -3220,14 +3242,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3248,11 +3262,6 @@ static void llm_load_tensors( if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(layer.attn_norm_2); - vram_weights += ggml_nbytes(layer.attn_norm_2_b); - } } layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); @@ -3260,13 +3269,6 @@ static void llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) 
+ - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_STARCODER: @@ -3290,14 +3292,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3329,16 +3323,6 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); - } } } break; case LLM_ARCH_PERSIMMON: @@ -3360,14 +3344,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output 
== GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3397,8 +3373,6 @@ static void llm_load_tensors( } break; case LLM_ARCH_BLOOM: { - // TODO: CPU-only for now - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); @@ -3419,14 +3393,6 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += ggml_nbytes(model.output_norm_b); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3458,16 +3424,6 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); - } } } break; case LLM_ARCH_MPT: @@ -3489,13 +3445,6 @@ static void llm_load_tensors( model.output_norm = 
ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3518,16 +3467,6 @@ static void llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + - ggml_nbytes(layer.wqkv) + - ggml_nbytes(layer.wo) + - ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_down) + - ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_STABLELM: @@ -3550,13 +3489,6 @@ static void llm_load_tensors( model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } } const uint32_t n_ff = hparams.n_ff; @@ -3588,13 +3520,6 @@ static void llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - 
vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_QWEN: @@ -3614,14 +3539,7 @@ static void llm_load_tensors( model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - } - if (backend_output == GGML_BACKEND_GPU_SPLIT) { - vram_weights += ggml_nbytes(model.output); - } - } + } const uint32_t n_ff = hparams.n_ff / 2; @@ -3646,13 +3564,6 @@ static void llm_load_tensors( layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); - } } } break; case LLM_ARCH_PHI2: @@ -3676,13 +3587,6 @@ static void llm_load_tensors( model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); - - if (backend_norm == GGML_BACKEND_GPU) { - vram_weights += ggml_nbytes(model.output_norm); - vram_weights += 
ggml_nbytes(model.output_norm_b); - vram_weights += ggml_nbytes(model.output); - vram_weights += ggml_nbytes(model.output_b); - } } const uint32_t n_ff = hparams.n_ff; @@ -3711,15 +3615,6 @@ static void llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == GGML_BACKEND_GPU) { - vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + - ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + - ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + - ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + - ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); - } } } break; default: @@ -3729,16 +3624,78 @@ static void llm_load_tensors( ml.done_getting_tensors(); + ml.init_mapping(); + + // allocate tensors + size_t vram_weights = 0; + size_t buf_size = 0; + + ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); + + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend + if (t->backend == GGML_BACKEND_CPU) { + buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); + } else { + vram_weights += ggml_nbytes(t); + } + } + + // create backend buffer + ggml_backend_buffer_t buf_mmap = nullptr; + +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + if (ml.use_mmap) { + const size_t max_size = ggml_get_max_tensor_size(ctx); + model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); + buf_mmap = model.buf; + } else { + model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + } + } +#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + // for testing only + 
if (n_gpu_layers > 0) { + model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); + } +#endif + + if (model.buf == nullptr) { + // CPU backend, and indirectly CUDA and OpenCL + if (ml.use_mmap) { + model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + buf_mmap = model.buf; + } else { + // allocate only CPU tensors + model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); + ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (t->backend == GGML_BACKEND_CPU) { + ggml_tallocr_alloc(alloc, t); + } + } + ggml_tallocr_free(alloc); + } + } + + if (use_mlock && ggml_backend_buffer_is_host(model.buf)) { + model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); + model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); + } + // print memory requirements { - // this is the total memory required to run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory + size_t sys_mem_required = ctx_size + buf_size; - LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); + if (sys_mem_required > 0) { + LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); + } + if (vram_weights > 0) { + LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); + } -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -3746,39 +3703,26 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } -#ifdef GGML_USE_CUBLAS const 
int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; -#elif GGML_USE_CLBLAST - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; -#endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); -#else - (void) n_gpu_layers; #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) } - // populate `tensors_by_name` +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + ggml_cuda_set_tensor_split(tensor_split); +#else + GGML_UNUSED(tensor_split); +#endif // GGML_USE_CUBLAS + + // populate tensors_by_name for (int i = 0; i < ml.n_tensors; ++i) { struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - (void) tensor_split; -#ifdef GGML_USE_CUBLAS - { - ggml_cuda_set_tensor_split(tensor_split); - } -#endif - - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); - - if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); - } + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? 
&model.mlock_mmap : NULL); model.mapping = std::move(ml.mapping); @@ -4211,7 +4155,7 @@ struct llm_build_context { const llm_build_cb & cb; - llama_buffer & buf_compute; + std::vector & buf_compute_meta; struct ggml_context * ctx0 = nullptr; @@ -4221,35 +4165,35 @@ struct llm_build_context { const llama_batch & batch, const llm_build_cb & cb, bool worst_case) : - model (lctx.model), - hparams (model.hparams), - cparams (lctx.cparams), - batch (batch), - kv_self (lctx.kv_self), - n_embd (hparams.n_embd), - n_layer (hparams.n_layer), - n_ctx (cparams.n_ctx), - n_head (hparams.n_head), - n_head_kv (hparams.n_head_kv), - n_embd_head (hparams.n_embd_head()), - n_embd_gqa (hparams.n_embd_gqa()), - n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), - freq_base (cparams.rope_freq_base), - freq_scale (cparams.rope_freq_scale), - ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), - beta_fast (cparams.yarn_beta_fast), - beta_slow (cparams.yarn_beta_slow), - norm_eps (hparams.f_norm_eps), - norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (batch.n_tokens), - n_kv (worst_case ? n_ctx : kv_self.n), - kv_head (worst_case ? 
n_ctx - n_tokens : kv_self.head), - n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), - cb (cb), - buf_compute (lctx.buf_compute) { + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head (hparams.n_embd_head()), + n_embd_gqa (hparams.n_embd_gqa()), + n_expert (hparams.n_expert), + n_expert_used (hparams.n_expert_used), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute_meta (lctx.buf_compute_meta) { GGML_ASSERT(!!kv_self.ctx); // all initializations should be done in init() @@ -4257,8 +4201,8 @@ struct llm_build_context { void init() { struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), /*.no_alloc =*/ true, }; @@ -5737,8 +5681,8 @@ static const std::unordered_map k_offload_map { "pos_embd", OFFLOAD_FUNC_NR }, { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) - { "Q_scale", OFFLOAD_FUNC_FRC }, - { "KQ_scale", OFFLOAD_FUNC_FRC }, + { "Q_scale", OFFLOAD_FUNC_NOP }, + { "KQ_scale", OFFLOAD_FUNC_NOP }, { "KQ_mask", OFFLOAD_FUNC_FRC }, { "K_shift", OFFLOAD_FUNC_FRC }, @@ -5845,7 +5789,7 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) const bool do_offload = true; #else const bool do_offload = true; // TODO: set to false after finishing refactoring @@ -5873,7 +5817,7 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; - memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); } alloc_inp_tokens = true; @@ -5886,7 +5830,7 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; - memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*ggml_element_size(cur)); } alloc_inp_embd = true; @@ -5898,11 +5842,8 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } + static_assert(std::is_same::value, "llama_pos must be int32_t"); + ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*ggml_element_size(cur)); } alloc_inp_pos = true; @@ -5913,7 +5854,8 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_embd_head = model.hparams.n_embd_head(); - ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + float f = 1.0f/sqrtf(float(n_embd_head)); + ggml_backend_tensor_set(cur, &f, 0, sizeof(f)); } 
alloc_inp_Q_scale = true; @@ -5924,13 +5866,15 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_embd_head = model.hparams.n_embd_head(); + float f; if (model.arch == LLM_ARCH_PHI2) { // with phi2, we scale the Q to avoid precision issues // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 - ggml_set_f32(cur, 1.0f); + f = 1.0f; } else { - ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + f = 1.0f/sqrtf(float(n_embd_head)); } + ggml_backend_tensor_set(cur, &f, 0, sizeof(f)); } alloc_inp_KQ_scale = true; @@ -5943,8 +5887,13 @@ static struct ggml_cgraph * llama_build_graph( const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; - float * data = (float *) cur->data; - memset(data, 0, ggml_nbytes(cur)); + float * data; + if (ggml_backend_buffer_is_host(cur->buffer)) { + data = (float *) cur->data; + } else { + lctx.buf_copy.resize(ggml_nbytes(cur)); + data = (float *) lctx.buf_copy.data(); + } for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -5952,12 +5901,20 @@ static struct ggml_cgraph * llama_build_graph( const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { + float f; if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + f = -INFINITY; + } else { + f = 0; } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } } + + if (data != cur->data) { + ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); + } } alloc_inp_KQ_mask = true; @@ -5969,11 +5926,21 @@ static struct ggml_cgraph * llama_build_graph( if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; + int32_t * data; + if (ggml_backend_buffer_is_host(cur->buffer)) { + data = (int32_t *) cur->data; + } else { + lctx.buf_copy.resize(ggml_nbytes(cur)); + data = (int32_t *) 
lctx.buf_copy.data(); + } for (int i = 0; i < n_ctx; ++i) { data[i] = lctx.kv_self.cells[i].delta; } + + if (data != cur->data) { + ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur)); + } } alloc_inp_K_shift = true; @@ -6010,7 +5977,7 @@ static struct ggml_cgraph * llama_build_graph( static const std::unordered_map> k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, { OFFLOAD_FUNC_OUT, "CPU" }, -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) { OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, @@ -6083,7 +6050,7 @@ static struct ggml_cgraph * llama_build_graph( offload_func_t func = ggml_offload_nop; // this is needed for compatibility with Metal for example -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; #else static offload_func_t ggml_offload_gpu = ggml_offload_nop; @@ -6305,11 +6272,12 @@ static int llama_decode_internal( GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); for (int i = 0; i < gf->n_leafs; i++) { ggml_tensor * node = gf->leafs[i]; if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); ggml_cuda_copy_to_device(node); } } @@ -6317,7 +6285,7 @@ static int llama_decode_internal( for (int i = 0; i < gf->n_nodes; i++) { ggml_tensor * node = gf->nodes[i]; if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_assign_scratch_offset(node, (char 
*)node->data - buf_alloc_base); } } @@ -6344,23 +6312,23 @@ static int llama_decode_internal( n_threads = 1; } -#if GGML_USE_MPI +#ifdef GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif #ifdef GGML_USE_METAL - if (lctx.ctx_metal) { - ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - ggml_metal_graph_compute(lctx.ctx_metal, gf); - } else { - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + if (ggml_backend_is_metal(lctx.backend)) { + ggml_backend_metal_set_n_cb(lctx.backend, n_threads); } -#else - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif -#if GGML_USE_MPI + if (ggml_backend_is_cpu(lctx.backend)) { + ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); + } + ggml_backend_graph_compute(lctx.backend, gf); + +#ifdef GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif @@ -6412,20 +6380,20 @@ static int llama_decode_internal( if (batch.logits[i] == 0) { continue; } - memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[i] = true; #endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); #endif } else { logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); + ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[0] = true; #endif @@ -6437,7 +6405,7 @@ static int llama_decode_internal( auto & embedding_out = 
lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); + ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); } // measure the performance only for the single-token evals @@ -8395,12 +8363,6 @@ void llama_beam_search(llama_context * ctx, // quantization // -template -struct no_init { - T value; - no_init() { /* do nothing */ } -}; - struct quantize_state_internal { const llama_model & model; const llama_model_quantize_params * params; @@ -8643,9 +8605,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #endif llama_model_loader ml(fname_inp, use_mmap, NULL); - if (ml.use_mmap) { - ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa())); - } + ml.init_mapping(false); // no prefetching? llama_model model; llm_load_arch(ml, model); @@ -8944,29 +8904,10 @@ static int llama_apply_lora_from_file_internal( // load base model std::unique_ptr ml; - unique_context base_ctx(nullptr, ggml_free); - std::vector base_buf; - if (path_base_model) { + if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL)); - - size_t ctx_size; - size_t mmapped_size; - ml->calc_sizes(ctx_size, mmapped_size); - - base_buf.resize(ctx_size); - - ggml_init_params base_params; - base_params.mem_size = base_buf.size(); - base_params.mem_buffer = base_buf.data(); - base_params.no_alloc = ml->use_mmap; - - base_ctx.reset(ggml_init(base_params)); - - // maybe this should be in llama_model_loader - if (ml->use_mmap) { - ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); - } + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); + ml->init_mapping(false); // no prefetching } 
// read tensors and apply @@ -9058,7 +8999,7 @@ static int llama_apply_lora_from_file_internal( offload_func_t offload_func = ggml_offload_nop; offload_func_t offload_func_force_inplace = ggml_offload_nop; -#ifdef GGML_USE_CUBLAS +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { if (dest_t->type != GGML_TYPE_F16) { throw std::runtime_error(format( @@ -9079,7 +9020,7 @@ static int llama_apply_lora_from_file_internal( return 1; } - base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU); + base_t = ml->get_tensor_meta(base_name.c_str()); ml->load_data_for(base_t); } else { base_t = dest_t; @@ -9364,7 +9305,39 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + // initialize backend +#ifdef GGML_USE_METAL + if (model->n_gpu_layers > 0) { + ctx->backend = ggml_backend_metal_init(); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + } + } +#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) + // for testing only + if (model->n_gpu_layers > 0) { + ctx->backend = ggml_backend_cuda_init(0); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + } + } +#endif + + if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) { + ctx->backend = ggml_backend_cpu_init(); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + } + } + + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__); + delete ctx; + return nullptr; + } + + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, 
type_k, type_v, + cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9400,12 +9373,11 @@ struct llama_context * llama_new_context_with_model( } { - static const size_t tensor_alignment = 32; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); // create measure allocator - ctx->alloc = ggml_allocr_new_measure(tensor_alignment); + ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9413,98 +9385,50 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); -#ifdef GGML_USE_METAL - if (model->n_gpu_layers > 0) { - ctx->ctx_metal = ggml_metal_init(1); - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; - } - //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; + size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf); - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: compute buffer total 
size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0); - // recreate allocator with exact memory requirements + // create allocator again with exact memory requirements ggml_allocr_free(ctx->alloc); - ctx->buf_alloc.resize(alloc_size); - ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); -#ifdef GGML_USE_METAL - if (ctx->ctx_metal) { - //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif -#ifdef GGML_USE_CUBLAS - ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); + ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); + ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); +#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) + if (model->n_gpu_layers > 0) { + ggml_cuda_set_scratch_size(alloc_size); + LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - // calculate total VRAM usage - auto add_tensor = [](const ggml_tensor * t, size_t & size) { - if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { - size += ggml_nbytes(t); + // calculate total VRAM usage + auto add_tensor = [](const ggml_tensor * t, size_t & size) { + if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { + size += ggml_nbytes(t); + } + }; + size_t model_vram_size = 0; + for (const auto & kv : model->tensors_by_name) { + add_tensor(kv.second, model_vram_size); } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } + size_t kv_vram_size = 0; + for (auto & k : ctx->kv_self.k_l) { 
+ add_tensor(k, kv_vram_size); + } + for (auto & v : ctx->kv_self.v_l) { + add_tensor(v, kv_vram_size); + } - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; + size_t ctx_vram_size = alloc_size + kv_vram_size; + size_t total_vram_size = model_vram_size + ctx_vram_size; - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, + total_vram_size / 1024.0 / 1024.0, + model_vram_size / 1024.0 / 1024.0, + ctx_vram_size / 1024.0 / 1024.0); + } #endif } - -#ifdef GGML_USE_METAL - if (model->n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - - void * data_ptr = NULL; - size_t data_size = 0; - - if (ctx->model.mapping) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size = ggml_get_mem_size (ctx->model.ctx); - } - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); -#undef LLAMA_METAL_CHECK_BUF - } -#endif } #ifdef GGML_USE_MPI @@ -9796,7 +9720,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t 
s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->kv_self.buf.size; + const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf); const size_t s_total = ( + s_rng_size @@ -9924,7 +9848,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_embd = hparams.n_embd_gqa(); const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = kv_self.buf.size; + const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -9940,17 +9864,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - std::vector> kout2d_data(n_layer); - std::vector> vout2d_data(n_layer); + std::vector kout2d(n_layer); + std::vector vout2d(n_layer); for (int il = 0; il < (int) n_layer; ++il) { - ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); - kout2d_data[il].resize(ggml_nbytes(kout2d)); - kout2d->data = kout2d_data[il].data(); - - ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); - vout2d_data[il].resize(ggml_nbytes(vout2d)); - vout2d->data = vout2d_data[il].data(); + kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], n_embd, kv_head, @@ -9960,20 +9879,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat kv_head, n_embd, elt_size*n_ctx, 0); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d)); - 
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il])); } - ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); + + ggml_backend_graph_compute(ctx->backend, gf); + + std::vector tmp_buf; + for (int il = 0; il < (int) n_layer; ++il) { + tmp_buf.resize(ggml_nbytes(kout2d[il])); + ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + + tmp_buf.resize(ggml_nbytes(vout2d[il])); + ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + } ggml_free(cpy_ctx); - // our data is now in the kout2d_data and vout2d_data buffers - // write them to file - for (uint32_t il = 0; il < n_layer; ++il) { - data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size()); - data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size()); - } + ggml_backend_buffer_free(buf); } for (uint32_t i = 0; i < kv_size; ++i) { @@ -10071,21 +9998,19 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - GGML_ASSERT(kv_self.buf.size == kv_buf_size); + GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k_l[0]); ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); - kin2d->data = (void *) inp; - inp += ggml_nbytes(kin2d); + std::vector kin2d(n_layer); + std::vector vin2d(n_layer); - ggml_tensor * vin2d = 
ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); - vin2d->data = (void *) inp; - inp += ggml_nbytes(vin2d); + for (int il = 0; il < n_layer; ++il) { + kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); + vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], n_embd, kv_head, @@ -10095,13 +10020,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { kv_head, n_embd, elt_size*n_ctx, 0); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d)); } - ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); + + // load data into the tensors + for (int il = 0; il < n_layer; ++il) { + ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il])); + inp += ggml_nbytes(kin2d[il]); + + ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il])); + inp += ggml_nbytes(vin2d[il]); + } + + ggml_backend_graph_compute(ctx->backend, gf); ggml_free(cpy_ctx); + + ggml_backend_buffer_free(buf); } ctx->kv_self.head = kv_head; From 4a5f9d629ecfd0a53afdddbaf54a4fa02d9a9ce9 Mon Sep 17 00:00:00 2001 From: Samuel Maynard Date: Thu, 21 Dec 2023 22:36:26 +0200 Subject: [PATCH 40/43] ci : add `jlumbroso/free-disk-space` to docker workflow (#4150) * [github][workflows][docker]: removes hardcoded `ggerganov` from `ghcr` repo * [github][workflows][docker]: adds `jlumbroso/free-disk-space` --- .github/workflows/docker.yml | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9c90c77ac..a7165a38f 100644 --- a/.github/workflows/docker.yml 
+++ b/.github/workflows/docker.yml @@ -52,6 +52,23 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} + # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: Build and push Docker image (versioned) if: github.event_name == 'push' uses: docker/build-push-action@v4 @@ -59,7 +76,7 @@ jobs: context: . push: true platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" file: ${{ matrix.config.dockerfile }} - name: Build and push Docker image (tagged) @@ -68,5 +85,5 @@ jobs: context: . 
push: ${{ github.event_name == 'push' }} platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" + tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" file: ${{ matrix.config.dockerfile }} From 32259b2dade6f6856739bf7ba0a4ff7b474dc760 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Dec 2023 23:07:58 +0200 Subject: [PATCH 41/43] gguf : simplify example dependencies --- Makefile | 2 +- examples/gguf/CMakeLists.txt | 2 +- examples/gguf/gguf.cpp | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 512407a1d..68df7702a 100644 --- a/Makefile +++ b/Makefile @@ -606,7 +606,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual -gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) +gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index 7d1806af3..6481f087b 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git 
a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 9e24bf24c..e67be4fb2 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,5 +1,4 @@ #include "ggml.h" -#include "llama.h" #include #include From 769a7bc85eaa44e3d7eadf39abfeff7bb0b9cc2f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Dec 2023 23:20:36 +0200 Subject: [PATCH 42/43] gguf-py : fix broken link --- gguf-py/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/README.md b/gguf-py/README.md index a27d2fc0e..22d7ffa52 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -3,7 +3,7 @@ This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) (GGML Universal File) format. -See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py) +See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) as an example for its usage. 
## Installation From afefa319f1f59b002dfa0d1ef407a2c74bd9770b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Dec 2023 23:20:49 +0200 Subject: [PATCH 43/43] ggml : change ggml_scale to take a float instead of tensor (#4573) * ggml : change ggml_scale to take a float instead of tensor * ggml : fix CPU implementation * tests : fix test-grad0 ggml-ci --- examples/baby-llama/baby-llama.cpp | 15 +-- examples/export-lora/export-lora.cpp | 2 +- examples/finetune/finetune.cpp | 42 +++---- examples/llava/clip.cpp | 8 +- .../train-text-from-scratch.cpp | 14 +-- ggml-cuda.cu | 14 +-- ggml-metal.m | 6 +- ggml.c | 42 +++---- ggml.h | 4 +- llama.cpp | 119 +++--------------- tests/test-backend-ops.cpp | 9 +- tests/test-grad0.cpp | 12 +- 12 files changed, 82 insertions(+), 205 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 2dc2988d3..e7d2ad592 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -575,10 +575,7 @@ static struct ggml_tensor * forward( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, 1] @@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch( // KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); // KQ_masked = mask_past(KQ_scaled) @@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora( // KQ_scaled = KQ / sqrt(n_embd/n_head) 
// KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // KQ_masked shape [n_past + N, N, n_head, 1] diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index c8754ce70..58fbe204d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora( ) { struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); if (scaling != 1.0f) { - ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling)); + ab = ggml_scale(ctx, ab, scaling); } struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 6a668d764..7b1333a9d 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -269,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h float rope_freq_scale = 1.0f; GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); if (rope_freq_scale != 1.0f) { hparams->rope_freq_scale = 1.0f / rope_freq_scale; } @@ -612,6 +612,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_rot = hparams.n_embd_head(); const int n_embd_head = hparams.n_embd_head(); const int n_embd_gqa = hparams.n_embd_gqa(); + const float rms_norm_eps = hparams.f_norm_rms_eps; const 
float rope_freq_base = hparams.rope_freq_base; const float rope_freq_scale = hparams.rope_freq_scale; @@ -680,10 +681,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( checkpoints.push_back(t01); } - struct ggml_tensor * kv_scale = NULL; - if (!enable_flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); - } + const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -781,32 +779,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // make sure some tensors are not reallocated by inserting new temporary nodes depending on them int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 
1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); } // allocating checkpoints in one block to reduce memory fragmentation diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 112465968..f06ec400d 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -330,12 +330,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima ggml_repeat(ctx0, model.pre_ln_b, embeddings)); } - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - 
ggml_allocr_alloc(ctx->alloc, KQ_scale); - if (!ggml_allocr_is_measure(ctx->alloc)) { - ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head)); - } - // loop over layers for (int il = 0; il < n_layer - 1; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states @@ -356,7 +350,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima struct ggml_tensor * Q = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur)); - Q = ggml_scale_inplace(ctx0, Q, KQ_scale); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index f7ed63365..4a9a2340b 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -369,10 +369,7 @@ static struct ggml_tensor * llama_build_train_graphs( checkpoints.push_back(t00); checkpoints.push_back(t01); - struct ggml_tensor * kv_scale = NULL; - if (!enable_flash_attn) { - kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); - } + const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); for (int il = 0; il < n_layer; ++il) { struct my_llama_layer & layer = model->layers[il]; @@ -444,14 +441,13 @@ static struct ggml_tensor * llama_build_train_graphs( // make sure some tensors are not reallocated by inserting new temporary nodes depending on them int n_leafs_before = gb->n_leafs; int n_nodes_before = gb->n_nodes; - struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, 
t36, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f5e060d32..ac91ee12e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -7700,17 +7700,9 @@ inline void ggml_cuda_op_scale( const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - float scale; - // HACK: support for ggml backend interface - if (src1->backend == GGML_BACKEND_CPU) { - scale = ((float *) src1->data)[0]; - } else { - // TODO: pass pointer to kernel instead of copying to host - CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost)); - } + const float scale = ((float *) dst->op_params)[0]; scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -7757,8 +7749,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; - const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; @@ -7779,7 +7769,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, 
main_stream)); } - if (use_src1 && !src1_stays_on_host) { + if (use_src1) { if (src1_on_device) { src1_ddf = (float *) src1_extra->data_device[g_main_device]; } else { diff --git a/ggml-metal.m b/ggml-metal.m index e60b93b36..51a72ae33 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1293,7 +1293,7 @@ void ggml_metal_graph_compute( { GGML_ASSERT(ggml_is_contiguous(src0)); - const float scale = *(const float *) src1->data; + const float scale = *(const float *) dst->op_params; int64_t n = ggml_nelements(dst); @@ -1304,8 +1304,8 @@ void ggml_metal_graph_compute( [encoder setComputePipelineState:ctx->pipeline_scale]; } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; diff --git a/ggml.c b/ggml.c index 236148514..f27920a2d 100644 --- a/ggml.c +++ b/ggml.c @@ -4171,23 +4171,23 @@ struct ggml_tensor * ggml_out_prod( static struct ggml_tensor * ggml_scale_impl( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b, + float s, bool inplace) { - GGML_ASSERT(ggml_is_scalar(b)); GGML_ASSERT(ggml_is_padded_1d(a)); bool is_node = false; - if (a->grad || b->grad) { + if (a->grad) { is_node = true; } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &s, sizeof(s)); + result->op = GGML_OP_SCALE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -4195,15 +4195,15 @@ static struct ggml_tensor * ggml_scale_impl( struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, false); + float s) { + return ggml_scale_impl(ctx, a, s, false); } struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, true); + float s) { + return ggml_scale_impl(ctx, a, s, true); } // ggml_set @@ -10325,19 +10325,17 @@ static void ggml_compute_forward_out_prod( static void ggml_compute_forward_scale_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } // scale factor - const float v = *(float *) src1->data; + const float v = *(float *) dst->op_params; const int ith = params->ith; const int nth = params->nth; @@ -10368,12 +10366,11 @@ static void ggml_compute_forward_scale_f32( static void ggml_compute_forward_scale( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_scale_f32(params, src0, src1, dst); + ggml_compute_forward_scale_f32(params, src0, dst); } break; default: { @@ -14383,7 +14380,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor); } break; case GGML_OP_SET: { 
@@ -14839,7 +14836,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) { if (ggml_hash_contains(zero_table, a)) { - struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0)); + struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); } else { return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); @@ -14975,7 +14972,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad, ggml_scale(ctx, ggml_mul(ctx, src0, tensor->grad), - ggml_new_f32(ctx, 2.0f)), + 2.0f), zero_table); } } break; @@ -14989,7 +14986,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_div(ctx, tensor->grad, tensor), - ggml_new_f32(ctx, 0.5f)), + 0.5f), zero_table); } } break; @@ -15155,17 +15152,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { + const float s = ((float *) tensor->op_params)[0]; + src0->grad = ggml_add_or_set(ctx, src0->grad, - ggml_scale_impl(ctx, tensor->grad, src1, false), - zero_table); - } - if (src1->grad) { - src1->grad = - ggml_add_or_set(ctx, - src1->grad, - ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), + ggml_scale_impl(ctx, tensor->grad, s, false), zero_table); } } break; diff --git a/ggml.h b/ggml.h index b17314897..75918502b 100644 --- a/ggml.h +++ b/ggml.h @@ -1094,13 +1094,13 @@ extern "C" { GGML_API struct ggml_tensor * ggml_scale( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + float s); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_scale_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + 
float s); // b -> view(a,offset,nb1,nb2,3), return modified a GGML_API struct ggml_tensor * ggml_set( diff --git a/llama.cpp b/llama.cpp index ba970ce8d..d6c192441 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4032,13 +4032,12 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * wo, struct ggml_tensor * wo_b, struct ggml_tensor * q_cur, - struct ggml_tensor * kq_scale, struct ggml_tensor * kq_mask, int64_t n_ctx, int32_t n_tokens, int32_t n_kv, float max_alibi_bias, - float scale, + float kq_scale, const llm_build_cb & cb, int il) { const int64_t n_embd = hparams.n_embd; @@ -4086,7 +4085,7 @@ static struct ggml_tensor * llm_build_kqv( kq = ggml_soft_max(ctx, kq); cb(kq, "kq_soft_max", il); } else { - kq = ggml_soft_max_ext(ctx, kq, kq_mask, scale); + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale); cb(kq, "kq_soft_max_ext", il); } @@ -4231,10 +4230,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4295,7 +4290,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4416,10 +4411,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it 
will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4478,7 +4469,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4536,10 +4527,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4602,7 +4589,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4659,10 +4646,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4702,7 +4685,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 
1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4759,10 +4742,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4911,7 +4890,7 @@ struct llm_build_context { // TODO: not tested, could be broken cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4965,10 +4944,6 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5002,7 +4977,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5056,10 +5031,6 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5099,7 +5070,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5150,10 +5121,6 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5193,7 +5160,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5253,10 +5220,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5306,7 +5269,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, 
model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5366,10 +5329,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5423,7 +5382,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5482,14 +5441,6 @@ struct llm_build_context { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // Q_scale - struct ggml_tensor * Q_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(Q_scale, "Q_scale", -1); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5531,7 +5482,9 @@ struct llm_build_context { ); cb(Qcur, "Qcur", il); - Qcur = ggml_scale(ctx0, Qcur, Q_scale); + // with phi2, we scale the Q to avoid precision issues + // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 + Qcur = ggml_scale(ctx0, Qcur, 
1.0f/sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( @@ -5544,7 +5497,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -5681,8 +5634,6 @@ static const std::unordered_map k_offload_map { "pos_embd", OFFLOAD_FUNC_NR }, { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) - { "Q_scale", OFFLOAD_FUNC_NOP }, - { "KQ_scale", OFFLOAD_FUNC_NOP }, { "KQ_mask", OFFLOAD_FUNC_FRC }, { "K_shift", OFFLOAD_FUNC_FRC }, @@ -5784,8 +5735,6 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_tokens = false; bool alloc_inp_embd = false; bool alloc_inp_pos = false; - bool alloc_inp_Q_scale = false; - bool alloc_inp_KQ_scale = false; bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; @@ -5849,37 +5798,6 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_pos = true; } - if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd_head = model.hparams.n_embd_head(); - float f = 1.0f/sqrtf(float(n_embd_head)); - ggml_backend_tensor_set(cur, &f, 0, sizeof(f)); - } - - alloc_inp_Q_scale = true; - } - - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd_head = model.hparams.n_embd_head(); - float f; - if (model.arch == LLM_ARCH_PHI2) { - // with phi2, we scale the Q to avoid precision issues - // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 - f = 1.0f; - } else { - f = 1.0f/sqrtf(float(n_embd_head)); - } - ggml_backend_tensor_set(cur, &f, 0, sizeof(f)); - } - - alloc_inp_KQ_scale = 
true; - } - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { ggml_allocr_alloc(lctx.alloc, cur); @@ -9054,10 +8972,7 @@ static int llama_apply_lora_from_file_internal( ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling); - ggml_set_name(scale_tensor, "scale_tensor"); - - BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor); + BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling); offload_func(BA); ggml_set_name(BA, "BA_scaled"); } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f04b9438a..f3df8a8c6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -766,18 +766,19 @@ struct test_bin_bcast : public test_case { struct test_scale : public test_case { const ggml_type type; const std::array ne; + float scale; std::string vars() override { - return VARS_TO_STR2(type, ne); + return VARS_TO_STR3(type, ne, scale); } test_scale(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) - : type(type), ne(ne) {} + std::array ne = {10, 10, 10, 10}, + float scale = 2.0f) + : type(type), ne(ne), scale(scale) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); - ggml_tensor * scale = ggml_new_tensor_1d(ctx, type, 1); ggml_tensor * out = ggml_scale(ctx, a, scale); return out; } diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 81c20a89c..14914def5 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -881,19 +881,19 @@ int main(int argc, const char ** argv) { // scale { srand(seed); - const int nargs = 2; + const int nargs = 1; int64_t ne2[4]; ne2[0] = 1; for (int ndims = 1; ndims <= 2; ++ndims) { - x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - ggml_set_param(ctx0, x[1]); + const float s = -1.0f + 2.0f*frand(); - struct ggml_tensor * f = ggml_sum(ctx0, 
ggml_scale(ctx0, x[0], x[1])); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s)); check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); } @@ -1395,7 +1395,7 @@ int main(int argc, const char ** argv) { ggml_add1(ctx0, ggml_scale(ctx0, ggml_soft_max(ctx0, x[0]), - ggml_new_f32(ctx0, 1.0f - eps)), + 1.0f - eps), ggml_new_f32(ctx0, eps)))); check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);