From 2347e45e7bdb09c9a7d74b2c0bc86c2b65f0c343 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 13 Jun 2023 20:20:07 +0300 Subject: [PATCH 01/21] llama : do a warm-up eval at start for better timings (#1824) --- examples/main/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 66d563143..efa913e16 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -331,6 +331,13 @@ int main(int argc, char ** argv) { std::vector embd; + // do one empty run to warm up the model + { + const std::vector tmp = { llama_token_bos(), }; + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + llama_reset_timings(ctx); + } + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (embd.size() > 0) { From e32089b2c20b1b87b22912f4a8b93fe01647d5b9 Mon Sep 17 00:00:00 2001 From: xaedes Date: Tue, 13 Jun 2023 21:04:40 +0200 Subject: [PATCH 02/21] train : improved training-from-scratch example (#1652) * add python wrapper https://gist.github.com/abetlen/2b90e5f153f6efd00931d098de5c73ce * fix decoding error. adds errors=ignore parameter * add python bindings for functions to get and set the whole llama state (rng, logits, embedding and kv_cache) * update python bindings * add text generating baby-llama from scratch example * fix race condition bug in ggml_compute_forward_diag_mask_f32 * implement ggml_soft_max_back for more performant backward pass of soft_max avoids creating big intermediate matrices of size n_embd x n_embd for llama layers and n_vocab x n_vocab for cross entropy loss * improve softmax backward pass go from quadratic runtime to linear runtime by simplifying the formulas * fix race condition bug in non-inplace ggml_compute_forward_diag_mask_f32 memcpy needs to be synchronized across threads to avoid race conditions. => do it in INIT phase * fix bug in ggml_compute_forward_soft_max_back_f32 on DEBUG build * improve performance of mul_mat backward pass avoid transpose by using mul_mat with swapped arguments * avoid printing too much newlines in baby-llama-text * activate threading in baby-llama-text * add ggml_out_prod and use it for mul_mat backward pass for improved performance performance stats report improvement from 37 seconds to 16 seconds runtime during my training tests * better weight initialization improves training convergence at start * better weight initialization improves training convergence at start * improve ggml_out_prod performance - change iteration order (>15s -> 10s runtime) - parallelize over one more dimension: over dst matrix rows (10s -> <5s runtime) * add llama sampler, shuffle samples and constrain sampling to tokens occurring in train data * fix get_samples call, add model tensor names, increase model size, start training samples after newline * save train trained model to checkpoint and load model to be trained from checkpoint * use inplace functions where possible * initialize rng with srand * use different arguments for input and output checkpoint * ggml fixes to support backward pass on inplace operations * remove duplicate include * fix cross entropy loss - add target probabilities for each sample which is then used in cross entropy loss * print used memory before and after optimization * sample with non-greedy sampling parameters at the end of training * add cmake target for baby-llama-text * add ggml_add1_inplace to header * enable gradient propagation for inplace add1 and scale operations those functions backward passes don't need the original src0, so they also work when forward is inplace * implement AdamW in ggml_opt_adam by adding weight decay parameter (default 0.001f) also add a schedule parameter (default 1.0f) that can be used to scale alpha and decay according to learning schedule. setting the decay parameter to zero disables AdamW resulting in normal Adam optimizer. since the difference between Adam and AdamW is minimal it is not implemented as another optimizer, but integrated into the existing Adam optimizer. * use inplace operations in cross_entropy_loss * fix random weight initialization scale * add missing default parameters for adam optimizer * add ggml_opt_context, so that we can properly resume training otherwise the optimizer states, tracking statistics about the error function and its derivates, will reset to zero each time ggml_opt is called, hindering convergence on resumed training. now the optimizer context and all its memory is stored in a separate struct. * fix bug in llama_sample_token_mirostat_v2 when all candidates are filtered out through mu threshold, the following soft_max operation will fail. so keep at least one. * add forward function without using cache, for more performant training during training on whole samples no cache is required. removing the cache and simplifying the remaining code results in performance and memory usage improvement. * print suppressed newline tokens as string "\n" printing too much actual newlines is suppressed to avoid flooding the console. * store optimizer state in training checkpoint and add learning schedule persistent optimizer state allows to resume training without resetting the optimizer learning schedule consists of linear warmup ramp followed by cosine decay with restarts * remove unused functions * fix bug in get_samples which corrupted training targets * save checkpoint only when it was trained * simplify code * remove trailing whitespace * simplify backward pass for SQRT * replace inefficient repeat backward pass with dedicated repeat_back operation * add ggml_cross_entropy_loss with backward pass for faster training cross entropy loss can also be implemented using softmax and log, but as dedicated operation it is faster and especially avoids unnecessary memory overhead. * add tests for cross_entropy_loss backward pass finite differences regularly results in estimated gradient of zero, despite the backward pass giving non zero gradient. _probably_ the finite differences fails due to numerical issues * use ggml_cross_entropy_loss in text training example * remove trailing whitespace * slightly improve how cross entropy loss is compute btw: directly implemented cross entropy loss seems to have way lower magnitudes than when implemented with softmax and log. probably the input to log gets closer to zero due to float numerics. maybe the multiplication by (1.0-eps)/sum is more accurate.. * add llama_get_vocab to get the vocabulary as output parameters * set default model.type for unknown models with few layers * add export of training checkpoint to llama compatible model file * get vocabulary for exporting training checkpoint to llama compatible model file * implement backward pass of flash attention * bugfixes for backward pass of flash attention * test flash attention backward pass need to set loose error bounds to pass. the finitie differences are close to numeric limits and often return quite different values than the backward pass. reducing eps further lets the gradients vanish completely. likewise setting eps to big results in wronger values. the softmax in the middle of the function is probably the most responsible for the numeric issues using finite differences. * add option to train with flash attention and move options to the top of the main function training from scratch also works with flash attention training convergence and generation results after fix number of iterations are worse than when not using flash attention. maybe there still lingers a bug in the flash attention backward pass? but training works, just with slower convergence. flash attention is still worth to use, because it requires way less memory and is faster with high n_ctx * add train_params and command line option parser * remove unnecessary comments * add train params to specify memory size * remove python bindings * rename baby-llama-text to train-text-from-scratch * replace auto parameters in lambda function * add #include * add explicit cast to fix compile error "error: non-constant-expression cannot be narrowed from type 'int64_t' (aka 'long long') to 'uint32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]" * remove trailing whitespace * add ggml_opt_resume_g which accepts forward and backward cgraphs * fix formulas in comments * bug fix for ggml_compute_forward_get_rows_back_f32 the result should be set to zero, not to whatever data is in opt0 * improve training memory usage with scratch buffers instead of relying on the automatic backward pass, we manually create the graph for the backward pass. it turns out that all backward pass operations need only temporary memory which can be reused after each layer. will compute backward pass for ALL model parameters * add option to use scratch buffers in training or not make it configurable because currently training with scratch buffers implies flash attention and optimization over all parameters. * ci : disable temporary * store view offset and permute axes in opt[0] instead of storing it in padding use memcpy to store offset, because offset is of type size_t. when storing it as int32_t offset would have to be smaller than 2^31 which is not necessarily true. * minor : fix compile warnings + minor style changes * fix bug in threaded indices calculation of ggml_compute_forward_flash_attn_back_f32 * store view offset like in master branch * bug fix in forward_batch_wo_cache_flash_attn_train * scratch buffer bug fixes in forward_batch_wo_cache_flash_attn_train data of permute and reshape is the same as their input. if we want to preserve the output of permute/reshape, we also need to preserve their inputs. replace reshape(src0, src1) with reshape_nd calls so that we don't need src1. replace (temporary) t03 with ggml_repeat(ctx0, layer.attention_norm, t02). in the future we could also use the new broadcasting ggml_mul to avoid these repeat calls. for this we need backward pass of broadcasting ggml_mul. * remove unnecessary scratch buffer 0 buf 0 is persistent memory, so we can just disable scratch for this by using buf -1 * avoid creating unnecessary grad tensors previously we need to create grads for model parameters, so that expand(..) correctly populates cgraph->leafs & cgraph->grads this wasted memory, because unnecessary grad for each op were automatically created: the automatically generated grad was unnecessary because we later manually set the grad (e.g. t35->grad = expand(gb, ...) ). this discarded the automatically generated grad resulting in wasted memory. improved this by changing expand(..) to not use ggml_build_forward_expand. expand set cgraph->nodes but not the leafs. cgraph->leafs & cgraph->grads are set in another pass after the last expand call. * print used training seed * zero initialize gfbuf and gbbuf * ci : re-enable workflows + add README for training --------- Co-authored-by: Georgi Gerganov --- examples/CMakeLists.txt | 1 + examples/baby-llama/baby-llama.cpp | 13 +- .../train-text-from-scratch/CMakeLists.txt | 4 + examples/train-text-from-scratch/README.md | 22 + .../train-text-from-scratch.cpp | 3399 +++++++++++++++++ ggml.c | 2097 ++++++++-- ggml.h | 127 +- llama.cpp | 25 + llama.h | 8 + tests/test-grad0.c | 60 +- 10 files changed, 5492 insertions(+), 264 deletions(-) create mode 100644 examples/train-text-from-scratch/CMakeLists.txt create mode 100644 examples/train-text-from-scratch/README.md create mode 100644 examples/train-text-from-scratch/train-text-from-scratch.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3deff4077..de005f3e3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,6 +37,7 @@ else() add_subdirectory(save-load-state) add_subdirectory(benchmark) add_subdirectory(baby-llama) + add_subdirectory(train-text-from-scratch) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 5573c154b..e5639da37 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal( int ndims, const int64_t ne[], struct random_normal_distribution * rnd) { + float scale = 1.0; // xavier switch (ndims) { case 1: + scale /= sqrtf(ne[0]); for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i0] = frand_normal(rnd); + ((float *)tensor->data)[i0] = scale * frand_normal(rnd); } break; case 2: + scale /= sqrtf(ne[0]+ne[1]); for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd); } } break; case 3: + scale /= sqrtf(ne[0]+ne[1]); for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); } } } break; case 4: + scale /= sqrtf(ne[0]+ne[1]); for (int i3 = 0; i3 < ne[3]; i3++) { for (int i2 = 0; i2 < ne[2]; i2++) { for (int i1 = 0; i1 < ne[1]; i1++) { for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); + ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd); } } } diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt new file mode 100644 index 000000000..1a44c4961 --- /dev/null +++ b/examples/train-text-from-scratch/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET train-text-from-scratch) +add_executable(${TARGET} train-text-from-scratch.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md new file mode 100644 index 000000000..5344d1f52 --- /dev/null +++ b/examples/train-text-from-scratch/README.md @@ -0,0 +1,22 @@ +# train-text-from-scratch + +Basic usage instructions: + +```bash +# get training data +wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt + +# train +./bin/train-text-from-scratch \ + --vocab-model ../models/ggml-vocab.bin \ + --ctx 64 --embd 256 --head 8 --layer 16 \ + --checkpoint-in chk-shakespeare-256x16.bin \ + --checkpoint-out chk-shakespeare-256x16.bin \ + --model-out ggml-shakespeare-256x16-f32.bin \ + --train-data "shakespeare.txt" \ + -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ + --print-details-interval 0 --predict 16 --use-flash + +# predict +./bin/main -m ggml-shakespeare-256x16-f32.bin +``` diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp new file mode 100644 index 000000000..51271b497 --- /dev/null +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -0,0 +1,3399 @@ +#include "ggml.h" +#include "llama.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct random_normal_distribution { + std::mt19937 gen; + std::normal_distribution rd; + float min; + float max; +}; + + +struct random_uniform_distribution { + std::mt19937 gen; + std::uniform_real_distribution rd; +}; + +void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::normal_distribution{mean, std}; + rnd->min = min; + rnd->max = max; +} + +void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) { + rnd->gen = std::mt19937(seed); + rnd->rd = std::uniform_real_distribution{min, max}; +} + +int clamp(const int v, const int min, const int max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float fclamp(const float v, const float min, const float max) { + return ((v < min) ? (min) : (v > max) ? (max) : v); +} + +float frand() { + return (float)rand()/(float)RAND_MAX; +} + +float frand_normal(struct random_normal_distribution * rnd) { + return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max); +} + +float frand_uniform(struct random_uniform_distribution * rnd) { + return rnd->rd(rnd->gen); +} + +struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { + float scale = 1.0f; // xavier + switch (tensor->n_dims) { + case 1: + scale /= sqrtf(tensor->ne[0]); + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = scale * frand_normal(rnd); + } + break; + case 2: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = scale * frand_normal(rnd); + } + } + break; + case 3: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = scale * frand_normal(rnd); + } + } + } + break; + case 4: + scale /= sqrtf(tensor->ne[0]+tensor->ne[1]); + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = scale * frand_normal(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { + switch (tensor->n_dims) { + case 1: + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); + *dst = frand_uniform(rnd); + } + break; + case 2: + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *dst = frand_uniform(rnd); + } + } + break; + case 3: + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *dst = frand_uniform(rnd); + } + } + } + break; + case 4: + for (int i3 = 0; i3 < tensor->ne[3]; i3++) { + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]); + *dst = frand_uniform(rnd); + } + } + } + } + break; + default: + assert(false); + }; + return tensor; +} + +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + +struct my_llama_hparams { + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + bool operator!=(const my_llama_hparams& other) const { + return memcmp(this, &other, sizeof(my_llama_hparams)); + } +}; + +struct my_llama_layer { + // normalization + struct ggml_tensor * attention_norm; + + // attention + struct ggml_tensor * wq; + struct ggml_tensor * wk; + struct ggml_tensor * wv; + struct ggml_tensor * wo; + + // normalization + struct ggml_tensor * ffn_norm; + + // ff + struct ggml_tensor * w1; + struct ggml_tensor * w2; + struct ggml_tensor * w3; +}; + +struct my_llama_kv_cache { + struct ggml_context * ctx = NULL; + + struct ggml_tensor * k; + struct ggml_tensor * v; + + // llama_ctx_buffer buf; + + int n; // number of tokens currently in the cache +}; + +struct my_llama_model { + struct ggml_context * ctx = NULL; + + my_llama_hparams hparams; + + struct ggml_tensor * tok_embeddings; + + struct ggml_tensor * norm; + struct ggml_tensor * output; + + std::vector layers; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; +}; + +uint32_t get_n_ff(const struct my_llama_hparams* hparams) { + const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; + return n_ff; +} + +void print_params(struct my_llama_hparams * params) { + printf("%s: n_vocab: %d\n", __func__, params->n_vocab); + printf("%s: n_ctx: %d\n", __func__, params->n_ctx); + printf("%s: n_embd: %d\n", __func__, params->n_embd); + printf("%s: n_mult: %d\n", __func__, params->n_mult); + printf("%s: n_head: %d\n", __func__, params->n_head); + printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_layer: %d\n", __func__, params->n_layer); + printf("%s: n_rot: %d\n", __func__, params->n_rot); +} + +void init_model(struct my_llama_model * model) { + const auto & hparams = model->hparams; + + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; + + const uint32_t n_ff = get_n_ff(&hparams); + + struct ggml_context * ctx = model->ctx; + + model->train_its = 0; + model->train_samples = 0; + model->train_tokens = 0; + + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); + + ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); + ggml_set_name(model->norm, "norm.weight"); + ggml_set_name(model->output, "output.weight"); + + model->layers.resize(n_layer); + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + std::string layers_i = "layers." + std::to_string(i); + + layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + + layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); + layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); + + ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + + ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); + ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); + ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); + ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + + ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + + // 'layers.10.feed_forward.w1.weight' has length of 32. + // ggml_tensor->name only has 32 characters, but we need one more for the '\0' terminator. + // ggml_set_name will set the last character to '\0', so we can only store 'layers.10.feed_forward.w1.weigh'. + // when saving llama compatible model the tensors names will miss a character. + // ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str()); + // ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str()); + // ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str()); + + strncpy(layer.w1->name, (layers_i + ".feed_forward.w1.weight").c_str(), sizeof(layer.w1->name)); + strncpy(layer.w2->name, (layers_i + ".feed_forward.w2.weight").c_str(), sizeof(layer.w2->name)); + strncpy(layer.w3->name, (layers_i + ".feed_forward.w3.weight").c_str(), sizeof(layer.w3->name)); + layer.w1->padding[0] = 0; + layer.w2->padding[0] = 0; + layer.w3->padding[0] = 0; + } +} + +void set_param_model(struct my_llama_model * model) { + const auto& hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct ggml_context* ctx = model->ctx; + + ggml_set_param(ctx, model->tok_embeddings); + ggml_set_param(ctx, model->norm); + ggml_set_param(ctx, model->output); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + + ggml_set_param(ctx, layer.attention_norm); + ggml_set_param(ctx, layer.wq); + ggml_set_param(ctx, layer.wk); + ggml_set_param(ctx, layer.wv); + ggml_set_param(ctx, layer.wo); + ggml_set_param(ctx, layer.ffn_norm); + ggml_set_param(ctx, layer.w1); + ggml_set_param(ctx, layer.w2); + ggml_set_param(ctx, layer.w3); + } +} + +void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) { + const auto & hparams = model->hparams; + + const uint32_t n_layer = hparams.n_layer; + + struct random_normal_distribution rnd; + init_random_normal_distribution(&rnd, seed, mean, std, min, max); + + randomize_tensor_normal(model->tok_embeddings, &rnd); + randomize_tensor_normal(model->norm, &rnd); + randomize_tensor_normal(model->output, &rnd); + + for (uint32_t i = 0; i < n_layer; ++i) { + auto & layer = model->layers[i]; + randomize_tensor_normal(layer.attention_norm, &rnd); + + randomize_tensor_normal(layer.wq, &rnd); + randomize_tensor_normal(layer.wk, &rnd); + randomize_tensor_normal(layer.wv, &rnd); + randomize_tensor_normal(layer.wo, &rnd); + + randomize_tensor_normal(layer.ffn_norm, &rnd); + + randomize_tensor_normal(layer.w1, &rnd); + randomize_tensor_normal(layer.w2, &rnd); + randomize_tensor_normal(layer.w3, &rnd); + } +} + +bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { + const auto & hparams = model->hparams; + + const uint32_t n_ctx = hparams.n_ctx; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_layer = hparams.n_layer; + + const int64_t n_mem = n_layer*n_ctx*n_batch; + const int64_t n_elements = n_embd*n_mem; + + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + + // struct ggml_init_params params; + // params.mem_size = cache.buf.size; + // params.mem_buffer = cache.buf.addr; + // params.no_alloc = false; + if (!cache->ctx) { + struct ggml_init_params params; + params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; + params.mem_buffer = NULL; + params.no_alloc = false; + + cache->ctx = ggml_init(params); + + if (!cache->ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + } + + cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); + + return true; +} + +struct ggml_tensor * forward( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N,1,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpL); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Kcur shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [n_embd, N, 1, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // kv_self.v shape [n_embd * n_ctx * n_layer, 1] + // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] + // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + } + + // Qcur shape [n_embd/n_head, n_head, N, 1] + // Q shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // kv_self.k shape [n_embd * n_ctx * n_layer, 1] + // K shape [n_embd/n_head, n_past + N, n_head, 1] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + // KQ shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, 1] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + + // split cached V into n_head heads + //// V shape [n_past + N, n_embd/n_head, n_head, 1] + // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] + struct ggml_tensor * V = + ggml_view_3d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(vc), + n_ctx*ggml_element_size(vc)*n_embd/n_head, + il*n_ctx*ggml_element_size(vc)*n_embd); + + // KQV shape [n_embd/n_head, N, n_head, 1] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, 1] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N,1,1] + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + + // cur = ffn_norm*cur + // cur shape [n_embd,N,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + } + + // tmp shape [n_ff,N,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + + // SILU activation + // cur shape [n_ff,N,1,1] + cur = ggml_silu(ctx0, cur); + + // cur shape [n_ff,N,1,1] + cur = ggml_mul(ctx0, cur, tmp); + + // cur shape [n_embd,N,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + } + + // cur shape [n_embd,N,1,1] + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + // inpL shape [n_embd,N,1,1] + inpL = cur; + } + + // norm + { + + // inpL shape [n_embd,N,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + + // inpL = norm*inpL + // inpL shape [n_embd,N,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { + GGML_ASSERT(tensor->n_dims == 1); + GGML_ASSERT(tensor->ne[0] == ne0); +} + +void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { + GGML_ASSERT(tensor->n_dims == 2); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); +} + +void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { + GGML_ASSERT(tensor->n_dims == 3); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); +} + +void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { + GGML_ASSERT(tensor->n_dims == 4); + GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == ne3); +} + +struct ggml_tensor * forward_batch( + struct my_llama_model * model, + struct my_llama_kv_cache * cache, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_past, + const int n_batch) { + + const int N = n_tokens; + + struct my_llama_kv_cache& kv_self = *cache; + const auto & hparams = model->hparams; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + struct ggml_tensor * kc = kv_self.k; + struct ggml_tensor * vc = kv_self.v; + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + // wv shape [n_embd, n_embd, 1, 1] + // Vcur shape [N, n_embd, n_batch, 1] + struct ggml_tensor * Vcur = ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_mul_mat(ctx0, + model->layers[il].wv, + cur), + n_embd, N, n_batch), + 1, 0, 2, 3)); + assert_shape_3d(Vcur, N, n_embd, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il] + // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] + + /* { + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + + // important: storing RoPE-ed version of K in the KV cache! + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } //*/ + + kc = ggml_set_2d_inplace(ctx0, kc, + ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), + ggml_element_size(kc)*n_embd*n_ctx, + (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); + vc = ggml_set_2d_inplace(ctx0, vc, + ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), + ggml_element_size(vc)*n_ctx*n_embd, + ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); + + assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); + assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); + } + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, n_past + N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_4d(ctx0, + ggml_view_3d(ctx0, + kc, + n_embd, + (n_past + N), + n_batch, + n_embd*ggml_element_size(kc), + n_ctx*n_embd*ggml_element_size(kc), + il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), + n_embd/n_head, n_head, n_past + N, n_batch), + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); + + // K * Q + // KQ shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [n_past + N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); + + // split cached V into n_head heads + // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] + // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] + struct ggml_tensor * V = + ggml_view_4d(ctx0, vc, + n_past + N, n_embd/n_head, n_head, n_batch, + ggml_element_size(vc)*n_ctx, + ggml_element_size(vc)*n_ctx*n_embd/n_head, + ggml_element_size(vc)*n_ctx*n_embd, + il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); + assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur = KQV_merged.contiguous().view(n_embd, N) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + // cur = ggml_cpy(ctx0, + // KQV_merged, + // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +struct ggml_tensor * forward_batch_wo_cache( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_batch) { + + const int n_past = 0; + const int N = n_tokens; + + const auto & hparams = model->hparams; + //const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + // inpL shape [n_embd,N*n_batch,1] + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // lctx.use_buf(ctx0, 0); + + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Kcur shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); + assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); + + // Qcur shape [n_embd/n_head, n_head, N, n_batch] + // Q shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] + // K shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * K = + ggml_permute(ctx0, + Kcur, + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); + + // K * Q + // KQ shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + assert_shape_4d(KQ, N, N, n_head, n_batch); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); + assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); + + // KQ_masked = mask_past(KQ_scaled) + // KQ_masked shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + assert_shape_4d(KQ_masked, N, N, n_head, n_batch); + + // KQ = soft_max(KQ_masked) + // KQ_soft_max shape [N, N, n_head, n_batch] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); + + // Vcur shape [N, n_batch, n_embd/n_head, n_head] + // V shape [N, n_embd/n_head, n_head, n_batch] + struct ggml_tensor * V = + ggml_permute(ctx0, + Vcur, + 0, 3, 1, 2); + assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); + + // KQV shape [n_embd/n_head, N, n_head, n_batch] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + // KQV_merged shape + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + + // projection (no bias) + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // lctx.use_buf(ctx0, 1); + + // inpFF shape [n_embd,N*n_batch,1,1] + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // tmp shape [n_ff,N*n_batch,1,1] + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_ff,N*n_batch,1,1] + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // cur shape [n_embd,N*n_batch,1,1] + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + // inpL shape [n_embd,N*n_batch,1,1] + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + // inpL shape [n_embd,N*n_batch,1,1] + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + + //embeddings = inpL; + } + + // lm_head + // inpL shape [n_vocab,N*n_batch,1,1] + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + // inpL shape [n_vocab,N,n_batch,1] + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +struct ggml_tensor * forward_batch_wo_cache_flash_attn( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_tensor * tokens_input, + const int n_tokens, + const int n_batch) { + + const int n_past = 0; + const int N = n_tokens; + + const auto & hparams = model->hparams; + //const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + + struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); + memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); + + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); + assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * cur; + + // norm + { + cur = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = attention_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].attention_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + // self-attention + { + // compute Q and K and RoPE them + // wq shape [n_embd, n_embd, 1, 1] + // wk shape [n_embd, n_embd, 1, 1] + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0); + assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); + assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); + + struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); + assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); + + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); + + struct ggml_tensor * K = + ggml_permute(ctx0, + Kcur, + 0, 2, 1, 3); + assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); + + struct ggml_tensor * V = + ggml_permute(ctx0, + Vcur, + 0, 3, 1, 2); + assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); + + bool masked = true; + struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); + assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); + cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); + assert_shape_2d(cur, n_embd, N*n_batch); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model->layers[il].wo, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); + assert_shape_2d(inpFF, n_embd, N*n_batch); + + // feed-forward network + { + // norm + { + cur = ggml_rms_norm(ctx0, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // cur = ffn_norm*cur + cur = ggml_mul(ctx0, + ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model->layers[il].w3, + cur); + assert_shape_2d(tmp, n_ff, N*n_batch); + + cur = ggml_mul_mat(ctx0, + model->layers[il].w1, + cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + // SILU activation + cur = ggml_silu(ctx0, cur); + assert_shape_2d(cur, n_ff, N*n_batch); + + cur = ggml_mul(ctx0, cur, tmp); + assert_shape_2d(cur, n_ff, N*n_batch); + + cur = ggml_mul_mat(ctx0, + model->layers[il].w2, + cur); + assert_shape_2d(cur, n_embd, N*n_batch); + } + + cur = ggml_add_inplace(ctx0, cur, inpFF); + assert_shape_2d(cur, n_embd, N*n_batch); + + // input for next layer + inpL = cur; + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // norm + { + + inpL = ggml_rms_norm(ctx0, inpL); + assert_shape_2d(inpL, n_embd, N*n_batch); + + // inpL = norm*inpL + inpL = ggml_mul(ctx0, + ggml_repeat(ctx0, model->norm, inpL), + inpL); + + assert_shape_2d(inpL, n_embd, N*n_batch); + } + + // lm_head + inpL = ggml_mul_mat(ctx0, model->output, inpL); + assert_shape_2d(inpL, n_vocab, N*n_batch); + + { + inpL = ggml_reshape_3d(ctx0, + inpL, + n_vocab, N, n_batch); + assert_shape_3d(inpL, n_vocab, N, n_batch); + } + + // run the computation + ggml_build_forward_expand(gf, inpL); + + return inpL; +} + +// expand the graph nodes without creating leafs. +struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { + // check if already visited + for (int i = 0; i < g->n_nodes; i++) { + if (g->nodes[i] == t) { + return t; + } + } + + for (int i = 0; i < g->n_leafs; i++) { + if (g->leafs[i] == t) { + return t; + } + } + + if (t->src0) { + expand(g, t->src0); + } + + if (t->src1) { + expand(g, t->src1); + } + + for (int i = 0; i < GGML_MAX_OPT; ++i) { + if (t->opt[i]) { + expand(g, t->opt[i]); + } + } + + GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); + + if (strlen(t->name) == 0) { + snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); + } + + g->nodes[g->n_nodes] = t; + g->grads[g->n_nodes] = t->grad; + g->n_nodes++; + return t; +} + +void graph_set_leafs_grads(struct ggml_cgraph * g) { + // moves leaf nodes to g->leafs. + // i.e. g->n_nodes might change. + int n_nodes = 0; + for (int i = 0; i < g->n_nodes; ++i) { + struct ggml_tensor * node = g->nodes[i]; + const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; + if (is_leaf) { + GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); + } + + g->leafs[g->n_leafs] = node; + g->n_leafs++; + } else { + GGML_ASSERT(n_nodes < GGML_MAX_NODES); + + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); + } + + g->nodes[n_nodes] = node; + g->grads[n_nodes] = node->grad; + n_nodes++; + } + } + for (int i=n_nodes; i < g->n_nodes; ++i) { + g->nodes[n_nodes] = NULL; + g->grads[n_nodes] = NULL; + } + g->n_nodes = n_nodes; +} + +struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( + struct my_llama_model * model, + struct ggml_context * ctx0, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_tensor * * logits, + struct ggml_tensor * tokens_input, + struct ggml_tensor * targets, + void * compute_buf_0, + void * compute_buf_1, + size_t size_buf_0, + size_t size_buf_1, + const int n_tokens, + const int n_batch) { + + ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + + const int n_past = 0; + const int N = n_tokens; + + gf->n_nodes = 0; + gf->n_leafs = 0; + gf->work_size = 0; + gf->perf_runs = 0; + gf->perf_cycles = 0; + gf->perf_time_us = 0; + gf->work = NULL; + + const auto & hparams = model->hparams; + //const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_head = hparams.n_head; + const int n_rot = hparams.n_rot; + const int n_ff = get_n_ff(&hparams); + const int rope_mode = 0; + + int last_buf = -1; + size_t buf_offs[2] = { 0, 0 }; + size_t buf_size[2] = { size_buf_0, + size_buf_1 }; + void * buf_data[2] = { compute_buf_0, + compute_buf_1 }; + auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + } + if (buf >= 0) { + size_t offs = buf_offs[buf]; + size_t size = buf_size[buf]; + void * data = buf_data[buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + last_buf = buf; + }; + + bool track_max_mem = false; + size_t buf_maxs[2] = { 0, 0 }; + + auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { + if (buf < 0) return; + if (track_max_mem) { + size_t last_offs = 0; + last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + if (last_buf >= 0) { + buf_offs[last_buf] = last_offs; + buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); + } + } + buf_offs[buf] = 0; + if (track_max_mem && last_buf >= 0) { + size_t offs = buf_offs[last_buf]; + size_t size = buf_size[last_buf]; + void * data = buf_data[last_buf]; + ggml_set_scratch(ctx0, { offs, size, data, }); + } + }; + + + auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 0; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = n_embd/n_head; + int64_t ne1 = N; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { + int64_t ne0 = N; + int64_t ne1 = n_embd/n_head; + int64_t ne2 = n_head; + int64_t ne3 = n_batch; + size_t nb0 = ggml_element_size(t); + size_t nb1 = nb0*ne0; + size_t nb2 = nb1*ne1; + size_t nb3 = nb2*ne2; + size_t offset = 2*nb3*ne3; + return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + }; + + auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { + if (a == NULL) { + return b; + } else { + return ggml_add_inplace(ctx0, a, b); + } + }; + + use_buf(-1); + + model->tok_embeddings->grad = NULL; + model->norm->grad = NULL; + model->output->grad = NULL; + + for (int il = 0; il < n_layer; ++il) { + struct my_llama_layer & layer = model->layers[il]; + layer.attention_norm->grad = NULL; + layer.wq->grad = NULL; + layer.wk->grad = NULL; + layer.wv->grad = NULL; + layer.wo->grad = NULL; + layer.ffn_norm->grad = NULL; + layer.w1->grad = NULL; + layer.w2->grad = NULL; + layer.w3->grad = NULL; + } + + clr_buf(0); + clr_buf(1); + + use_buf(-1); + + struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); assert_shape_1d(t00, N*n_batch); + memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); + + use_buf(-1); + + struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); + + // need to remember these for the backward pass + std::vector t02L; t02L.resize(n_layer, NULL); + std::vector t03L; t03L.resize(n_layer, NULL); + std::vector t04L; t04L.resize(n_layer, NULL); + std::vector t05L; t05L.resize(n_layer, NULL); + std::vector t06L; t06L.resize(n_layer, NULL); + std::vector t07L; t07L.resize(n_layer, NULL); + std::vector t08L; t08L.resize(n_layer, NULL); + std::vector t09L; t09L.resize(n_layer, NULL); + std::vector t10L; t10L.resize(n_layer, NULL); + std::vector t11L; t11L.resize(n_layer, NULL); + std::vector t12L; t12L.resize(n_layer, NULL); + std::vector t13L; t13L.resize(n_layer, NULL); + std::vector t14L; t14L.resize(n_layer, NULL); + std::vector t15L; t15L.resize(n_layer, NULL); + std::vector t16L; t16L.resize(n_layer, NULL); + std::vector t17L; t17L.resize(n_layer, NULL); + std::vector t18L; t18L.resize(n_layer, NULL); + std::vector t19L; t19L.resize(n_layer, NULL); + std::vector t20L; t20L.resize(n_layer, NULL); + std::vector t21L; t21L.resize(n_layer, NULL); + std::vector t22L; t22L.resize(n_layer, NULL); + std::vector t23L; t23L.resize(n_layer, NULL); + std::vector t24L; t24L.resize(n_layer, NULL); + std::vector t25L; t25L.resize(n_layer, NULL); + std::vector t26L; t26L.resize(n_layer, NULL); + std::vector t27L; t27L.resize(n_layer, NULL); + std::vector t28L; t28L.resize(n_layer, NULL); + std::vector t29L; t29L.resize(n_layer, NULL); + std::vector t30L; t30L.resize(n_layer, NULL); + + struct ggml_tensor * cur = t01; + + for (int il = 0; il < n_layer; ++il) { + clr_buf(0); + struct my_llama_layer & layer = model->layers[il]; + // tensors with values necessary for backward pass are in persistent buf(-1) + // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. + use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); + use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); + use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); assert_shape_2d(t26, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); + use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); + use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); + use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); + t02L[il] = t02; + t03L[il] = t03; + t04L[il] = t04; + t05L[il] = t05; + t06L[il] = t06; + t07L[il] = t07; + t08L[il] = t08; + t09L[il] = t09; + t10L[il] = t10; + t11L[il] = t11; + t12L[il] = t12; + t13L[il] = t13; + t14L[il] = t14; + t15L[il] = t15; + t16L[il] = t16; + t17L[il] = t17; + t18L[il] = t18; + t19L[il] = t19; + t20L[il] = t20; + t21L[il] = t21; + t22L[il] = t22; + t23L[il] = t23; + t24L[il] = t24; + t25L[il] = t25; + t26L[il] = t26; + t27L[il] = t27; + t28L[il] = t28; + t29L[il] = t29; + t30L[il] = t30; + + cur = t30; + } + clr_buf(0); + use_buf(0); + struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); + use_buf(-1); + struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); + + { + /* + tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) + L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) + L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) + L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) + L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) + L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) + L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) + L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) + L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) + L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) + L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) + L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) + L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) + L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) + L1_wo | grad_L1_wo = ggml_out_prod(t19L1, grad_t20L1) + L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) + L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) + L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) + L1_w3 | grad_L1_w3 = ggml_out_prod(t24L1, grad_t25L1) + norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) + output | grad_output = ggml_out_prod(t33, grad_t34) + | + t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) + for layer: | + t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) + t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) + t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) + t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) + t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) + t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) + t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) + t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) + t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) + t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) + t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) + t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) + t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) + t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 + t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) + t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) + t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 + t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) + t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) + t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) + t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) + t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) + t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) + t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) + t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) + t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 + t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 + ^ + t02L1*= ggml_rms_norm (t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) + t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) + t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) + t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) + t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) + t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) + t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, t08L1_shape) + t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) + t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) + t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) + t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) + t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) + t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) + t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 + t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) + t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) + t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 + t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) + t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) + t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) + t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) + t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) + t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) + t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) + t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) + t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 + t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) + ^ + t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) + t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) + t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) + t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) + t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) + t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) + tensors marked with * need to be stored until grad computation + tensors during grad computation are all temporary + */ + } + + *gb = *gf; + + // t36->grad gets set to one by optimizer, so we need the tensor. + // initialize it with 1.0f to make sure. + use_buf(-1); + t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); + + use_buf(0); + t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); + t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); + t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); + t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); + + use_buf(-1); + + model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); + model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); + + clr_buf(1); + use_buf(1); + t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); + + struct ggml_tensor * back_layer_inp = t31; + struct ggml_tensor * grad_layer_inp = NULL; + + for (int k = 0; k < n_layer; ++k) { + int il = n_layer-1-k; + struct my_llama_layer & layer = model->layers[il]; + + struct ggml_tensor * t02 = t02L[il]; + struct ggml_tensor * t03 = t03L[il]; + struct ggml_tensor * t04 = t04L[il]; + struct ggml_tensor * t05 = t05L[il]; + struct ggml_tensor * t06 = t06L[il]; + struct ggml_tensor * t07 = t07L[il]; + struct ggml_tensor * t08 = t08L[il]; + struct ggml_tensor * t09 = t09L[il]; + struct ggml_tensor * t10 = t10L[il]; + struct ggml_tensor * t11 = t11L[il]; + struct ggml_tensor * t12 = t12L[il]; + struct ggml_tensor * t13 = t13L[il]; + struct ggml_tensor * t14 = t14L[il]; + struct ggml_tensor * t15 = t15L[il]; + struct ggml_tensor * t16 = t16L[il]; + struct ggml_tensor * t17 = t17L[il]; + struct ggml_tensor * t18 = t18L[il]; + struct ggml_tensor * t19 = t19L[il]; + struct ggml_tensor * t20 = t20L[il]; + struct ggml_tensor * t21 = t21L[il]; + struct ggml_tensor * t22 = t22L[il]; + struct ggml_tensor * t23 = t23L[il]; + struct ggml_tensor * t24 = t24L[il]; + struct ggml_tensor * t25 = t25L[il]; + struct ggml_tensor * t26 = t26L[il]; + struct ggml_tensor * t27 = t27L[il]; + struct ggml_tensor * t28 = t28L[il]; + struct ggml_tensor * t29 = t29L[il]; + struct ggml_tensor * t30 = t30L[il]; + + clr_buf(0); + use_buf(0); + t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + if (grad_layer_inp) { + t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + } + clr_buf(1); + t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); + t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); + t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); + t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); + t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); + t24->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), + ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); + t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); + t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); + use_buf(1); + t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); + grad_layer_inp = t21; + use_buf(0); + t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); + t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); + t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, n_embd/n_head, n_head, N, n_batch); + t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); + t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); + t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); + t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); + t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); + t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); + t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); + t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); + t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); + t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); + t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); + t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); + t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); + t04->grad = expand(gb, ggml_add_inplace(ctx0, + ggml_add_inplace(ctx0, + ggml_out_prod(ctx0, layer.wv, t11->grad), + ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), + ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); + t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); + use_buf(1); + t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); + back_layer_inp = t02; + // use_buf(0); + + use_buf(-1); + layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); + layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); + layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); + layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); + layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); + layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); + layer.w1->grad = expand(gb, add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); + layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); + layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); + // use_buf(0); + } + clr_buf(0); + use_buf(0); + t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); + use_buf(-1); + model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); + // clr_buf(1); + // clr_buf(0); + + *logits = t35; + + if (track_max_mem) { + printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); + printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); + } + + // now that all grads are created, set the graph leafs and grads + graph_set_leafs_grads(gf); + graph_set_leafs_grads(gb); + + return t36; +} + +void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]); + *ptr = value; +} + +void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + *ptr = value; +} + +float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { + int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]); + return *ptr; +} + +void print_row(struct ggml_tensor * probs, int i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); +} + +void print_matrix(struct ggml_tensor * probs) { + assert(probs->n_dims == 2); + for (int i = 0; i < probs->ne[1]; ++i) { + for (int k = 0; k < probs->ne[0]; ++k) { + float p = get_f32_2d(probs, k, i); + printf(" %.2f", p); + } + printf("\n"); + } +} + + +void print_token(struct llama_context * ctx, llama_token token) { + printf("%s", llama_token_to_str(ctx, token)); +} + +void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i=0; ine[0]; ++i) { + int token = ggml_get_i32_1d(tokens, i); + print_token(ctx, token); + } +} + +void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { + for (int i1=0; i1ne[1]; ++i1) { + //int num_newline = 0; + for (int i0=0; i0ne[0]; ++i0) { + int token = get_i32_2d(tokens, i0, i1); + print_token(ctx, token); + // bool isnl = (token == llama_token_nl()); + // if (isnl) { + // ++num_newline; + // } + // if (isnl) { + // if (num_newline < 2) { + // print_token(ctx, token); + // } else { + // printf("\\n"); + // } + // } else { + // print_token(ctx, token); + // } + } + printf("\n--\n"); + } +} + +void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { + int n_tokens = tokens_input->ne[0]; + int n_vocab = target_logits->ne[0]; + + size_t sample = train_samples[example_id % n_train_samples]; + GGML_ASSERT(sample+n_tokens-1 < n_train_data); + + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); + ggml_set_i32_1d(tokens_input, 0, llama_token_bos()); + for (int i=1; in_dims == 2); + GGML_ASSERT(target_logits->n_dims == 3); + GGML_ASSERT(target_probs->n_dims == 3); + int n_vocab = target_logits->ne[0]; + int n_tokens = tokens_input->ne[0]; + int n_batch = tokens_input->ne[1]; + GGML_ASSERT(n_tokens == target_logits->ne[1]); + GGML_ASSERT(n_batch == target_logits->ne[2]); + GGML_ASSERT(n_vocab == target_probs->ne[0]); + GGML_ASSERT(n_tokens == target_probs->ne[1]); + GGML_ASSERT(n_batch == target_probs->ne[2]); + + ggml_set_f32(target_logits, -1.0f/n_vocab); + ggml_set_f32(target_probs, 0.0f); + for (int k=0; kne[0]; + int n_vocab = target_logits->ne[0]; + for (int i=0; i= 0 && size < INT_MAX); + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +struct llama_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + size = 0; + } else { + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + GGML_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + GGML_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { + struct llama_file f(filename, "rb"); + + std::vector buf; + buf.resize(f.size+1); + + f.read_raw(buf.data(), f.size); + buf[f.size] = '\0'; + + out.resize(buf.size()); + + int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false); + if (n_tokens >= 0) { + out.resize(n_tokens); + } + + bool verify = false; + if (verify) { + const char * in = buf.data(); + const char * end = buf.data() + buf.size(); + for (int i = 0; i < (int) out.size(); ++i) { + const char * s = llama_token_to_str(lctx, out[i]); + int len = strlen(s); + if (in >= end) { + printf("%s: unexpected end of original text.\n", __func__); + break; + } + const bool matches = (strncmp(in, s, len) == 0); + if (matches) { + in += len; + } else { + printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s); + } + } + } + + return n_tokens; +} + +void shuffle_ints(int * begin, int * end) { + if (end <= begin) return; + int max=begin[0]; + for (int i=1; i max) { + max = begin[i]; + } + } + std::vector vals; + vals.resize(max+1); + for (int i=0; i candidates; + llama_token_data_array candidates_p; + +}; + +void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { + sampler->ctx = ctx; + sampler->n_vocab = llama_n_vocab(sampler->ctx); + sampler->n_ctx = llama_n_ctx(sampler->ctx); + sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; +} + +llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { + GGML_ASSERT(sampler->ctx != NULL); + + struct llama_context * ctx = sampler->ctx; + + sampler->candidates.resize(sampler->n_vocab); + for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { + sampler->candidates[token_id].id = token_id; + sampler->candidates[token_id].logit = logits[token_id]; + sampler->candidates[token_id].p = 0.0; + } + + llama_token_data_array * candidates_p = & sampler->candidates_p; + + candidates_p->data = sampler->candidates.data(); + candidates_p->size = sampler->candidates.size(); + candidates_p->sorted = false; + + const auto params = sampler->params; + + // Apply penalties + const float nl_logit = logits[llama_token_nl()]; + + const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); + + llama_sample_repetition_penalty( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.repeat_penalty); + llama_sample_frequency_and_presence_penalties( + ctx, + candidates_p, + last_tokens + n_last_tokens - n_last, + n_last, + params.alpha_frequency, + params.alpha_presence); + + if (!params.penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + llama_token token = 0; + if (params.temp <= 0) { + // Greedy sampling + token = llama_sample_token_greedy(ctx, candidates_p); + } else { + if (params.mirostat == 1) { + int mirostat_m = 100; + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); + } else if (params.mirostat == 2) { + llama_sample_temperature(ctx, candidates_p, params.temp); + token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k (ctx, candidates_p, params.top_k, 1); + llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); + llama_sample_typical (ctx, candidates_p, params.typical_p, 1); + + llama_sample_top_p (ctx, candidates_p, params.top_p, 1); + llama_sample_temperature (ctx, candidates_p, params.temp); + token = llama_sample_token(ctx, candidates_p); + } + } + return token; +} + +void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { + GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); + for (int i2 = 0; i2 < logits->ne[2]; ++i2) { + for (int i1 = 0; i1 < logits->ne[1]; ++i1) { + for (int i0 = 0; i0 < logits->ne[0]; ++i0) { + if (!mask[i0]) continue; + float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); + *ptr = value; + } + } + } +} + +void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + if (tensor == NULL) { + file->write_u32(0); + file->write_u32(0); + file->write_u32(GGML_TYPE_F32); + file->seek(-file->tell() & 31, SEEK_CUR); + return; + } + const char * name = ggml_get_name(tensor); + uint32_t name_len = strlen(name); + uint32_t nd = tensor->n_dims; + uint32_t ne[4] = { (uint32_t)tensor->ne[0], + (uint32_t)tensor->ne[1], + (uint32_t)tensor->ne[2], + (uint32_t)tensor->ne[3] }; + file->write_u32(nd); + file->write_u32(name_len); + file->write_u32(tensor->type); + file->write_raw(ne, sizeof(ne[0]) * nd); + file->write_raw(name, name_len); + file->seek(-file->tell() & 31, SEEK_CUR); + file->write_raw(tensor->data, ggml_nbytes(tensor)); +} + +void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { + int32_t nd = file->read_u32(); + GGML_ASSERT(nd == tensor->n_dims); + + uint32_t name_len = file->read_u32(); + enum ggml_type type = (enum ggml_type) file->read_u32(); + GGML_ASSERT(type == tensor->type); + + uint32_t ne[4]; + file->read_raw(ne, sizeof(ne[0]) * nd); + for (int i=0; ine[i]); + } + + std::string name = file->read_string(name_len); + GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); + + file->seek(-file->tell() & 31, SEEK_CUR); + file->read_raw(tensor->data, ggml_nbytes(tensor)); +} + +void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { + const uint32_t version = 0; + GGML_ASSERT(opt->nx >= 0); + GGML_ASSERT(opt->iter >= 0); + file->write_u32(version); + file->write_raw(&opt->params, sizeof(opt->params)); + file->write_raw(&opt->nx, sizeof(opt->nx)); + file->write_raw(&opt->iter, sizeof(opt->iter)); + file->write_u32((uint32_t) opt->just_initialized); + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->adam.x); + write_tensor(file, opt->adam.g1); + write_tensor(file, opt->adam.g2); + write_tensor(file, opt->adam.m); + write_tensor(file, opt->adam.v); + write_tensor(file, opt->adam.mh); + write_tensor(file, opt->adam.vh); + write_tensor(file, opt->adam.pf); + file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + write_tensor(file, opt->lbfgs.x); + write_tensor(file, opt->lbfgs.xp); + write_tensor(file, opt->lbfgs.g); + write_tensor(file, opt->lbfgs.gp); + write_tensor(file, opt->lbfgs.d); + write_tensor(file, opt->lbfgs.pf); + write_tensor(file, opt->lbfgs.lmal); + write_tensor(file, opt->lbfgs.lmys); + write_tensor(file, opt->lbfgs.lms); + write_tensor(file, opt->lbfgs.lmy); + file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { + uint32_t version = file->read_u32(); + GGML_ASSERT(version == 0); + + file->read_raw(&opt->params, sizeof(opt->params)); + file->read_raw(&opt->nx, sizeof(opt->nx)); + ggml_opt_init(ctx, opt, opt->params, opt->nx); + + file->read_raw(&opt->iter, sizeof(opt->iter)); + opt->just_initialized = (bool) file->read_u32(); + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + read_tensor(file, opt->adam.x); + read_tensor(file, opt->adam.g1); + read_tensor(file, opt->adam.g2); + read_tensor(file, opt->adam.m); + read_tensor(file, opt->adam.v); + read_tensor(file, opt->adam.mh); + read_tensor(file, opt->adam.vh); + if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } + file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); + file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); + file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + } break; + case GGML_OPT_LBFGS: + { + GGML_ASSERT(opt->adam.x != NULL); + read_tensor(file, opt->lbfgs.x); + read_tensor(file, opt->lbfgs.xp); + read_tensor(file, opt->lbfgs.g); + read_tensor(file, opt->lbfgs.gp); + read_tensor(file, opt->lbfgs.d); + if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } + read_tensor(file, opt->lbfgs.lmal); + read_tensor(file, opt->lbfgs.lmys); + read_tensor(file, opt->lbfgs.lms); + read_tensor(file, opt->lbfgs.lmy); + file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); + file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); + file->read_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); + file->read_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); + file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); + file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); + } break; + } +} + +void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + const uint32_t magic = 'ggcp'; + const uint32_t version = 0; + + file.write_u32(magic); + file.write_u32(version); + file.write_u32(model->train_its); + file.write_u32(model->train_samples); + file.write_u32(model->train_tokens); + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } + + write_opt_context(&file, opt); +} + +bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) { + struct llama_file file(filename, "rb"); + + uint32_t magic; + uint32_t version; + + uint32_t train_its = 0; + uint32_t train_samples = 0; + uint32_t train_tokens = 0; + + if (file.fp) { + printf("%s: Loading model from '%s'.\n", __func__, filename); + magic = file.read_u32(); + GGML_ASSERT(magic == 'ggcp'); + version = file.read_u32(); + GGML_ASSERT(version == 0); + train_its = file.read_u32(); + train_samples = file.read_u32(); + train_tokens = file.read_u32(); + model->hparams.n_vocab = file.read_u32(); + model->hparams.n_embd = file.read_u32(); + model->hparams.n_mult = file.read_u32(); + model->hparams.n_head = file.read_u32(); + model->hparams.n_layer = file.read_u32(); + model->hparams.n_rot = file.read_u32(); + print_params(&model->hparams); + } + + if (init) { + init_model(model); + } + + if (file.fp) { + model->train_its = train_its; + model->train_samples = train_samples; + model->train_tokens = train_tokens; + } + + printf("%s: Training iterations: %u.\n", __func__, model->train_its); + printf("%s: Training samples: %u.\n", __func__, model->train_samples); + printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + + if (file.fp) { + read_tensor(&file, model->tok_embeddings); + read_tensor(&file, model->norm); + read_tensor(&file, model->output); + + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + read_tensor(&file, layer.attention_norm); + read_tensor(&file, layer.wq); + read_tensor(&file, layer.wk); + read_tensor(&file, layer.wv); + read_tensor(&file, layer.wo); + read_tensor(&file, layer.ffn_norm); + read_tensor(&file, layer.w1); + read_tensor(&file, layer.w2); + read_tensor(&file, layer.w3); + } + + read_opt_context(&file, model->ctx, opt); + } + + return (file.fp != NULL); +} + +void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { + struct llama_file file(filename, "wb"); + if (file.fp == NULL) { + return; + } + + // write_magic + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version + // write_hparams + file.write_u32(model->hparams.n_vocab); + file.write_u32(model->hparams.n_embd); + file.write_u32(model->hparams.n_mult); + file.write_u32(model->hparams.n_head); + file.write_u32(model->hparams.n_layer); + file.write_u32(model->hparams.n_rot); + file.write_u32(LLAMA_FTYPE_ALL_F32); + // write_vocab + uint32_t n_vocab = model->hparams.n_vocab; + for (uint32_t i = 0; i < n_vocab; i++) { + const auto & token_score = vocab->id_to_token.at(i); + file.write_u32((uint32_t) token_score.tok.size()); + file.write_raw(token_score.tok.data(), token_score.tok.size()); + file.write_raw(&token_score.score, sizeof(token_score.score)); + } + // write tensors + write_tensor(&file, model->tok_embeddings); + write_tensor(&file, model->norm); + write_tensor(&file, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { + auto & layer = model->layers[i]; + + write_tensor(&file, layer.attention_norm); + write_tensor(&file, layer.wq); + write_tensor(&file, layer.wk); + write_tensor(&file, layer.wv); + write_tensor(&file, layer.wo); + write_tensor(&file, layer.ffn_norm); + write_tensor(&file, layer.w1); + write_tensor(&file, layer.w2); + write_tensor(&file, layer.w3); + } +} + +float cosine_decay(const int decay_steps, const float alpha, int step) { + if (step > decay_steps) { + step = decay_steps; + } + const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); + const float decay = (1 - alpha)*cosine_decay + alpha; + return decay; +} + +float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) restart_step_mult * decay_steps; + } + return cosine_decay(decay_steps, alpha, step); +} + +struct train_params { + const char * fn_vocab_model; + const char * fn_train_data; + const char * fn_checkpoint_in; + const char * fn_checkpoint_out; + const char * fn_model_out; + + int seed; + int n_ctx; + int n_embd; + int n_mult; + int n_head; + int n_layer; + int n_rotmax; + + int n_threads; + int n_batch; + int n_examples; + int n_predict; + + int print_info_interval; + int print_details_interval; + + bool samples_start_after_nl; + bool use_adam; + bool use_flash; + bool use_scratch; + + // only adam + int warmup; + int cos_decay_steps; + float cos_decay_restart; + float cos_decay_alpha; + + int lbfgs_n_iter; + int adam_n_iter; + float adam_alpha; + float adam_decay; + + int mem_model_gb; + int mem_compute_gb; + int mem_compute0_gb; + int mem_compute1_gb; +}; + +struct train_params get_default_train_params() { + struct train_params params; + params.fn_vocab_model = "ggml-vic7b-uncensored-q4_0.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; + + params.seed = -1; + + params.n_ctx = 128; + params.n_embd = 256; + params.n_mult = 256; + params.n_head = 8; + params.n_layer = 16; + params.n_rotmax = 64; + + params.n_threads = 6; + params.n_batch = 8; + params.n_examples = 8; + params.n_predict = 1024; + + params.print_info_interval = 1; + params.print_details_interval = 2; + + params.samples_start_after_nl = false; + params.use_adam = true; + params.use_flash = true; + params.use_scratch = true; + + // only adam + params.warmup = 100; + params.cos_decay_steps = 1000; + params.cos_decay_restart = 1.1f; + params.cos_decay_alpha = 0.0f; + + params.lbfgs_n_iter = 16; + params.adam_n_iter = 16; + params.adam_alpha = 1e-3; + params.adam_decay = 1e-3; + + params.mem_model_gb = 2; + params.mem_compute_gb = 24; + params.mem_compute0_gb = 8; + params.mem_compute1_gb = 2; + + return params; +} + +void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) { + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " --vocab-model FNAME model path from which to load vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --train-data FNAME path from which to load training data (default '%s')\n", params->fn_train_data); + fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in); + fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out); + fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); + fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); + fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); + fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); + fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); + fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); + fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); + fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); + fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); + fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); + fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); + fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); + fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); + fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --use-flash Use flash attention (default)\n"); + fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); + fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); + fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); + fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); + fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); + fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, "\n"); +} + +bool train_params_parse(int argc, char ** argv, struct train_params * params) { + bool invalid_param = false; + std::string arg; + struct train_params default_params = get_default_train_params(); + const std::string arg_prefix = "--"; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + if (arg == "--vocab-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_vocab_model = argv[i]; + } else if (arg == "--train-data") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_train_data = argv[i]; + } else if (arg == "--checkpoint-in") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_in = argv[i]; + } else if (arg == "--checkpoint-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_checkpoint_out = argv[i]; + } else if (arg == "--model-out") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->fn_model_out = argv[i]; + } else if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->seed = std::stoi(argv[i]); + } else if (arg == "-c" || arg == "--ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_ctx = std::stoi(argv[i]); + } else if (arg == "--embd") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_embd = std::stoi(argv[i]); + } else if (arg == "--mult") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_mult = std::stoi(argv[i]); + } else if (arg == "--head") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_head = std::stoi(argv[i]); + } else if (arg == "--layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_layer = std::stoi(argv[i]); + } else if (arg == "--rotmax") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_rotmax = std::stoi(argv[i]); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_threads = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_batch = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--examples") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_examples = std::stoi(argv[i]); + } else if (arg == "--predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->n_predict = std::stoi(argv[i]); + } else if (arg == "--print-info-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_info_interval = std::stoi(argv[i]); + } else if (arg == "--print-details-interval") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->print_details_interval = std::stoi(argv[i]); + } else if (arg == "--samples-after-nl") { + params->samples_start_after_nl = true; + } else if (arg == "--use-lbfgs") { + params->use_adam = false; + } else if (arg == "--use-adam") { + params->use_adam = true; + } else if (arg == "--no-flash") { + params->use_flash = false; + } else if (arg == "--use-flash") { + params->use_flash = true; + } else if (arg == "--no-scratch") { + params->use_scratch = false; + } else if (arg == "--use-scratch") { + params->use_scratch = true; + } else if (arg == "--warmup") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->warmup = std::stoi(argv[i]); + } else if (arg == "--cos-decay-steps") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_steps = std::stof(argv[i]); + } else if (arg == "--cos-decay-restart") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_restart = std::stof(argv[i]); + } else if (arg == "--cos-decay-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->cos_decay_alpha = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_n_iter = std::stoi(argv[i]); + } else if (arg == "--adam-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-decay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay = std::stof(argv[i]); + } else if (arg == "--mem-model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_model_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute0") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute0_gb = std::stoi(argv[i]); + } else if (arg == "--mem-compute1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->mem_compute1_gb = std::stoi(argv[i]); + } else if (arg == "-h" || arg == "--help") { + train_print_usage(argc, argv, &default_params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + train_print_usage(argc, argv, &default_params); + exit(1); + } + + return true; +} + +int main(int argc, char ** argv) { + struct train_params params = get_default_train_params(); + + if (!train_params_parse(argc, argv, ¶ms)) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + printf("%s: seed: %d\n", __func__, params.seed); + srand(params.seed); + + struct llama_context_params llama_params = llama_context_default_params(); + llama_params.vocab_only = true; + + struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params); + + struct llama_vocab vocab; + { + std::vector strings; + std::vector scores; + int n_vocab = llama_n_vocab(lctx); + strings.resize(n_vocab, NULL); + scores.resize(n_vocab, 0); + n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab); + GGML_ASSERT(n_vocab == llama_n_vocab(lctx)); + vocab.id_to_token.resize(n_vocab); + for (int i=0; i train_tokens; + if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { + fprintf(stderr, "%s: failed to tokenize file '%s'\n", __func__, params.fn_train_data); + } + printf("%s: number of training tokens: %d\n", __func__, (int) train_tokens.size()); + + struct my_llama_model model; + model.hparams.n_vocab = llama_n_vocab(lctx); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = params.n_embd; + model.hparams.n_mult = params.n_mult; + model.hparams.n_head = params.n_head; + model.hparams.n_layer = params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + + print_params(&model.hparams); + + std::vector token_noccurs; + std::vector token_notavail; + token_noccurs.resize(model.hparams.n_vocab, 0); + token_notavail.resize(model.hparams.n_vocab, true); + for (int i = 0; i < (int) train_tokens.size(); ++i) { + ++token_noccurs[train_tokens[i]]; + token_notavail[train_tokens[i]] = false; + } + + std::vector token_freq; + token_freq.resize(model.hparams.n_vocab, 0); + int n_unique_tokens = 0; + for (int i = 0; i < (int) token_noccurs.size(); ++i) { + token_freq[i] = (float) token_noccurs[i] / (float) train_tokens.size(); + n_unique_tokens += (token_noccurs[i] > 0) ? 1 : 0; + } + printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); + + struct my_llama_kv_cache kv_self; + + + struct ggml_init_params lcparams; + lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); + lcparams.mem_buffer = NULL; + lcparams.no_alloc = false; + + model.ctx = ggml_init(lcparams); + kv_self.ctx = model.ctx; + + my_llama_sampler sampler; + + + int n_tokens = model.hparams.n_ctx; + int n_vocab = model.hparams.n_vocab; + int n_batch = params.n_batch; + + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); + memset(opt, 0, sizeof(struct ggml_opt_context)); + + struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + opt_params_adam.print_forward_graph = false; + opt_params_adam.print_backward_graph = false; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; + + opt_params_lbfgs.print_forward_graph = false; + opt_params_lbfgs.print_backward_graph = false; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; + + opt->ctx = model.ctx; + opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; + + printf("%s: init model\n", __func__); + bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); + set_param_model(&model); + + opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; + + opt->iter = model.train_its; + printf("%s: opt iter %d\n", __func__, opt->iter); + + bool from_scratch = !existed; + if (from_scratch) { + randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); + } + + init_kv_cache(&kv_self, &model, 1); + // init_kv_cache(&kv_self, &model, n_batch); + init_sampler(&sampler, lctx); + + printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + // ggml_print_tensor_objects(model.ctx); + + size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); + uint8_t * compute_addr = new uint8_t[compute_size]; + + size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); + size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); + uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; + uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + + GGML_ASSERT(n_tokens < (int) train_tokens.size()); + std::vector train_samples; + train_samples.push_back(0); + for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) { + if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) { + train_samples.push_back(i); + } + } + shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); + for (int i = 0; i < (int) train_samples.size(); ++i) { + GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); + } + + printf("%s: begin training\n", __func__); + + for (int ex = 0; ex < params.n_examples; ++ex) { + if (ex*n_batch >= (int) train_samples.size()) { + shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); + for (int i = 0; i < (int) train_samples.size(); ++i) { + GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); + } + } + + struct ggml_init_params cparams = { + /*.mem_size =*/ compute_size, + /*.mem_buffer =*/ compute_addr, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx0 = ggml_init(cparams); + + struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); + //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); + struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + + int n_past = 0; + + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + + memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); + memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); + + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + + // ggml_cgraph gf = {}; + gf->n_threads = params.n_threads; + gb->n_threads = params.n_threads; + + get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + + GGML_ASSERT(n_past == 0); + + struct ggml_tensor * loss = NULL; + struct ggml_tensor * logits = NULL; + + if (params.use_scratch) { + loss = forward_batch_wo_cache_flash_attn_train( + &model, ctx0, + gf, gb, + &logits, tokens_input, target_probs, + compute_buf_0, compute_buf_1, + size_buf_0, size_buf_1, + n_tokens, n_batch); + } else if (params.use_flash) { + logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); + loss = cross_entropy_loss(ctx0, logits, target_probs); + ggml_build_forward_expand(gf, loss); + *gb = ggml_build_backward(ctx0, gf, true); + } else { + logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); + loss = cross_entropy_loss(ctx0, logits, target_probs); + ggml_build_forward_expand(gf, loss); + *gb = ggml_build_backward(ctx0, gf, true); + } + + ggml_graph_compute(ctx0, gf); + + size_t used_mem_before_opt = ggml_used_mem(ctx0); + + float error_before_opt = ggml_get_f32_1d(loss, 0); + + opt->params.adam.sched = (opt->iter < params.warmup) + ? (float) opt->iter / (float) params.warmup + : cosine_decay_restart( + params.cos_decay_steps, + params.cos_decay_alpha, + opt->iter - params.warmup, + params.cos_decay_restart); + + printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); + + ggml_opt_resume_g(ctx0, opt, loss, gf, gb); + + size_t used_mem_after_opt = ggml_used_mem(ctx0); + + model.train_its = opt->iter; + model.train_samples += n_batch; + model.train_tokens += n_batch * n_tokens; + + ggml_graph_compute(ctx0, gf); + + float error_after_opt = ggml_get_f32_1d(loss, 0); + + if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { + printf("Example %d, opt iter %d\n", ex, opt->iter); + printf("error_before_opt: %.6f\n", error_before_opt); + printf("error_after_opt: %.6f\n", error_after_opt); + printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); + printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); + } + + if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { + // set_logits_masked(logits, token_notavail, -1e9); + for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), + (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), + k); + * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; + } + } + + // printf("probabilities after optimization:\n"); + // print_matrix(after_opt_probs); + printf("Example:\n---\n"); + print_tokens_batch(lctx, tokens_input); + printf("\n---\n"); + + // printf("best samples after optimization:\n---\n"); + printf("samples after optimization:\n---\n"); + print_tokens_batch(lctx, after_opt_best_samples); + printf("\n---\n"); + } + + ggml_free(ctx0); + } + + if (params.n_examples > 0) { + save_checkpoint(&model, opt, params.fn_checkpoint_out); + } + + if (strlen(params.fn_model_out) > 0) { + save_as_llama_model(&vocab, &model, params.fn_model_out); + } + + { + int n_gen = params.n_predict; + int sample_ctx = n_tokens - n_tokens/8; + + sampler.params.temp = 0.2; + sampler.params.repeat_penalty = 1.1; + sampler.params.mirostat = 2; + init_sampler(&sampler, lctx); + + printf("Generating %d tokens.\n", n_gen); + + struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); + struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); + struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); + + get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); + for (int i=sample_ctx; idata + (sample_ctx-1)*logits->nb[1]), + (llama_token *) tokens_input->data, + sample_ctx-1); + //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); + + // print_row(probs, sample_at); + print_token(lctx, token); + + lshift_examples(tokens_input, target_logits, target_probs, 1); + ggml_set_i32_1d(tokens_input, 0, 0); + ggml_set_i32_1d(tokens_input, sample_ctx-1, token); + + ggml_free(ctx0); + } + } + + delete[] compute_addr; + delete[] compute_buf_0; + delete[] compute_buf_1; + ggml_free(model.ctx); + + return 0; +} diff --git a/ggml.c b/ggml.c index 252edd582..32c191307 100644 --- a/ggml.c +++ b/ggml.c @@ -3603,6 +3603,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SUM_ROWS", "MEAN", "REPEAT", + "REPEAT_BACK", "ABS", "SGN", "NEG", @@ -3616,6 +3617,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "RMS_NORM_BACK", "MUL_MAT", + "OUT_PROD", "SCALE", "SET", @@ -3631,6 +3633,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "DIAG_MASK_INF", "DIAG_MASK_ZERO", "SOFT_MAX", + "SOFT_MAX_BACK", "ROPE", "ROPE_BACK", "ALIBI", @@ -3640,13 +3643,16 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", + "FLASH_ATTN_BACK", "MAP_UNARY", "MAP_BINARY", + + "CROSS_ENTROPY_LOSS", + "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); - +static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3665,6 +3671,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "Σx_k", "Σx/n", "repeat(x)", + "repeat_back(x)", "abs(x)", "sgn(x)", "-x", @@ -3677,6 +3684,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rms_norm(x)", "rms_norm_back(x)", + "X*Y", "X*Y", "x*v", @@ -3693,6 +3701,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "diag_mask_inf(x)", "diag_mask_zero(x)", "soft_max(x)", + "soft_max_back(x)", "rope(x)", "rope_back(x)", "alibi(x)", @@ -3702,12 +3711,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", + "flash_attn_back(x)", "f(x)", "f(x,y)", + + "cross_entropy_loss(x,y)", + "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); +static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -3870,6 +3883,15 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[1] == t1->ne[1]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } @@ -4693,7 +4715,7 @@ struct ggml_tensor * ggml_add_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -4733,7 +4755,7 @@ struct ggml_tensor * ggml_add1_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5159,6 +5181,34 @@ struct ggml_tensor * ggml_repeat( return result; } +// ggml_repeat_back + +struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); + + bool is_node = false; + + if (a->grad) { + is_node = true; + } + + if (ggml_are_same_shape(a, b) && !is_node) { + return a; + } + + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + + result->op = GGML_OP_REPEAT_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_abs struct ggml_tensor * ggml_abs_impl( @@ -5536,6 +5586,32 @@ struct ggml_tensor * ggml_mul_mat( return result; } +// ggml_out_prod + +struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_out_prod(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); + + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + + result->op = GGML_OP_OUT_PROD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_scale struct ggml_tensor * ggml_scale_impl( @@ -5548,7 +5624,7 @@ struct ggml_tensor * ggml_scale_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5591,7 +5667,7 @@ struct ggml_tensor * ggml_set_impl( bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { is_node = true; } @@ -5913,10 +5989,6 @@ struct ggml_tensor * ggml_view_1d( result->src1 = NULL; result->opt[0] = offs; - if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); - } - return result; } @@ -5957,10 +6029,6 @@ struct ggml_tensor * ggml_view_2d( result->src1 = NULL; result->opt[0] = offs; - if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); - } - return result; } @@ -6003,10 +6071,6 @@ struct ggml_tensor * ggml_view_3d( result->src1 = NULL; result->opt[0] = offs; - if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); - } - return result; } @@ -6051,10 +6115,6 @@ struct ggml_tensor * ggml_view_4d( result->src1 = NULL; result->opt[0] = offs; - if (is_node) { - memcpy(result->padding, &offset, sizeof(offset)); - } - return result; } @@ -6116,10 +6176,18 @@ struct ggml_tensor * ggml_permute( result->src1 = NULL; if (is_node) { - result->padding[0] = axis0; - result->padding[1] = axis1; - result->padding[2] = axis2; - result->padding[3] = axis3; + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + + ((int32_t *) b->data)[0] = axis0; + ((int32_t *) b->data)[1] = axis1; + ((int32_t *) b->data)[2] = axis2; + ((int32_t *) b->data)[3] = axis3; + + ggml_scratch_load(ctx); + + result->opt[0] = b; } return result; @@ -6359,6 +6427,44 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } + +// ggml_soft_max_back + +struct ggml_tensor * ggml_soft_max_back_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_MAX_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, false); +} + +struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, true); +} + // ggml_rope struct ggml_tensor * ggml_rope_impl( @@ -6371,7 +6477,7 @@ struct ggml_tensor * ggml_rope_impl( GGML_ASSERT(n_past >= 0); bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } @@ -6425,8 +6531,7 @@ struct ggml_tensor * ggml_rope_back( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; + is_node = false; // TODO: implement backward } struct ggml_tensor * result = ggml_dup_tensor(ctx, a); @@ -6591,7 +6696,6 @@ struct ggml_tensor * ggml_flash_attn( bool is_node = false; if (q->grad || k->grad || v->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -6623,7 +6727,6 @@ struct ggml_tensor * ggml_flash_ff( bool is_node = false; if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - GGML_ASSERT(false); // TODO: implement backward is_node = true; } @@ -6641,6 +6744,71 @@ struct ggml_tensor * ggml_flash_ff( return result; } +// ggml_flash_attn_back + +struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,ne2,ne3] + // v shape [M,D,ne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + + GGML_ASSERT(k->ne[0] == D); + GGML_ASSERT(v->ne[0] == M); + GGML_ASSERT(v->ne[1] == D); + GGML_ASSERT(d->ne[0] == D); + GGML_ASSERT(d->ne[1] == N); + GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[3] == ne3); + GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[3] == ne3); + GGML_ASSERT(d->ne[2] == ne2); + GGML_ASSERT(d->ne[3] == ne3); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] + // gradq->data = result->data + // gradk->data = result->data + nb0*D*N*ne2*ne3 + // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + int64_t ne[4] = {D,M+N+M,ne2,ne3}; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_FLASH_ATTN_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = q; + result->src1 = k; + result->opt[0] = v; + result->opt[1] = d; + result->opt[2] = ggml_new_i32(ctx, masked ? 1 : 0); + + return result; +} + + // ggml_map_unary struct ggml_tensor * ggml_map_unary_impl_f32( @@ -6725,6 +6893,50 @@ struct ggml_tensor * ggml_map_binary_inplace_f32( return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } +// ggml_cross_entropy_loss + +struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + bool is_node = false; + + if (a->grad || b->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_cross_entropy_loss_back + +struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_scalar(c)); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; + result->grad = NULL; + result->src0 = a; + result->src1 = b; + result->opt[0] = c; + + return result; +} + //////////////////////////////////////////////////////////////////////////////// void ggml_set_param( @@ -8875,6 +9087,99 @@ static void ggml_compute_forward_repeat( } } +// ggml_compute_forward_repeat_back + +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } + } +} + +static void ggml_compute_forward_repeat_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_abs static void ggml_compute_forward_abs_f32( @@ -10249,6 +10554,176 @@ static void ggml_compute_forward_mul_mat( } } +// ggml_compute_forward_out_prod + + +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + //const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by last three dimensions + + // total rows in dst + const int64_t nr = ne1*ne2*ne3; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + // dst[:,:,:,:] = 0 + // for i2,i3: + // for i1: + // for i01: + // for i0: + // dst[i0,i1,i2,i3] += src0[i0,i01,i2,i3] * src1[i1,i01,i2,i3] + + for (int64_t ir = ir0; ir < ir1; ++ir) { + // dst indices + const int64_t i3 = ir/(ne2*ne1); + const int64_t i2 = (ir - i3*ne2*ne1)/ne1; + const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); + + const int64_t i02 = i2; + const int64_t i03 = i3; + + //const int64_t i10 = i1; + const int64_t i12 = i2; + const int64_t i13 = i3; + + for (int64_t i01 = 0; i01 < ne01; ++i01) { + const int64_t i11 = i01; + + float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03)); + float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); + float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); + + ggml_vec_mad_f32(ne0, d, s0, *s1); + // for (int64_t i0 = 0; i0 < ne0; ++i0) { + // d[i0] += s0[i0] * s1[i1]; + // } + } + } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} +} + +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + { + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_scale static void ggml_compute_forward_scale_f32( @@ -10671,7 +11146,11 @@ static void ggml_compute_forward_get_rows_back_f32( GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + // ggml_compute_forward_dup_same_cont(params, opt0, dst); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + } if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -10815,8 +11294,8 @@ static void ggml_compute_forward_diag_mask_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst, const float value) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 2); const int ith = params->ith; const int nth = params->nth; @@ -10824,7 +11303,7 @@ static void ggml_compute_forward_diag_mask_f32( const int n_past = ((int32_t *) src1->data)[0]; const bool inplace = (bool)((int32_t *) src1->data)[1]; - assert(n_past >= 0); + GGML_ASSERT(n_past >= 0); if (!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. @@ -10848,8 +11327,8 @@ static void ggml_compute_forward_diag_mask_f32( const int nr = src0->ne[1]; const int nz = n/nr; - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int k = 0; k < nz; k++) { for (int j = ith; j < nr; j += nth) { @@ -10985,6 +11464,101 @@ static void ggml_compute_forward_soft_max( } } +// ggml_compute_forward_soft_max_back + +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: handle transposed/permuted matrices + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float *dy = (float *)((char *) src0->data + i1*src0->nb[1]); + float *y = (float *)((char *) src1->data + i1*src1->nb[1]); + float *dx = (float *)((char *) dst->data + i1*dst->nb[1]); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(dy[i])); + assert(!isnan(y[i])); + } +#endif + // Jii = yi - yi*yi + // Jij = -yi*yj + // J = diag(y)-y.T*y + // dx = J * dy + // dxk = sum_i(Jki * dyi) + // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*dyk + // dxk = -yk * sum_i(yi * dyi) + yk*dyk + // dxk = -yk * dot(y, dy) + yk*dyk + // dxk = yk * (- dot(y, dy) + dyk) + // dxk = yk * (dyk - dot(y, dy)) + // + // post-order: + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + + // linear runtime, no additional memory + float dot_y_dy = 0; + ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(dx[i])); + assert(!isinf(dx[i])); + } +#endif + } +} + +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_alibi static void ggml_compute_forward_alibi_f32( @@ -12938,6 +13512,414 @@ static void ggml_compute_forward_flash_ff( } } +// ggml_compute_forward_flash_attn_back + +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int64_t neq0 = q->ne[0]; + const int64_t neq1 = q->ne[1]; + const int64_t neq2 = q->ne[2]; + const int64_t neq3 = q->ne[3]; + + const int64_t nek0 = k->ne[0]; + const int64_t nek1 = k->ne[1]; + //const int64_t nek2 = k->ne[2]; + //const int64_t nek3 = k->ne[3]; + + const int64_t nev0 = v->ne[0]; + const int64_t nev1 = v->ne[1]; + //const int64_t nev2 = v->ne[2]; + //const int64_t nev3 = v->ne[3]; + + const int64_t ned0 = d->ne[0]; + const int64_t ned1 = d->ne[1]; + //const int64_t ned2 = d->ne[2]; + //const int64_t ned3 = d->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nbk0 = k->nb[0]; + const int nbk1 = k->nb[1]; + const int nbk2 = k->nb[2]; + const int nbk3 = k->nb[3]; + + const int nbq0 = q->nb[0]; + const int nbq1 = q->nb[1]; + const int nbq2 = q->nb[2]; + const int nbq3 = q->nb[3]; + + const int nbv0 = v->nb[0]; + const int nbv1 = v->nb[1]; + const int nbv2 = v->nb[2]; + const int nbv3 = v->nb[3]; + + const int nbd0 = d->nb[0]; + const int nbd1 = d->nb[1]; + const int nbd2 = d->nb[2]; + const int nbd3 = d->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int mxDM = MAX(D, Mup); + + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2); + const int iq2 = ir - iq3*neq2; + for ( int iq1 = 0; iq1 < neq1; ++iq1) { + + + // not sure about CACHE_LINE_SIZE_F32.. + // - maybe it must not be multiplied by 2 and excluded from .. in SM 1*(..) offset? + float * S = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 0*(mxDM+CACHE_LINE_SIZE_F32); + float * SM = (float *) params->wdata + ith*2*(mxDM + CACHE_LINE_SIZE_F32) + 1*(mxDM+CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(SM, 1, &max, SM, 1, Mup); + vvexpf(SM, SM, &Mup); + ggml_vec_sum_f32(Mup, &sum, SM); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SR = S + i; + float * SW = SM + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SR[j] == -INFINITY) { + SW[j] = 0.0f; + } else { + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_float)val; + SW[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, SM, sum); + + } + + // step-by-step explanation + { + // forward-process shape grads from backward process + // parallel_for iq2,iq3: + // k[:D,:M,:,:] [D,M,:,:] grad[k][:D,:M,iq2,iq3] += grad[kcur] + // q[:D,:N,:,:] [D,N,:,:] grad[q][:D,iq1,iq2,iq3] += grad[qcur] + // v[:M,:D,:,:] [M,D,:,:] grad[v][:M,:D,iq2,iq3] += grad[vcur] + // for iq1: + // kcur = k[:D,:M,iq2,iq3] [D,M,1,1] grad[kcur] = grad[S1].T @ qcur + // qcur = q[:D,iq1,iq2,iq3] [D,1,1,1] grad[qcur] = grad[S1] @ kcur + // vcur = v[:M,:D,iq2,iq3] [M,D,1,1] grad[vcur] = grad[S5].T @ S4 + // S0 = -Inf [D,1,1,1] + // ~S1[i] = dot(kcur[:D,i], qcur) + // S1 = qcur @ kcur.T [M,1,1,1] grad[S1] = grad[S2] * scale + // S2 = S1 * scale [M,1,1,1] grad[S2] = diag_mask_zero(grad[S3], P) + // S3 = diag_mask_inf(S2, P) [M,1,1,1] grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // S4 = softmax(S3) [M,1,1,1] grad[S4] = grad[S5] @ vcur + // ~S5[i] = dot(vcur[:,i], S4) + // S5 = S4 @ vcur.T [D,1,1,1] grad[S5] = d[:D,iq1,iq2,iq3] + // ~dst[i,iq1,iq2,iq3] = S5[i] ^ + // dst[:D,iq1,iq2,iq3] = S5 | grad[dst[:D,iq1,iq2,iq3]] = d[:D,iq1,iq2,iq3] + // dst backward-/ grad[dst] = d + // + // output gradients with their dependencies: + // + // grad[kcur] = grad[S1].T @ qcur + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S4] = grad[S5] @ vcur + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[qcur] = grad[S1] @ kcur + // grad[vcur] = grad[S5].T @ S4 + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // in post-order: + // + // S1 = qcur @ kcur.T + // S2 = S1 * scale + // S3 = diag_mask_inf(S2, P) + // S4 = softmax(S3) + // grad[S4] = d[:D,iq1,iq2,iq3] @ vcur + // grad[S3] = S4 * (grad[S4] - dot(S4, grad[S4])) + // grad[S1] = diag_mask_zero(grad[S3], P) * scale + // grad[qcur] = grad[S1] @ kcur + // grad[kcur] = grad[S1].T @ qcur + // grad[vcur] = d[:D,iq1,iq2,iq3].T @ S4 + // + // using less variables (SM=S4): + // + // S = diag_mask_inf(qcur @ kcur.T * scale, P) + // SM = softmax(S) + // S = d[:D,iq1,iq2,iq3] @ vcur + // dot_SM_gradSM = dot(SM, S) + // S = SM * (S - dot(SM, S)) + // S = diag_mask_zero(S, P) * scale + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + } + + // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur + // S = d[:D,iq1,iq2,iq3] @ vcur + // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] + ggml_vec_set_f32(M, S, 0); + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(M, + S, + (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } + + // S = SM * (S - dot(SM, S)) + float dot_SM_gradSM = 0; + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); + + // S = diag_mask_zero(S, P) * scale + if (masked) { + // for (int64_t i = P + iq1 + 1; i < M; i++) { + // S[i] = 0; + // } + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = 0; + } + } + } + ggml_vec_scale_f32(M, S, scale); + + void * grad_q = (char *) dst->data; + void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; + void * grad_v = (char *) dst->data + nb0*D*N*neq2*neq3 + nb0*D*M*neq2*neq3; + + const size_t nbgq1 = nb0*neq0; + const size_t nbgq2 = nb0*neq0*neq1; + const size_t nbgq3 = nb0*neq0*neq1*neq2; + + const size_t nbgk1 = nb0*nek0; + const size_t nbgk2 = nb0*nek0*nek1; + const size_t nbgk3 = nb0*nek0*nek1*neq2; + + const size_t nbgv1 = nb0*nev0; + const size_t nbgv2 = nb0*nev0*nev1; + const size_t nbgv3 = nb0*nev0*nev1*neq2; + + // S shape [M,1] + // SM shape [M,1] + // kcur shape [D,M] + // qcur shape [D,1] + // vcur shape [M,D] + // + // grad[q][:D,iq1,iq2,iq3] += S @ kcur + // grad[q][:D,iq1,iq2,iq3] += shape[M,1] @ shape[D,M] + // grad[q][:D,iq1,iq2,iq3] += S[ic] * kcur[:D,ic] + // + //// grad[q][ic,iq1,iq2,iq3] += dot(kcur[:,ic],S.T) + //// grad[q][ic,iq1,iq2,iq3] += dot(k[:D,ic,iq2,iq3],S.T) + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + ggml_vec_mad_f32(D, + (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), + (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), + S[ic]); + } + + // grad[k][:D,:M,iq2,iq3] += S.T @ qcur + // grad[k][:D,ic,iq2,iq3] += S.T[0,ic] * qcur[:D,0] + // grad[k][:D,ic,iq2,iq3] += S[ic] * qcur[:D,0] + for (int64_t ic = 0; ic < M; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // ggml_vec_set_f32(D, + // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + // 0); + ggml_vec_mad_f32(D, + (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), + (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), + S[ic]); + } + + // grad[v][:M,:D,iq2,iq3] += d[:D,iq1,iq2,iq3].T @ SM + // grad[v][:M,ic,iq2,iq3] += d[:D,iq1,iq2,iq3].T[0,ic] * SM[:M] + // grad[v][:M,ic,iq2,iq3] += d[ic,iq1,iq2,iq3] * SM[:M] + for (int64_t ic = 0; ic < D; ++ic) { + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // ggml_vec_set_f32(M, + // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + // 0); + ggml_vec_mad_f32(M, + (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), + SM, + *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); + } + } + } +} + +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, + const bool masked, + struct ggml_tensor * dst) { + switch (q->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -13031,6 +14013,286 @@ static void ggml_compute_forward_map_binary( } } +// ggml_compute_forward_cross_entropy_loss + +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + + const int ith = params->ith; + const int nth = params->nth; + + float * sums = (float *) params->wdata; + + // TODO: handle transposed/permuted matrices + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + if (params->type == GGML_TASK_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f; + } + return; + } + + const double eps = 1e-9; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * st = (float *) params->wdata + nth + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + st[i] = 0.0f; + } else { + // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + st[i] = val; + } + } + + assert(sum > 0.0); + // sum = 1.0/sum; + } + // avoid log(0) by rescaling from [0..1] to [eps..1] + sum = (1.0 - eps) / sum; + ggml_vec_scale_f32(nc, st, sum); + ggml_vec_add1_f32(nc, st, st, eps); + ggml_vec_log_f32(nc, st, st); + ggml_vec_mul_f32(nc, st, st, s1); + + ggml_vec_sum_f32(nc, sums + ith, st); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(st[i])); + assert(!isinf(st[i])); + } +#endif + } + +} + +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_cross_entropy_loss_back + +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const float eps = 1e-9f; + + // TODO: handle transposed/permuted matrices + const int64_t nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + float * d = (float *) opt0->data; + + for (int64_t i1 = ir0; i1 < ir1; i1++) { + float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); + float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); + float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); + float * sm = (float *) params->wdata + ith*nc; + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + //printf("p[%d] = %f\n", i, p[i]); + assert(!isnan(s0[i])); + assert(!isnan(s1[i])); + } +#endif + // step by step explanation: + { + //float * sums = (float *) params->wdata; + + // forward pass with annotated gradients from backward pass + // (built by going in reverse operation order, adding to gradients of current operation args) + // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum + // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) + // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) + // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] + // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 + // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 + // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] + // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] + + // substitute into grad[st1], because we can reuse softmax_back from this point on + // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) + // postorder: + // grad[st1] := softmax(s0) + // grad[st1] := grad[st1]*(1.0 - eps) + // grad[st1] := grad[st1] + eps + // grad[st1] := s1 / grad[st1] + // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] + + // src0 gradients by going through softmax_back + // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) + // from softmax_back: + // dxk = yk * (dyk - dot(y, dy)) + // dot_y_dy := dot(y, dy) + // dx := dy + // dx := dx - dot_y_dy + // dx := dx * y + // postorder: + // dot_st1_dst1 := dot(st1, grad[st1]) + // grad[s0] := grad[st1] + // grad[s0] := grad[s0] - dot_st1_dst1 + // grad[s0] := grad[s0] * st1 + + // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] + // sm := softmax(s0) + // grad[s0] := sm*(1.0 - eps) + // grad[s0] := grad[s0] + eps + // grad[s0] := s1 / grad[s0] + // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] + // dot_st1_dst1 := dot(sm, grad[s0]) + // grad[s0] := grad[s0] - dot_st1_dst1 + // grad[s0] := grad[s0] * sm + } + + // soft_max + ggml_float sum = 0.0; + { + float max = -INFINITY; + ggml_vec_max_f32(nc, &max, s0); + + uint16_t scvt; + for (int i = 0; i < nc; i++) { + if (s0[i] == -INFINITY) { + sm[i] = 0.0f; + } else { + // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); + memcpy(&scvt, &s, sizeof(scvt)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; + sm[i] = val; + } + } + + assert(sum > 0.0); + sum = 1.0/sum; + } + + float dot_st1_dst1 = 0; + ggml_vec_scale_f32(nc, sm, sum); + ggml_vec_cpy_f32 (nc, ds0, sm); + ggml_vec_scale_f32(nc, ds0, (1.0f - eps)); + ggml_vec_add1_f32 (nc, ds0, ds0, eps); + ggml_vec_div_f32 (nc, ds0, s1, ds0); + ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); + ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); + ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); + ggml_vec_mul_f32 (nc, ds0, ds0, sm); + +#ifndef NDEBUG + for (int i = 0; i < nc; ++i) { + assert(!isnan(sm[i])); + assert(!isinf(sm[i])); + assert(!isnan(ds0[i])); + assert(!isinf(ds0[i])); + } +#endif + } +} + +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + + ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -13102,6 +14364,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_repeat(params, tensor->src0, tensor); } break; + case GGML_OP_REPEAT_BACK: + { + ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + } break; case GGML_OP_ABS: { ggml_compute_forward_abs(params, tensor->src0, tensor); @@ -13150,6 +14416,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_OUT_PROD: + { + ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_SCALE: { ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); @@ -13206,6 +14476,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_soft_max(params, tensor->src0, tensor); } break; + case GGML_OP_SOFT_MAX_BACK: + { + ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_ROPE: { ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); @@ -13241,6 +14515,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); } break; + case GGML_OP_FLASH_ATTN_BACK: + { + int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + } break; case GGML_OP_MAP_UNARY: { const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); @@ -13253,6 +14534,16 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + } + break; case GGML_OP_NONE: { // nop @@ -13391,11 +14682,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_mul(ctx, - tensor->grad, // this was not catched by test_grad because in test_grad tensor->grad is 1 + ggml_scale(ctx, ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), - tensor)), + tensor->grad, + tensor), + ggml_new_f32(ctx, 0.5f)), inplace); } } break; @@ -13441,43 +14732,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - GGML_ASSERT(src0->n_dims == 1 || src0->n_dims == 2); - const int nc = tensor->ne[0]; - const int nr = tensor->ne[1]; - const int nc0 = src0->ne[0]; - const int nr0 = src0->ne[1]; - const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat - const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat - // tensor->grad [nc,nr,1,1] - // reshape [nc0,nc/nc0,nr0,nr/nr0] - // permute [nc0,nr0,nc/nc0,nr/nr0] - // substitute [nc0,nr0,ncr,nrr] - // reshape [nc0*nr0,ncr*nrr,1,1] - // transpose [ncr*nrr,nc0*nr0,1,1] - // sum rows [1,nc0*nr0,1,1] - // transpose [nc0*nr0,1,1] - // reshape [nc0,nr0,1,1] reshape_1d or reshape_2d - // add to src0->grad - - int64_t ne[4] = {nc0,ncr,nr0,nrr}; - - struct ggml_tensor* F00 = tensor->grad; - struct ggml_tensor* F01 = ggml_reshape (ctx, F00, ggml_new_tensor(ctx,tensor->grad->type,4,ne)); - struct ggml_tensor* F02 = ggml_permute (ctx, F01, 0,2,1,3); - struct ggml_tensor* F03 = ggml_cont (ctx, F02); - struct ggml_tensor* F04 = ggml_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr); - struct ggml_tensor* F05 = ggml_transpose (ctx, F04); - struct ggml_tensor* F06 = ggml_cont (ctx, F05); - struct ggml_tensor* F07 = ggml_sum_rows (ctx, F06); - struct ggml_tensor* F08 = ggml_transpose (ctx, F07); - struct ggml_tensor* F09 = ggml_cont (ctx, F08); - struct ggml_tensor* F10 = ggml_reshape (ctx, F09, src0->grad); - - src0->grad = - ggml_add_impl(ctx, - src0->grad, - F10, - inplace); + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat_back(ctx, tensor->grad, src0->grad), + inplace); + } + } break; + case GGML_OP_REPEAT_BACK: + { + if (src0->grad) { + // TODO: test this + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_repeat(ctx, tensor->grad, src0->grad), + inplace); } } break; case GGML_OP_ABS: @@ -13584,38 +14852,37 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { - // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); src0->grad = ggml_add_impl(ctx, src0->grad, - // ds0 = dt.dot(s1.T) - // ggml_out_prod(ctx, // [n,m] - // src1, // [n,p] - // tensor->grad), // [m,p] - // for now just using A*B==(B.T*A.T).T - ggml_cont(ctx, // [n,m] - ggml_transpose(ctx, // [n,m] - ggml_mul_mat(ctx, // [m,n] - ggml_cont(ctx, // [p,m] - ggml_transpose(ctx, // [p,m] - tensor->grad)), // [m,p] - ggml_cont(ctx, // [p,n] - ggml_transpose(ctx, // [p,n] - src1))))), // [n,p] + ggml_out_prod(ctx, // [n,m] + src1, // [n,p] + tensor->grad), // [m,p] inplace); } if (src1->grad) { src1->grad = ggml_add_impl(ctx, src1->grad, - // ds1 = s0.T.dot(dt): - ggml_mul_mat(ctx, // [n,p] - ggml_cont(ctx, // [m,n] - ggml_transpose(ctx, src0)), // [m,n] - tensor->grad), // [m,p] + // ggml_mul_mat(ctx, // [n,p] + // ggml_cont(ctx, // [m,n] + // ggml_transpose(ctx, src0)), // [m,n] + // tensor->grad), // [m,p] + + // // when src0 is bigger than tensor->grad (this is mostly the case in llama), + // // avoid transpose of src0, rather transpose smaller tensor->grad + // // and then use ggml_out_prod + ggml_out_prod(ctx, // [n,p] + src0, // [n,m] + ggml_transpose(ctx, // [p,m] + tensor->grad)), // [m,p] inplace); } } break; + case GGML_OP_OUT_PROD: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_SCALE: { // necessary for llama @@ -13717,7 +14984,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { size_t offset; - memcpy(&offset, tensor->padding, sizeof(offset)); + + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); + memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -13744,10 +15013,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int axis0 = tensor->padding[0] & 0x3; - int axis1 = tensor->padding[1] & 0x3; - int axis2 = tensor->padding[2] & 0x3; - int axis3 = tensor->padding[3] & 0x3; + int32_t * axes = (int32_t *) tensor->opt[0]->data; + int axis0 = axes[0] & 0x3; + int axis1 = axes[1] & 0x3; + int axis2 = axes[2] & 0x3; + int axis3 = axes[3] & 0x3; int axes_backward[4] = {0,0,0,0}; axes_backward[axis0] = 0; axes_backward[axis1] = 1; @@ -13831,50 +15101,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - // y = softmax(x) - // - // Jii = yi - yi*yi - // Jij = -yi*yj - // J = diag(y)-y.*y - // dx = J * dy - // dxk = sum(Jkj * dyk) - - int64_t ne2[4] = { - tensor->ne[0], - 1, - tensor->ne[1]*tensor->ne[2], - tensor->ne[3] - }; - struct ggml_tensor * tensor2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor), - ne2[0], ne2[1], ne2[2], ne2[3])); - - struct ggml_tensor * grad2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor->grad), - ne2[0], ne2[1], ne2[2], ne2[3])); - - struct ggml_tensor * tensor2_t = ggml_cont(ctx, // [1,ne0,ne1*ne2,ne3] - ggml_permute(ctx, // [1,ne0,ne1*ne2,ne3] - tensor2, // [ne0,1,ne1*ne2,ne3] - 1, 0, 2, 3)); - src0->grad = - ggml_add_impl(ctx, - src0->grad, // [ne0,ne1,ne2,ne3] - ggml_reshape(ctx, // [ne0,ne1,ne2,ne3] - ggml_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3] - ggml_sub(ctx, // [ne0,ne0,ne1*ne2,ne3] - ggml_diag(ctx, // [ne0,ne0,ne1*ne2,ne3] - tensor2), // [ne0,1,ne1*ne2,ne3] - ggml_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3] - tensor2_t, // [1,ne0,ne1*ne2,ne3] - tensor2_t)), // [1,ne0,ne1*ne2,ne3] - grad2), // [ne0,1,ne1*ne2,ne3] - src0->grad), - inplace); + ggml_add_impl(ctx, src0->grad, + ggml_soft_max_back(ctx, tensor->grad, tensor), + inplace); } + + } break; + case GGML_OP_SOFT_MAX_BACK: + { + GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_ROPE: { @@ -13929,17 +15165,190 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_FLASH_ATTN: { - GGML_ASSERT(false); // not supported + struct ggml_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->opt[0]->grad) { + int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + flash_grad = + ggml_flash_attn_back(ctx, + src0, + src1, + tensor->opt[0], + tensor->grad, + masked); + } + + if (src0->grad) { + struct ggml_tensor * grad_q = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = 0; + switch(src0->n_dims) { + case 2: + { + grad_q = ggml_view_2d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + nb0*src0->ne[0], + offset); + } break; + case 3: + { + grad_q = ggml_view_3d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + offset); + } break; + case 4: + { + grad_q = ggml_view_4d(ctx, + flash_grad, + src0->ne[0], + src0->ne[1], + src0->ne[2], + src0->ne[3], + nb0*src0->ne[0], + nb0*src0->ne[0]*src0->ne[1], + nb0*src0->ne[0]*src0->ne[1]*src0->ne[2], + offset); + } break; + } + + src0->grad = ggml_add_impl(ctx, + src0->grad, + grad_q, + inplace); + } + + if (src1->grad) { + struct ggml_tensor * grad_k = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; + switch(src1->n_dims) { + case 2: + { + grad_k = ggml_view_2d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + nb0*src1->ne[0], + offset); + } break; + case 3: + { + grad_k = ggml_view_3d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + offset); + } break; + case 4: + { + grad_k = ggml_view_4d(ctx, + flash_grad, + src1->ne[0], + src1->ne[1], + src1->ne[2], + src1->ne[3], + nb0*src1->ne[0], + nb0*src1->ne[0]*src1->ne[1], + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2], + offset); + } break; + } + + src1->grad = ggml_add_impl(ctx, + src1->grad, + grad_k, + inplace); + } + + struct ggml_tensor * opt0 = tensor->opt[0]; + + if (opt0->grad) { + struct ggml_tensor * grad_v = NULL; + const size_t nb0 = flash_grad->nb[0]; + const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] + + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; + switch(opt0->n_dims) { + case 2: + { + grad_v = ggml_view_2d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + nb0*opt0->ne[0], + offset); + } break; + case 3: + { + grad_v = ggml_view_3d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + offset); + } break; + case 4: + { + grad_v = ggml_view_4d(ctx, + flash_grad, + opt0->ne[0], + opt0->ne[1], + opt0->ne[2], + opt0->ne[3], + nb0*opt0->ne[0], + nb0*opt0->ne[0]*opt0->ne[1], + nb0*opt0->ne[0]*opt0->ne[1]*opt0->ne[2], + offset); + } break; + } + + opt0->grad = ggml_add_impl(ctx, + opt0->grad, + grad_v, + inplace); + } } break; case GGML_OP_FLASH_FF: { GGML_ASSERT(false); // not supported } break; + case GGML_OP_FLASH_ATTN_BACK: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { GGML_ASSERT(false); // not supported } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_cross_entropy_loss_back(ctx, + src0, + src1, + tensor->grad), + inplace); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + GGML_ASSERT(false); // not supported + } break; case GGML_OP_NONE: { // nop @@ -14316,6 +15725,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: case GGML_OP_ABS: case GGML_OP_SGN: case GGML_OP_NEG: @@ -14335,6 +15745,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->n_tasks = n_threads; } break; case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: { node->n_tasks = n_threads; @@ -14417,6 +15828,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { @@ -14496,6 +15908,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + node->n_tasks = n_threads; + + size_t cur = 0; + + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + } + + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + } + work_size = MAX(work_size, cur); } break; case GGML_OP_MAP_UNARY: @@ -14503,6 +15936,22 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + node->n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + node->n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + + work_size = MAX(work_size, cur); + } break; case GGML_OP_NONE: { node->n_tasks = 1; @@ -15478,6 +16927,7 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g static enum ggml_opt_result ggml_opt_adam( struct ggml_context * ctx, + struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, @@ -15503,25 +16953,29 @@ static enum ggml_opt_result ggml_opt_adam( } } + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { + int iter = opt->iter; + ggml_opt_init(opt->ctx, opt, params, nx); + opt->iter = iter; + } + // constants - const float alpha = params.adam.alpha; + const float sched = params.adam.sched; + const float decay = params.adam.decay * sched; + const float alpha = params.adam.alpha * sched; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters - float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient - float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared - float * m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment - float * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment - float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat - float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat + float * x = opt->adam.x->data; // view of the parameters + float * g1 = opt->adam.g1->data; // gradient + float * g2 = opt->adam.g2->data; // gradient squared + float * m = opt->adam.m->data; // first moment + float * v = opt->adam.v->data; // second moment + float * mh = opt->adam.mh->data; // first moment hat + float * vh = opt->adam.vh->data; // second moment hat - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values - - // initialize - ggml_vec_set_f32(nx, m, 0.0f); - ggml_vec_set_f32(nx, v, 0.0f); + float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values // update view ggml_opt_get_params(np, ps, x); @@ -15531,16 +16985,27 @@ static enum ggml_opt_result ggml_opt_adam( ggml_set_f32 (f->grad, 1.0f); ggml_graph_compute(ctx, gb); - float fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_prev = ggml_get_f32_1d(f, 0); + opt->adam.fx_best = opt->adam.fx_prev; if (pf) { - pf[0] = fx_prev; + pf[opt->iter % params.past] = opt->adam.fx_prev; } - int n_no_improvement = 0; - float fx_best = fx_prev; + // initialize + if (opt->just_initialized) { + opt->adam.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->adam.fx_best; + float * fx_prev = &opt->adam.fx_prev; + int * n_no_improvement = &opt->adam.n_no_improvement; + + int iter0 = opt->iter; // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { + opt->iter = iter0 + t + 1; GGML_PRINT_DEBUG ("=== iter %d ===\n", t); GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); @@ -15574,17 +17039,22 @@ static enum ggml_opt_result ggml_opt_adam( // m^hat = m_t / (1 - beta1^t) // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps) + // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) + // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 + // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) + // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) + // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) ggml_vec_cpy_f32 (nx, mh, m); ggml_vec_cpy_f32 (nx, vh, v); - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); + ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); + ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); ggml_vec_sqrt_f32 (nx, vh, vh); ggml_vec_acc1_f32 (nx, vh, eps); ggml_vec_div_f32 (nx, mh, mh, vh); + ggml_vec_scale_f32(nx, x, 1.0f - decay); ggml_vec_sub_f32 (nx, x, x, mh); // update the parameters @@ -15598,7 +17068,7 @@ static enum ggml_opt_result ggml_opt_adam( const float fx = ggml_get_f32_1d(f, 0); // check convergence - if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { + if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); return GGML_OPT_OK; @@ -15607,32 +17077,32 @@ static enum ggml_opt_result ggml_opt_adam( // delta-based convergence test if (pf != NULL) { // need at least params.past iterations to start checking for convergence - if (params.past <= t) { - const float rate = (pf[t%params.past] - fx)/fx; + if (params.past <= iter0 + t) { + const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } - pf[t%params.past] = fx; + pf[(iter0 + t)%params.past] = fx; } // check for improvement if (params.max_no_improvement > 0) { - if (fx_best > fx) { - fx_best = fx; - n_no_improvement = 0; + if (fx_best[0] > fx) { + fx_best[0] = fx; + n_no_improvement[0] = 0; } else { - ++n_no_improvement; + ++n_no_improvement[0]; - if (n_no_improvement >= params.max_no_improvement) { + if (n_no_improvement[0] >= params.max_no_improvement) { return GGML_OPT_OK; } } } - fx_prev = fx; + fx_prev[0] = fx; { const int64_t t_end_cpu = ggml_cycles(); @@ -15771,6 +17241,7 @@ static enum ggml_opt_result linesearch_backtracking( static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_context * ctx, + struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, @@ -15803,31 +17274,32 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters - float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters - float * g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient - float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient - float * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction + if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) { + int iter = opt->iter; + ggml_opt_init(ctx, opt, params, nx); + opt->iter = iter; + } - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + float * x = opt->lbfgs.x->data; // current parameters + float * xp = opt->lbfgs.xp->data; // previous parameters + float * g = opt->lbfgs.g->data; // current gradient + float * gp = opt->lbfgs.gp->data; // previous gradient + float * d = opt->lbfgs.d->data; // search direction + + float * pf = params.past > 0 ? opt->lbfgs.pf->data : NULL; // past function values float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| float gnorm = 0.0f; // ||g|| - float step = 0.0f; // initialize x from the graph nodes ggml_opt_get_params(np, ps, x); // the L-BFGS memory - struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m); - - for (int i = 0; i < m; ++i) { - lm[i].alpha = 0.0f; - lm[i].ys = 0.0f; - lm[i].s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - lm[i].y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - } + float * lm_alpha = opt->lbfgs.lmal->data; + float * lm_ys = opt->lbfgs.lmys->data; + float * lm_s = opt->lbfgs.lms->data; + float * lm_y = opt->lbfgs.lmy->data; // evaluate the function value and its gradient { @@ -15842,12 +17314,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( fx = ggml_get_f32_1d(f, 0); } - if (pf) { - pf[0] = fx; - } - - float fx_best = fx; - // search direction = -gradient ggml_vec_neg_f32(nx, d, g); @@ -15864,26 +17330,43 @@ static enum ggml_opt_result ggml_opt_lbfgs( return GGML_OPT_OK; } - // initial step - ggml_vec_norm_inv_f32(nx, &step, d); + if (opt->just_initialized) { + if (pf) { + pf[0] = fx; + } + opt->lbfgs.fx_best = fx; - int j = 0; - int k = 1; - int ls = 0; - int end = 0; - int bound = 0; - int n_no_improvement = 0; + // initial step + ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + opt->lbfgs.j = 0; + opt->lbfgs.k = 1; + opt->lbfgs.end = 0; + opt->lbfgs.n_no_improvement = 0; + opt->just_initialized = false; + } + + float * fx_best = &opt->lbfgs.fx_best; + float * step = &opt->lbfgs.step; + int * j = &opt->lbfgs.j; + int * k = &opt->lbfgs.k; + int * end = &opt->lbfgs.end; + int * n_no_improvement = &opt->lbfgs.n_no_improvement; + + int ls = 0; + int bound = 0; float ys = 0.0f; float yy = 0.0f; float beta = 0.0f; + int it = 0; + while (true) { // store the current position and gradient vectors ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -15909,32 +17392,32 @@ static enum ggml_opt_result ggml_opt_lbfgs( // delta-based convergence test if (pf != NULL) { // need at least params.past iterations to start checking for convergence - if (params.past <= k) { - const float rate = (pf[k%params.past] - fx)/fx; + if (params.past <= k[0]) { + const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } - pf[k%params.past] = fx; + pf[k[0]%params.past] = fx; } // check for improvement if (params.max_no_improvement > 0) { - if (fx < fx_best) { - fx_best = fx; - n_no_improvement = 0; + if (fx < fx_best[0]) { + fx_best[0] = fx; + n_no_improvement[0] = 0; } else { - n_no_improvement++; + n_no_improvement[0]++; - if (n_no_improvement >= params.max_no_improvement) { + if (n_no_improvement[0] >= params.max_no_improvement) { return GGML_OPT_OK; } } } - if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { + if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations return GGML_OPT_DID_NOT_CONVERGE; } @@ -15943,50 +17426,51 @@ static enum ggml_opt_result ggml_opt_lbfgs( // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. // y_{k+1} = g_{k+1} - g_{k}. // - ggml_vec_sub_f32(nx, lm[end].s, x, xp); - ggml_vec_sub_f32(nx, lm[end].y, g, gp); + ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); // compute scalars ys and yy: // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); - ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); - lm[end].ys = ys; + lm_ys[end[0]] = ys; // find new search direction // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS - bound = (m <= k) ? m : k; - k++; - end = (end + 1)%m; + bound = (m <= k[0]) ? m : k[0]; + k[0]++; + it++; + end[0] = (end[0] + 1)%m; // initialize search direction with -g ggml_vec_neg_f32(nx, d, g); - j = end; + j[0] = end[0]; for (int i = 0; i < bound; ++i) { - j = (j + m - 1) % m; + j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); - lm[j].alpha /= lm[j].ys; + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} - ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); + ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); } ggml_vec_scale_f32(nx, d, ys/yy); for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, lm[j].y, d); - beta /= lm[j].ys; + ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); - j = (j + 1)%m; + ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + j[0] = (j[0] + 1)%m; } - step = 1.0; + step[0] = 1.0; } return GGML_OPT_DID_NOT_CONVERGE; @@ -16011,6 +17495,8 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .adam = { .n_iter = 10000, + .sched = 1.000f, + .decay = 0.001f, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, @@ -16053,6 +17539,71 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { return result; } +GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx) { + opt->ctx = ctx; + opt->params = params; + opt->iter = 0; + opt->nx = nx; + opt->just_initialized = true; + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.pf = params.past > 0 + ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + ggml_set_zero(opt->adam.x); + ggml_set_zero(opt->adam.g1); + ggml_set_zero(opt->adam.g2); + ggml_set_zero(opt->adam.m); + ggml_set_zero(opt->adam.v); + ggml_set_zero(opt->adam.mh); + ggml_set_zero(opt->adam.vh); + if (opt->adam.pf) { + ggml_set_zero(opt->adam.pf); + } + } break; + case GGML_OPT_LBFGS: + { + opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.pf = params.past > 0 + ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) + : NULL; + opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + ggml_set_zero(opt->lbfgs.x); + ggml_set_zero(opt->lbfgs.xp); + ggml_set_zero(opt->lbfgs.g); + ggml_set_zero(opt->lbfgs.gp); + ggml_set_zero(opt->lbfgs.d); + ggml_set_zero(opt->lbfgs.pf); + if (opt->lbfgs.pf) { + ggml_set_zero(opt->lbfgs.pf); + } + ggml_set_zero(opt->lbfgs.lmal); + ggml_set_zero(opt->lbfgs.lmys); + ggml_set_zero(opt->lbfgs.lms); + ggml_set_zero(opt->lbfgs.lmy); + } break; + } +} + enum ggml_opt_result ggml_opt( struct ggml_context * ctx, struct ggml_opt_params params, @@ -16075,30 +17626,10 @@ enum ggml_opt_result ggml_opt( enum ggml_opt_result result = GGML_OPT_OK; - // build forward + backward compute graphs - struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true); + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); - switch (params.type) { - case GGML_OPT_ADAM: - { - result = ggml_opt_adam(ctx, params, f, &gf, &gb); - } break; - case GGML_OPT_LBFGS: - { - result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb); - } break; - } - - if (params.print_forward_graph) { - ggml_graph_print (&gf); - ggml_graph_dump_dot(&gf, NULL, "opt-forward.dot"); - } - - if (params.print_backward_graph) { - ggml_graph_print (&gb); - ggml_graph_dump_dot(&gb, &gf, "opt-backward.dot"); - } + ggml_opt_init(ctx, opt, params, 0); + result = ggml_opt_resume(ctx, opt, f); if (free_ctx) { ggml_free(ctx); @@ -16107,6 +17638,58 @@ enum ggml_opt_result ggml_opt( return result; } +enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f) { + + // build forward + backward compute graphs + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); + + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; + + *gf = ggml_build_forward (f); + *gb = ggml_build_backward(ctx, gf, true); + + return ggml_opt_resume_g(ctx, opt, f, gf, gb); +} + +enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb) { + + // build forward + backward compute graphs + enum ggml_opt_result result = GGML_OPT_OK; + + switch (opt->params.type) { + case GGML_OPT_ADAM: + { + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + } break; + case GGML_OPT_LBFGS: + { + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); + } break; + } + + if (opt->params.print_forward_graph) { + ggml_graph_print (gf); + ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); + } + + if (opt->params.print_backward_graph) { + ggml_graph_print (gb); + ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); + } + + return result; +} + //////////////////////////////////////////////////////////////////////////////// size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { diff --git a/ggml.h b/ggml.h index 1b26da3ad..f2a91761b 100644 --- a/ggml.h +++ b/ggml.h @@ -296,6 +296,7 @@ extern "C" { GGML_OP_SUM_ROWS, GGML_OP_MEAN, GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, GGML_OP_ABS, GGML_OP_SGN, GGML_OP_NEG, @@ -309,6 +310,7 @@ extern "C" { GGML_OP_RMS_NORM_BACK, GGML_OP_MUL_MAT, + GGML_OP_OUT_PROD, GGML_OP_SCALE, GGML_OP_SET, @@ -324,6 +326,7 @@ extern "C" { GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, GGML_OP_ROPE, GGML_OP_ROPE_BACK, GGML_OP_ALIBI, @@ -333,10 +336,14 @@ extern "C" { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + GGML_OP_COUNT, }; @@ -574,6 +581,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_acc( struct ggml_context * ctx, struct ggml_tensor * a, @@ -645,6 +657,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_abs( struct ggml_context * ctx, struct ggml_tensor * a); @@ -698,14 +715,22 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // A: m rows, n columns - // B: p rows, n columns (i.e. we transpose it internally) + // A: n columns, m rows + // B: n columns, p rows (i.e. we transpose it internally) // result is m columns, p rows GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // // operations on tensors without backpropagation // @@ -916,6 +941,17 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // rotary position embedding // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style @@ -982,6 +1018,14 @@ extern "C" { struct ggml_tensor * v, bool masked); + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + GGML_API struct ggml_tensor * ggml_flash_ff( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1005,6 +1049,19 @@ extern "C" { struct ggml_tensor * b, ggml_binary_op_f32_t fun); + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + // // automatic differentiation // @@ -1099,6 +1156,8 @@ extern "C" { struct { int n_iter; + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable float alpha; // learning rate float beta1; float beta2; @@ -1123,6 +1182,49 @@ extern "C" { } lbfgs; }; + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + struct { + struct ggml_tensor * x; // view of the parameters + struct ggml_tensor * g1; // gradient + struct ggml_tensor * g2; // gradient squared + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * mh; // first moment hat + struct ggml_tensor * vh; // second moment hat + struct ggml_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); // optimize the function defined by the tensor f @@ -1131,6 +1233,27 @@ extern "C" { struct ggml_opt_params params, struct ggml_tensor * f); + // initialize optimizer context + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb); + // // quantization // diff --git a/llama.cpp b/llama.cpp index c7a333642..d2a52bb0c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1036,6 +1036,12 @@ static void llama_model_load_internal( case 40: model.type = e_model::MODEL_13B; break; case 60: model.type = e_model::MODEL_30B; break; case 80: model.type = e_model::MODEL_65B; break; + default: + { + if (hparams.n_layer < 32) { + model.type = e_model::MODEL_7B; + } + } break; } hparams.n_ctx = n_ctx; @@ -1200,6 +1206,7 @@ static void llama_model_load_internal( mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); (void) vram_scratch; + (void) n_batch; #ifdef GGML_USE_CUBLAS vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); @@ -1227,6 +1234,7 @@ static void llama_model_load_internal( model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); } + (void) tensor_split; #if defined(GGML_USE_CUBLAS) { ggml_cuda_set_tensor_split(tensor_split); @@ -2161,6 +2169,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok return -log2f(candidate.p) > *mu; })); + if (candidates->size == 0) { + candidates->size = 1; + } + // Normalize the probabilities of the remaining words llama_sample_softmax(ctx, candidates); @@ -3287,6 +3299,19 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } +int llama_get_vocab( + const struct llama_context * ctx, + const char * * strings, + float * scores, + int capacity) { + int n = std::min(capacity, (int) ctx->vocab.id_to_token.size()); + for (int i = 0; ivocab.id_to_token[i].tok.c_str(); + scores[i] = ctx->vocab.id_to_token[i].score; + } + return n; +} + float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } diff --git a/llama.h b/llama.h index 7c7fd481c..61f6c867d 100644 --- a/llama.h +++ b/llama.h @@ -220,6 +220,14 @@ extern "C" { LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + // Get the vocabulary as output parameters. + // Returns number of results. + LLAMA_API int llama_get_vocab( + const struct llama_context * ctx, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token diff --git a/tests/test-grad0.c b/tests/test-grad0.c index ec5059220..c8c2c0f71 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -5,7 +5,7 @@ #include #include -#define MAX_NARGS 2 +#define MAX_NARGS 3 #undef MIN #undef MAX @@ -1090,6 +1090,25 @@ int main(int argc, const char ** argv) { } } + // cross_entropy_loss + { + const int nargs = 1; + + int64_t ne2[4]; + get_random_dims(ne2, 4); + + for (int ndims = 1; ndims <= 3; ++ndims) { + x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); + x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f); + ggml_set_param(ctx0, x[0]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); + + check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY); + // finite differences regularly fails! + } + } + // rope { const int nargs = 1; @@ -1124,6 +1143,45 @@ int main(int argc, const char ** argv) { } } + // flash_attn + { + const int nargs = 3; + + int64_t ne2[4]; + + get_random_dims(ne2, 4); + int64_t D = ne2[0]; + int64_t N = ne2[1]; + int64_t M = ne2[2] + N; + int64_t B = ne2[3]; + + for (int masked = 0; masked <= 1; ++masked) { + for (int ndims = 2; ndims <= 4; ++ndims) { + int64_t neq[4] = { D, N, B, ne[3] }; + int64_t nek[4] = { D, M, B, ne[3] }; + int64_t nev[4] = { M, D, B, ne[3] }; + if (ndims == 2) { + neq[2] = 1; neq[3] = 1; + nek[2] = 1; nek[3] = 1; + nev[2] = 1; nev[3] = 1; + } else if (ndims == 3) { + neq[3] = 1; + nek[3] = 1; + nev[3] = 1; + } + x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f); + x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f); + x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f); + ggml_set_param(ctx0, x[0]); + ggml_set_param(ctx0, x[1]); + ggml_set_param(ctx0, x[2]); + + struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); + + check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + } + } + } ggml_free(ctx0); } From 92549202659fc23ba9fec5e688227d0da9b06b40 Mon Sep 17 00:00:00 2001 From: 0xspringtime <110655352+0xspringtime@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:37:54 -0400 Subject: [PATCH 03/21] baby-llama : fix operator!= (#1821) * Update baby-llama.cpp Seems to be an error in the implementation of the operator!= function. It attempts to compare the this pointer (a llama_hparams_lora object) with the other pointer (a llama_hparams object) using memcmp. This can lead to incorrect results because the sizes of the objects being compared (sizeof(llama_hparams) and sizeof(llama_hparams_lora)) are different, should now be able to compare two llama_hparams_lora objects for inequality. * Update baby-llama.cpp * Update baby-llama.cpp --- examples/baby-llama/baby-llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index e5639da37..0add6adc0 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -153,8 +153,8 @@ struct llama_hparams_lora { uint32_t n_rot = 64; uint32_t n_lora = 64; - bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); + bool operator!=(const llama_hparams_lora & other) const { + return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0; } }; From 254a7a7a5ff4c874ff8488f1f5cbdd7e9c89d682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 14 Jun 2023 19:47:19 +0200 Subject: [PATCH 04/21] CUDA full GPU acceleration, KV cache in VRAM (#1827) * Fixed CUDA RoPE * ggml_cuda_mul_mat_vec_p021 * ggml_cuda_scale * ggml_cuda_diag_mask_inf * ggml_is_permuted * ggml_cuda_cpy * flatten rows for ggml_cuda_op * Added a --low-vram option * Fixed Windows performance * Fixed LLAMA_CUDA_DMMV_Y > 1 for WizardLM --- examples/common.cpp | 8 + examples/common.h | 17 +- examples/main/README.md | 1 + examples/server/README.md | 1 + examples/server/server.cpp | 9 + ggml-cuda.cu | 797 ++++++++++++++++++++++++++++++++----- ggml-cuda.h | 2 + ggml.c | 6 + ggml.h | 1 + llama.cpp | 159 ++++++-- llama.h | 1 + 11 files changed, 853 insertions(+), 149 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index df69f2736..dc69e5373 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -331,6 +331,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } #else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--low-vram" || arg == "-lv") { +#ifdef GGML_USE_CUBLAS + params.low_vram = true; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -479,6 +485,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); + fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); @@ -528,6 +535,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { lparams.n_gpu_layers = params.n_gpu_layers; lparams.main_gpu = params.main_gpu; memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float)); + lparams.low_vram = params.low_vram; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.use_mmap = params.use_mmap; diff --git a/examples/common.h b/examples/common.h index 6fedb414a..6c2953cb2 100644 --- a/examples/common.h +++ b/examples/common.h @@ -21,15 +21,16 @@ int32_t get_num_physical_cores(); struct gpt_params { - int32_t seed = -1; // RNG seed - int32_t n_threads = get_num_physical_cores(); - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_gpu_layers = 0; // number of layers to store in VRAM - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + int32_t seed = -1; // RNG seed + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_gpu_layers = 0; // number of layers to store in VRAM + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs + bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance // sampling parameters std::unordered_map logit_bias; // logit bias for specific tokens diff --git a/examples/main/README.md b/examples/main/README.md index 149d507a8..b6d3212fe 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/server/README.md b/examples/server/README.md index b011302fc..7dabac9cf 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -289,6 +289,7 @@ Test(); - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`; - `--port`: Set the port to listen. Default: `8080`. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 31d8087ef..872750053 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms) fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); + fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); #endif fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); @@ -537,6 +538,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para } #else fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); +#endif // GGML_USE_CUBLAS + } + else if (arg == "--low-vram" || arg == "-lv") + { +#ifdef GGML_USE_CUBLAS + params.low_vram = true; +#else + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3b9a5ddfb..0565571f4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -48,6 +49,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1); typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); typedef void (*ggml_cuda_op_t)( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, @@ -151,7 +153,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_CPY_BLOCK_SIZE 32 +#define CUDA_SCALE_BLOCK_SIZE 256 #define CUDA_ROPE_BLOCK_SIZE 256 +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec @@ -655,10 +660,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) } template -static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block - const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + const int tid = threadIdx.x; const int iter_stride = 2*GGML_CUDA_DMMV_X; @@ -703,8 +713,13 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, } template -static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols) { - const int row = blockIdx.x*blockDim.y + threadIdx.y; +static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) { + const int row = blockIdx.y*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + const int tid = threadIdx.x; const int iter_stride = QK_K; @@ -737,6 +752,139 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y } } +static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { + const half * x = (half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; + } + + // x is transposed and permuted + const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + + // y is not transposed but permuted + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // dst is not transposed and not permuted + const int idst = channel*nrows_dst + row_dst; + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const int row_stride_x, const int nchannels_x, const int channel_stride_x) { + + const half * x = (half *) vx; + + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + const int idst = channel*nrows_dst + row_dst; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) { + const int col_x = col_x0 + threadIdx.x; + + if (col_x >= ncols_x) { + break; + } + + const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x; + const float xi = __half2float(x[ix]); + + const int row_y = col_x; + + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (threadIdx.x == 0) { + dst[idst] = tmp; + } +} + +static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (float *) cxi; + half * dsti = (half *) cdsti; + + *dsti = __float2half(*xi); +} + +template +static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = i - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = i - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +// rope == RoPE == rotary positional embedding static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) { const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); @@ -758,6 +906,72 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } +static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int row = blockDim.y*blockIdx.y + threadIdx.y; + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + // dst[i] = col > n_past + row ? -INFINITY : x[i]; + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU +} + +// the CUDA soft max implementation differs from the CPU implementation +// instead of doubles floats are used +// values are also not normalized to the maximum value by subtracting it in the exponential function +// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine +static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) { + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int block_size = blockDim.x; + const int tid = threadIdx.x; + + float tmp = 0.0; + + for (int block_start = 0; block_start < ncols; block_start += block_size) { + const int col = block_start + tid; + + if (col >= ncols) { + break; + } + + const int i = row*ncols + col; + const float val = expf(x[i]); + tmp += val; + dst[i] = val; + } + + // sum up partial sums + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + for (int block_start = 0; block_start < ncols; block_start += block_size) { + const int col = block_start + tid; + + if (col >= ncols) { + break; + } + + const int i = row*ncols + col; + dst[i] /= tmp; + } +} + +static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; add_f32<<>>(x, y, dst, k); @@ -831,73 +1045,92 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols); + <<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); const int ny = 2; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<(nrows + ny - 1)/ny, block_dims, 0, stream>>>(vx, y, dst, ncols); + dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols); + const int ny = 2; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols); + const int ny = 2; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols); + const int ny = 2; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 2, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols); + const int ny = 2; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols, nrows); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -907,10 +1140,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); + const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; + const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1); dequantize_mul_mat_vec<1, 1, convert_f16> - <<>>(vx, y, dst, ncols); + <<>>(vx, y, dst, ncols, nrows); } static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { @@ -942,6 +1176,47 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { } } +static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) { + const dim3 block_nums(1, nrows_x, nchannels_x); + const dim3 block_dims(WARP_SIZE, 1, 1); + mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x); +} + +static void ggml_mul_mat_vec_nc_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, + const int nchannels_x, const int channel_stride_x, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_x); + const dim3 block_dims(WARP_SIZE, 1, 1); + mul_mat_vec_nc_f16_f32<<>> + (vx, y, dst, ncols_x, nrows_x, row_stride_x, nchannels_x, channel_stride_x); +} + +static void ggml_cpy_f32_f32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void ggml_cpy_f32_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); +} + +static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + scale_f32<<>>(x, dst, scale, k); +} + static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(nrows % 2 == 0); const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); @@ -950,6 +1225,19 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i rope_f32<<>>(x, dst, ncols, p, theta_scale); } +static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { + const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const dim3 block_nums(block_num_x, nrows_x, 1); + diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); +} + +static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(1, nrows_x, 1); + soft_max_f32<<>>(x, dst, ncols_x); +} + // buffer pool for cuda #define MAX_CUDA_BUFFERS 256 @@ -1120,10 +1408,25 @@ void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } -static cudaError_t ggml_cuda_h2d_tensor_2d( +static cudaError_t ggml_cuda_cpy_tensor_2d( void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { - char * dst_char = (char *) dst; + cudaMemcpyKind kind; + char * src_ptr; + if (src->backend == GGML_BACKEND_CPU) { + kind = cudaMemcpyHostToDevice; + src_ptr = (char *) src->data; + } else if (src->backend == GGML_BACKEND_GPU) { + kind = cudaMemcpyDeviceToDevice; + struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + src_ptr = (char *) extra->data_device[id]; + } else { + GGML_ASSERT(false); + } + char * dst_ptr = (char *) dst; + const int64_t ne0 = src->ne[0]; const int64_t nb0 = src->nb[0]; const int64_t nb1 = src->nb[1]; @@ -1134,17 +1437,17 @@ static cudaError_t ggml_cuda_h2d_tensor_2d( const int64_t bs = ggml_blck_size(type); int64_t i1_diff = i1_high - i1_low; - const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3); + const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; if (nb0 == ts && nb1 == ts*ne0/bs) { - return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream); + return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); } else if (nb0 == ts) { - return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream); + return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); } else { for (int64_t i1 = 0; i1 < i1_diff; i1++) { const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) (dst_char + i1*ts*ne0/bs); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); // pretend the row is a matrix with cols=1 - cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream); + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); if (r != cudaSuccess) return r; } return cudaSuccess; @@ -1380,8 +1683,81 @@ inline void ggml_cuda_op_rope( (void) i1; } +inline void ggml_cuda_op_diag_mask_inf( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t i01_diff = i01_high - i01_low; + + const int n_past = ((int32_t *) src1->data)[0]; + + // compute + diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_soft_max( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_scale( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const float scale = ((float *) src1->data)[0]; + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + scale_f32_cuda(src0_ddf_i, dst_ddf_i, scale, ne00*i01_diff, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - ggml_cuda_op_t op, bool src0_needs_f32) { + ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1404,21 +1780,27 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); // strides for iteration over dims 3 and 2 - const int64_t src0_stride = ne00 * ne01; - const int64_t src1_stride = ne10 * ne11; - const int64_t dst_stride = ne0 * ne1; - const int64_t num_iters = ne02 * ne03; + const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03; + const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1; + const int64_t src0_stride = ne00 * ne01 * stride_mod; + const int64_t src1_stride = ne10 * ne11 * stride_mod; + const int64_t dst_stride = ne0 * ne1 * stride_mod; const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type); - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src0_is_f32 = src0->type == GGML_TYPE_F32; + const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1); + const bool src1_stays_on_host = use_src1 && ( + dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); @@ -1427,13 +1809,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; - float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // asq = actual size quantized, asf = actual size float size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0}; size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0}; size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; - size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; for (int id = 0; id < g_device_count; ++id) { if (!split && id != g_main_device) { @@ -1446,9 +1828,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t row_low, row_high; if (split) { row_low = id == 0 ? 0 : nrows0*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_DMMV_Y; row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_DMMV_Y; } else { row_low = 0; row_high = nrows0; @@ -1461,7 +1841,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm cudaSetDevice(id); - if (src0_on_device) { + if (src0_on_device && src0_is_contiguous) { if (src0_is_f32) { src0_ddf[id] = (float *) src0_extra->data_device[id]; } else { @@ -1479,8 +1859,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); } - if (use_src1) { - if (src1_on_device) { + if (use_src1 && !src1_stays_on_host) { + if (src1_on_device && src1_is_contiguous) { src1_ddf[id] = (float *) src1_extra->data_device[id]; } else { src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]); @@ -1493,26 +1873,32 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); } - for (int64_t i03 = 0; i03 < ne03; i03++) { + const int64_t i03_max = flatten_rows ? 1 : ne03; + const int64_t i02_max = flatten_rows ? 1 : ne02; + const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; + + for (int64_t i03 = 0; i03 < i03_max; i03++) { const int64_t i13 = i03 % ne13; - for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i02 = 0; i02 < i02_max; i02++) { const int64_t i12 = i02 % ne12; const int64_t i0 = i03*ne02 + i02; - const int64_t i0_offset_low = row_low/ne01; - const int64_t i0_offset_high = row_high/ne01; + + // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs + const int64_t i0_offset_low = row_low/rows_per_iter; + const int64_t i0_offset_high = row_high/rows_per_iter; int64_t i01_low = 0; - int64_t i01_high = ne01; + int64_t i01_high = rows_per_iter; if (split) { if (i0 < i0_offset_low || i0 > i0_offset_high) { continue; } if (i0 == i0_offset_low) { - i01_low = row_low % ne01; + i01_low = row_low % rows_per_iter; } if (i0 == i0_offset_high) { - i01_high = row_high % ne01; + i01_high = row_high % rows_per_iter; } } @@ -1521,7 +1907,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output. // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU). GGML_ASSERT(i01_low == 0 || g_device_count > 1); - GGML_ASSERT(i01_high == ne01 || g_device_count > 1); + GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1); const int64_t i01_diff = i01_high - i01_low; if (i01_diff == 0) { @@ -1529,24 +1915,23 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } const int64_t i11 = i13*ne12 + i12; - cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS]; + cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS]; cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS]; - cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS]; + cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS]; // for split tensors the data begins at i0 == i0_offset_low char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride; float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; - float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; + float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; // for split tensors the data pointer needs to be rounded down // to the bin edge for i03, i02 bins beyond the first if (i0 - i0_offset_low > 0) { + GGML_ASSERT(!flatten_rows); src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs; src0_ddf_i -= (row_low % ne01)*ne00; - } - if (i0 - i0_offset_low > 0) { - dst_ddf_i -= (row_low % ne0)*ne1; + dst_ddf_i -= (row_low % ne0)*ne1; } // the main device memory buffer can be on VRAM scratch, with space for all partial results @@ -1556,30 +1941,37 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } // copy src0, src1 to device if necessary - if (use_src1) { + if (use_src1 && !src1_stays_on_host) { if (src1->backend == GGML_BACKEND_CPU) { - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1)); - } else if (src1->backend == GGML_BACKEND_GPU) { + GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1)); + int64_t nrows1 = flatten_rows ? nrows0 : ne11; + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1)); + } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { if (id != g_main_device) { + GGML_ASSERT(!flatten_rows); float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; src1_ddf_i_source += i11*src1_stride; CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1)); } + } else if (src1_on_device && !src1_is_contiguous) { + GGML_ASSERT(!split); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main)); } else { GGML_ASSERT(false); } } CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1)); - if (!src0_on_device) { + + if (!src0_on_device || !src0_is_contiguous) { if (src0_is_f32) { - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); } else { - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); } } - // convert src0 to f32 if it's necessary for the ggml_cuda_op + // convert src0 to f32 if it is necessary for the ggml_cuda_op if (src0_needs_f32 && !src0_is_f32) { to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main); CUDA_CHECK(cudaGetLastError()); @@ -1644,39 +2036,30 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true, true); } void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten } void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); } void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); } bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU); const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) { - // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n", - // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); - // return false; - // } - // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && @@ -1688,23 +2071,158 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te return false; } +void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation + GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main); + + CUDA_CHECK(cudaDeviceSynchronize()); +} + +void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); + GGML_ASSERT(!ggml_is_permuted(src0)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + const int row_stride_x = nb01 / sizeof(half); + const int channel_stride_x = nb02 / sizeof(half); + + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main); + + CUDA_CHECK(cudaDeviceSynchronize()); +} + void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - if (src0->type == GGML_TYPE_F32) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true); + bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + + if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + ggml_cuda_mul_mat_vec_p021(src0, src1, dst); + } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + }else if (src0->type == GGML_TYPE_F32) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { - if (src1->ne[1] == 1) { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false); } else { - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false); } } else { GGML_ASSERT(false); } } +void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_scale, true, true); +} + +void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + GGML_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + + const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, cudaStream_main); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, cudaStream_main); + } else { + GGML_ASSERT(false); + } + + CUDA_CHECK(cudaDeviceSynchronize()); + + (void) dst; +} + +void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); +} + +void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_soft_max, true, true); +} + void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, false); // FIXME flatten changes results } void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -1718,10 +2236,9 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const size_t nb1 = tensor->nb[1]; ggml_backend backend = tensor->backend; struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); for (int id = 0; id < g_device_count; ++id) { - extra->data_device[id] = nullptr; - if (backend == GGML_BACKEND_GPU && id != g_main_device) { continue; } @@ -1734,10 +2251,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { row_high = nrows; } else if (backend == GGML_BACKEND_GPU_SPLIT) { row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_DMMV_Y; row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_DMMV_Y; - GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); } else { GGML_ASSERT(false); } @@ -1781,45 +2295,76 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) { - ggml_cuda_assign_buffers(tensor); +void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) { + if (scratch && g_scratch_size == 0) { + return; } - const size_t size = ggml_nbytes(tensor); - GGML_ASSERT(size <= g_scratch_size); - if (g_scratch_offset + size > g_scratch_size) { - g_scratch_offset = 0; + // recursively assign CUDA buffers until a compute tensor is found + if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src0->op; + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { + ggml_cuda_assign_buffers_impl(tensor->src0, scratch); + } + } + if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src1, scratch); } tensor->backend = GGML_BACKEND_GPU; struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data; + const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) || + tensor->op == GGML_OP_VIEW; + const size_t size = ggml_nbytes(tensor); CUDA_CHECK(cudaSetDevice(g_main_device)); if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) { struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra; - extra->data_device[g_main_device] = src0_extra->data_device; - GGML_ASSERT(false); - } else { + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&offset, tensor->opt[0]->data, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + offset; + } else if (tensor->op == GGML_OP_CPY) { + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra; + void * src1_ddv = src1_extra->data_device[g_main_device]; + extra->data_device[g_main_device] = src1_ddv; + } else if (scratch) { + GGML_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } + char * data = (char *) g_scratch_buffer; if (data == nullptr) { CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); g_scratch_buffer = data; } extra->data_device[g_main_device] = data + g_scratch_offset; + + g_scratch_offset += size; + + GGML_ASSERT(g_scratch_offset <= g_scratch_size); + } else { // allocate new buffers outside of scratch + void * data; + CUDA_CHECK(cudaMalloc(&data, size)); + CUDA_CHECK(cudaMemset(data, 0, size)); + extra->data_device[g_main_device] = data; } - // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]); - g_scratch_offset += size; - // fprintf(stderr, "%s: scratch %d, %p - %p\n", - // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size); - - GGML_ASSERT(g_scratch_offset <= g_scratch_size); tensor->extra = extra; } +void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true); +} + +void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false); +} + void ggml_cuda_set_main_device(int main_device) { if (main_device > g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", @@ -1838,6 +2383,15 @@ void ggml_cuda_set_scratch_size(size_t scratch_size) { g_scratch_size = scratch_size; } +void ggml_cuda_free_scratch() { + if (g_scratch_buffer == nullptr) { + return; + } + + CUDA_CHECK(cudaFree(g_scratch_buffer)); + g_scratch_buffer = nullptr; +} + bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU @@ -1875,12 +2429,39 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_mul_mat; break; + case GGML_OP_SCALE: + if (!any_on_device) { + return false; + } + func = ggml_cuda_scale; + break; + case GGML_OP_CPY: + if (!any_on_device) { + return false; + } + func = ggml_cuda_cpy; + break; case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: if (!any_on_device) { return false; } func = ggml_cuda_nop; break; + case GGML_OP_DIAG_MASK_INF: + if (!any_on_device) { + return false; + } + func = ggml_cuda_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + if (!any_on_device) { + return false; + } + func = ggml_cuda_soft_max; + break; case GGML_OP_ROPE: if (!any_on_device) { return false; diff --git a/ggml-cuda.h b/ggml-cuda.h index fde6d4085..d32b44842 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -28,8 +28,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); void ggml_cuda_free_data(struct ggml_tensor * tensor); void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); +void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); void ggml_cuda_set_main_device(int main_device); void ggml_cuda_set_scratch_size(size_t scratch_size); +void ggml_cuda_free_scratch(void); bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); #ifdef __cplusplus diff --git a/ggml.c b/ggml.c index 32c191307..c0efa1977 100644 --- a/ggml.c +++ b/ggml.c @@ -3939,6 +3939,12 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } +bool ggml_is_permuted(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); diff --git a/ggml.h b/ggml.h index f2a91761b..9b0c846f8 100644 --- a/ggml.h +++ b/ggml.h @@ -485,6 +485,7 @@ extern "C" { GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); diff --git a/llama.cpp b/llama.cpp index d2a52bb0c..b8bc0d821 100644 --- a/llama.cpp +++ b/llama.cpp @@ -165,6 +165,11 @@ struct llama_kv_cache { if (ctx) { ggml_free(ctx); } + +#ifdef GGML_USE_CUBLAS + ggml_cuda_free_data(k); + ggml_cuda_free_data(v); +#endif // GGML_USE_CUBLAS } }; @@ -210,6 +215,7 @@ struct llama_model { for (size_t i = 0; i < tensors_by_name.size(); ++i) { ggml_cuda_free_data(tensors_by_name[i].second); } + ggml_cuda_free_scratch(); #elif defined(GGML_USE_CLBLAST) for (size_t i = 0; i < tensors_by_name.size(); ++i) { ggml_cl_free_data(tensors_by_name[i].second); @@ -867,7 +873,8 @@ static bool kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, ggml_type wtype, - int n_ctx) { + int n_ctx, + int n_gpu_layers) { const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -893,6 +900,15 @@ static bool kv_cache_init( ggml_set_name(cache.k, "cache_k"); ggml_set_name(cache.v, "cache_v"); +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer + 1) { + ggml_cuda_assign_buffers_no_scratch(cache.v); + } + if (n_gpu_layers > n_layer + 2) { + ggml_cuda_assign_buffers_no_scratch(cache.k); + } +#endif // GGML_USE_CUBLAS + return true; } @@ -903,6 +919,7 @@ struct llama_context_params llama_context_default_params() { /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.low_vram =*/ false, /*.seed =*/ -1, /*.f16_kv =*/ true, /*.logits_all =*/ false, @@ -1011,6 +1028,7 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + bool low_vram, ggml_type memory_type, bool use_mmap, bool use_mlock, @@ -1137,18 +1155,34 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU); // "output" tensor { + ggml_backend backend_norm; ggml_backend backend_output; if (n_gpu_layers > int(n_layer)) { // NOLINT + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; } else { + backend_norm = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU; } + model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm); model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } } const int i_gpu_start = n_layer - n_gpu_layers; @@ -1208,22 +1242,47 @@ static void llama_model_load_internal( (void) vram_scratch; (void) n_batch; #ifdef GGML_USE_CUBLAS - vram_scratch = n_batch * MB; - ggml_cuda_set_scratch_size(vram_scratch); - if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", - __func__, vram_scratch / MB); + if (low_vram) { + fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__); + ggml_cuda_set_scratch_size(0); // disable scratch + } else { + vram_scratch = n_batch * MB; + ggml_cuda_set_scratch_size(vram_scratch); + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", + __func__, vram_scratch / MB); + } } #endif // GGML_USE_CUBLAS #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu); + fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); if (n_gpu_layers > (int) hparams.n_layer) { - fprintf(stderr, "%s: offloading output layer to GPU\n", __func__); + fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__); } + size_t vram_kv_cache = 0; + if (n_gpu_layers > (int) hparams.n_layer + 1) { + if (low_vram) { + fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); + } else { + fprintf(stderr, "%s: offloading v cache to GPU\n", __func__); + vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; + } + } + if (n_gpu_layers > (int) hparams.n_layer + 2) { + if (low_vram) { + fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); + } else { + fprintf(stderr, "%s: offloading k cache to GPU\n", __func__); + vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; + } + } + const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3; + fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n", + __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3); fprintf(stderr, "%s: total VRAM used: %zu MB\n", - __func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up + __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up #else (void) n_gpu_layers; #endif @@ -1262,6 +1321,7 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, float * tensor_split, + bool low_vram, ggml_type memory_type, bool use_mmap, bool use_mlock, @@ -1269,7 +1329,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type, + llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1345,12 +1405,33 @@ static bool llama_eval_internal( const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + // + // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal + // in that case ggml_cuda_assign_buffers has no effect + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers; + } +#endif // GGML_USE_CUBLAS + for (int il = 0; il < n_layer; ++il) { offload_func_t offload_func = llama_nop; #ifdef GGML_USE_CUBLAS if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU + offload_func = ggml_cuda_assign_buffers; } #endif // GGML_USE_CUBLAS @@ -1373,31 +1454,42 @@ static bool llama_eval_internal( // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - // offload_func(tmpq); - ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - // offload_func(tmpk); + offload_func_kq(tmpk); ggml_set_name(tmpk, "tmpk"); + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0); + offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0); + offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [N, n_embd] V matrix - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); + + struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + offload_func_v(tmpv); + ggml_set_name(tmpv, "tmpv"); + + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N)); + offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + offload_func_kq(k); ggml_set_name(k, "k"); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + offload_func_v(v); ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! @@ -1409,6 +1501,7 @@ static bool llama_eval_internal( ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -1417,10 +1510,12 @@ static bool llama_eval_internal( ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); + offload_func_kq(K); ggml_set_name(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd/n_head) @@ -1429,14 +1524,17 @@ static bool llama_eval_internal( // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -1446,10 +1544,12 @@ static bool llama_eval_internal( n_ctx*ggml_element_size(kv_self.v), n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, il*n_ctx*ggml_element_size(kv_self.v)*n_embd); + offload_func_v(V); ggml_set_name(V, "V"); #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); ggml_set_name(KQV, "KQV"); #else // make V contiguous in memory to speed up the matmul, however we waste time on the copy @@ -1461,12 +1561,14 @@ static bool llama_eval_internal( // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); // projection (no bias) @@ -1478,7 +1580,6 @@ static bool llama_eval_internal( } lctx.use_buf(ctx0, 1); - //ggml_cuda_set_scratch(1); struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); offload_func(inpFF); @@ -1536,32 +1637,24 @@ static bool llama_eval_internal( } lctx.use_buf(ctx0, 0); - //ggml_cuda_set_scratch(0); // used at the end to optionally extract the embeddings struct ggml_tensor * embeddings = NULL; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU - } -#endif // GGML_USE_CUBLAS // norm { cur = ggml_rms_norm(ctx0, inpL); - offload_func(cur); + offload_func_nr(cur); ggml_set_name(cur, "rms_norm_inpL"); cur = ggml_rms_norm(ctx0, cur); - offload_func(cur); + offload_func_nr(cur); ggml_set_name(cur, "rms_norm_after"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.norm); - offload_func(cur); + offload_func_nr(cur); ggml_set_name(cur, "result_norm"); embeddings = cur; @@ -2552,8 +2645,8 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock, + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, + params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); llama_free(ctx); @@ -2562,7 +2655,7 @@ struct llama_context * llama_init_from_file( // reserve memory for context buffers if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { + if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; diff --git a/llama.h b/llama.h index 61f6c867d..64292265c 100644 --- a/llama.h +++ b/llama.h @@ -77,6 +77,7 @@ extern "C" { int n_gpu_layers; // number of layers to store in VRAM int main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + bool low_vram; // if true, reduce VRAM usage at the cost of performance int seed; // RNG seed, -1 for random bool f16_kv; // use fp16 for KV cache From 6b8312e7979b852f6b6ac9d29cd51fda16c17948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 15 Jun 2023 19:06:46 +0200 Subject: [PATCH 05/21] Better error when using both LoRA + GPU layers (#1861) --- examples/common.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/common.cpp b/examples/common.cpp index dc69e5373..b47f06273 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -412,6 +412,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { gpt_print_usage(argc, argv, default_params); exit(1); } + +#ifdef GGML_USE_CUBLAS + if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) { + fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__); + exit(1); + } +#endif // GGML_USE_CUBLAS + if (escape_prompt) { process_escapes(params.prompt); } From 4bfcc855abdb2c9fcc3c5a84747974521909fa41 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 15 Jun 2023 20:29:48 +0300 Subject: [PATCH 06/21] metal : parallel command buffer encoding (#1860) * metal : parallel command buffer encoding * metal : determine number of command buffers based on gf->n_threads --- ggml-metal.h | 1 + ggml-metal.m | 917 ++++++++++++++++++++++++++------------------------- 2 files changed, 471 insertions(+), 447 deletions(-) diff --git a/ggml-metal.h b/ggml-metal.h index a9441a9d4..033c4d86a 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); // same as ggml_graph_compute but uses Metal +// creates gf->n_threads command buffers in parallel void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus diff --git a/ggml-metal.m b/ggml-metal.m index 658c392e0..0e9b56aa3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -284,528 +284,551 @@ void ggml_metal_get_tensor( void ggml_metal_graph_compute( struct ggml_metal_context * ctx, - struct ggml_cgraph * gf) { + struct ggml_cgraph * gf) { metal_printf("%s: evaluating graph\n", __func__); - size_t offs_src0 = 0; - size_t offs_src1 = 0; - size_t offs_dst = 0; + // create multiple command buffers and enqueue them + // then, we encode the graph into the command buffers in parallel - id command_buffer = [ctx->queue commandBuffer]; - id encoder = nil; + const int n_cb = gf->n_threads; - for (int i = 0; i < gf->n_nodes; ++i) { - //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; - struct ggml_tensor * dst = gf->nodes[i]; + for (int i = 0; i < n_cb; ++i) { + command_buffers[i] = [ctx->queue commandBuffer]; - const int64_t ne00 = src0 ? src0->ne[0] : 0; - const int64_t ne01 = src0 ? src0->ne[1] : 0; - const int64_t ne02 = src0 ? src0->ne[2] : 0; - const int64_t ne03 = src0 ? src0->ne[3] : 0; + // enqueue the command buffers in order to specify their execution order + [command_buffers[i] enqueue]; + } - const uint64_t nb00 = src0 ? src0->nb[0] : 0; - const uint64_t nb01 = src0 ? src0->nb[1] : 0; - const uint64_t nb02 = src0 ? src0->nb[2] : 0; - const uint64_t nb03 = src0 ? src0->nb[3] : 0; + // TODO: is this the best way to start threads? + dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); - const int64_t ne10 = src1 ? src1->ne[0] : 0; - const int64_t ne11 = src1 ? src1->ne[1] : 0; - const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); + for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { + const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb; - const uint64_t nb10 = src1 ? src1->nb[0] : 0; - const uint64_t nb11 = src1 ? src1->nb[1] : 0; - const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); + dispatch_async(queue, ^{ + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; - const int64_t ne0 = dst ? dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; + id command_buffer = command_buffers[cb_idx]; - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; + id encoder = nil; - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + const int node_start = (cb_idx + 0) * n_nodes_per_cb; + const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb; - id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; - id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; - id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; + for (int i = node_start; i < node_end; ++i) { + metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); - //if (src0) { - // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, - // ggml_is_contiguous(src0), src0->name); - //} - //if (src1) { - // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, - // ggml_is_contiguous(src1), src1->name); - //} - //if (dst) { - // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, - // dst->name); - //} + struct ggml_tensor * src0 = gf->nodes[i]->src0; + struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * dst = gf->nodes[i]; - switch (dst->op) { - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop - } break; - case GGML_OP_ADD: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; - [encoder setComputePipelineState:ctx->pipeline_add]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; - const int64_t n = ggml_nelements(dst); + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_MUL: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); - if (ggml_nelements(src1) == ne10) { - // src1 is a row - [encoder setComputePipelineState:ctx->pipeline_mul_row]; - } else { - [encoder setComputePipelineState:ctx->pipeline_mul]; - } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; - const int64_t n = ggml_nelements(dst); + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SCALE: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - const float scale = *(const float *) src1->data; + id id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil; + id id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil; + id id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil; - [encoder setComputePipelineState:ctx->pipeline_scale]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; + //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op)); + //if (src0) { + // metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, + // ggml_is_contiguous(src0), src0->name); + //} + //if (src1) { + // metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, + // ggml_is_contiguous(src1), src1->name); + //} + //if (dst) { + // metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, + // dst->name); + //} - const int64_t n = ggml_nelements(dst); + switch (dst->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SILU: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + [encoder setComputePipelineState:ctx->pipeline_add]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setComputePipelineState:ctx->pipeline_silu]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + const int64_t n = ggml_nelements(dst); - const int64_t n = ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_RELU: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + if (ggml_nelements(src1) == ne10) { + // src1 is a row + [encoder setComputePipelineState:ctx->pipeline_mul_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_mul]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setComputePipelineState:ctx->pipeline_relu]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + const int64_t n = ggml_nelements(dst); - const int64_t n = ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SCALE: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_GELU: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + const float scale = *(const float *) src1->data; - [encoder setComputePipelineState:ctx->pipeline_gelu]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setComputePipelineState:ctx->pipeline_scale]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_SOFT_MAX: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SILU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - const int nth = 32; + [encoder setComputePipelineState:ctx->pipeline_silu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setComputePipelineState:ctx->pipeline_soft_max]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + const int64_t n = ggml_nelements(dst); - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_DIAG_MASK_INF: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_RELU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - const int n_past = ((int32_t *)(src1->data))[0]; + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; - [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; + const int64_t n = ggml_nelements(dst); - [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_MUL_MAT: - { - // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_GELU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - GGML_ASSERT(ne00 == ne10); - GGML_ASSERT(ne02 == ne12); + [encoder setComputePipelineState:ctx->pipeline_gelu]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { + const int64_t n = ggml_nelements(dst); - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; - } + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; - MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; + const int nth = 32; - // for F32 x F32 we use MPS - MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; - MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_DIAG_MASK_INF: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - MPSMatrixDescriptor * desc = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; + const int n_past = ((int32_t *)(src1->data))[0]; - MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] - initWithDevice:ctx->device transposeLeft:false transposeRight:true - resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; + [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&n_past length:sizeof(int) atIndex:4]; - // we need to do ne02 multiplications - // TODO: is there a way to do this in parallel - currently very slow .. - // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS - for (int64_t i02 = 0; i02 < ne02; ++i02) { - size_t offs_src0_cur = offs_src0 + i02*nb02; - size_t offs_src1_cur = offs_src1 + i02*nb12; - size_t offs_dst_cur = offs_dst + i02*nb2; + [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_MUL_MAT: + { + // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 - MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0]; - MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1]; - MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ]; + GGML_ASSERT(ne00 == ne10); + GGML_ASSERT(ne02 == ne12); - [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; - } - } else { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { - int nth0 = 32; - int nth1 = 1; - - // use custom matrix x vector kernel - switch (src0t) { - case GGML_TYPE_F16: - { - GGML_ASSERT(ne02 == ne12); - - nth0 = 64; - nth1 = 1; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; - } break; - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 8; - nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 8; - nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32]; - } break; - case GGML_TYPE_Q2_K: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 4; - nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32]; - } break; - case GGML_TYPE_Q3_K: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 4; - nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32]; - } break; - case GGML_TYPE_Q4_K: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 4; - nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32]; - } break; - case GGML_TYPE_Q5_K: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 4; - nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32]; - } break; - case GGML_TYPE_Q6_K: - { - GGML_ASSERT(ne02 == 1); - GGML_ASSERT(ne12 == 1); - - nth0 = 4; - nth1 = 16; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32]; - } break; - default: - { - fprintf(stderr, "Asserting on type %d\n",(int)src0t); - GGML_ASSERT(false && "not implemented"); + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; } - }; + MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; + MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; - [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; - [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6]; - [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8]; - [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; + // for F32 x F32 we use MPS + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { - [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q2_K || - src0t == GGML_TYPE_Q3_K || - src0t == GGML_TYPE_Q4_K || - src0t == GGML_TYPE_Q5_K || - src0t == GGML_TYPE_Q6_K) { - [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } else { - [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - } - } break; - case GGML_OP_GET_ROWS: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; - switch (src0->type) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; - case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; - case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; - case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break; - case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break; - case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break; - case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break; - case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break; - default: GGML_ASSERT(false && "not implemented"); - } + MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; - [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5]; + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] + initWithDevice:ctx->device transposeLeft:false transposeRight:true + resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; - const int64_t n = ggml_nelements(src1); + // we need to do ne02 multiplications + // TODO: is there a way to do this in parallel - currently very slow .. + // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS + for (int64_t i02 = 0; i02 < ne02; ++i02) { + size_t offs_src0_cur = offs_src0 + i02*nb02; + size_t offs_src1_cur = offs_src1 + i02*nb12; + size_t offs_dst_cur = offs_dst + i02*nb2; - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_RMS_NORM: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ]; - const float eps = 1e-6f; + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + } + } else { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } - const int nth = 256; + int nth0 = 32; + int nth1 = 1; - [encoder setComputePipelineState:ctx->pipeline_rms_norm]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; - [encoder setBytes:&eps length:sizeof( float) atIndex:4]; - [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + // use custom matrix x vector kernel + switch (src0t) { + case GGML_TYPE_F16: + { + GGML_ASSERT(ne02 == ne12); - const int64_t nrows = ggml_nrows(src0); + nth0 = 64; + nth1 = 1; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + } break; + case GGML_TYPE_Q4_0: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - case GGML_OP_ROPE: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; + } break; + case GGML_TYPE_Q4_1: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + nth0 = 8; + nth1 = 8; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32]; + } break; + case GGML_TYPE_Q2_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - const int n_past = ((int32_t *)(src1->data))[0]; + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32]; + } break; + case GGML_TYPE_Q3_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - [encoder setComputePipelineState:ctx->pipeline_rope]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32]; + } break; + case GGML_TYPE_Q4_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - } break; - case GGML_OP_CPY: - { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoder]; - } + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32]; + } break; + case GGML_TYPE_Q5_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - const int nth = 32; + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32]; + } break; + case GGML_TYPE_Q6_K: + { + GGML_ASSERT(ne02 == 1); + GGML_ASSERT(ne12 == 1); - switch (src0t) { - case GGML_TYPE_F32: - { - switch (dstt) { - case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; - default: GGML_ASSERT(false && "not implemented"); + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32]; + } break; + default: + { + fprintf(stderr, "Asserting on type %d\n",(int)src0t); + GGML_ASSERT(false && "not implemented"); + } }; - } break; - default: GGML_ASSERT(false && "not implemented"); - } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; - default: - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); - } + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { + [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_Q3_K || + src0t == GGML_TYPE_Q4_K || + src0t == GGML_TYPE_Q5_K || + src0t == GGML_TYPE_Q6_K) { + [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else { + [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + } + } break; + case GGML_OP_GET_ROWS: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + switch (src0->type) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4]; + [encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5]; + + const int64_t n = ggml_nelements(src1); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_RMS_NORM: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + const float eps = 1e-6f; + + const int nth = 256; + + [encoder setComputePipelineState:ctx->pipeline_rms_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ROPE: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + const int n_past = ((int32_t *)(src1->data))[0]; + + [encoder setComputePipelineState:ctx->pipeline_rope]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_CPY: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + const int nth = 32; + + switch (src0t) { + case GGML_TYPE_F32: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; + default: GGML_ASSERT(false && "not implemented"); + } + + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } + + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + [command_buffer commit]; + }); } - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; - } + // wait for all threads to finish + dispatch_barrier_sync(queue, ^{}); - [command_buffer commit]; - [command_buffer waitUntilCompleted]; - - { - const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; - UNUSED(time_elapsed); - - metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0); - } + [command_buffers[n_cb - 1] waitUntilCompleted]; } From 64cc19b4fe3df03bc20e520aa111c30cff3a655e Mon Sep 17 00:00:00 2001 From: Howard Su Date: Fri, 16 Jun 2023 01:29:59 +0800 Subject: [PATCH 07/21] Fix the validation of main device (#1872) --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0565571f4..0873e3e51 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2366,7 +2366,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { } void ggml_cuda_set_main_device(int main_device) { - if (main_device > g_device_count) { + if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); return; From 37e257c48e350cf03c353c10d31e777f8d00123d Mon Sep 17 00:00:00 2001 From: sandyiscool Date: Thu, 15 Jun 2023 23:06:06 +0530 Subject: [PATCH 08/21] make : clean *.so files (#1857) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9a08d610b..66509cc33 100644 --- a/Makefile +++ b/Makefile @@ -259,7 +259,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h + rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h # # Examples From 9dda13e5e1f70bdfc25fbc0f0378f27c8b67e983 Mon Sep 17 00:00:00 2001 From: Srinivas Billa Date: Thu, 15 Jun 2023 18:36:38 +0100 Subject: [PATCH 09/21] readme : server compile flag (#1874) Explicitly include the server make instructions for C++ noobsl like me ;) --- examples/server/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index 7dabac9cf..3b111655a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -16,6 +16,10 @@ This example allow you to have a llama.cpp http server to interact from a web pa To get started right away, run the following command, making sure to use the correct path for the model you have: #### Unix-based systems (Linux, macOS, etc.): +Make sure to build with the server option on +```bash +LLAMA_BUILD_SERVER=1 make +``` ```bash ./server -m models/7B/ggml-model.bin --ctx_size 2048 From cf267d1c71a781700698f8518e903239c3bcc929 Mon Sep 17 00:00:00 2001 From: daboe01 Date: Thu, 15 Jun 2023 19:42:48 +0200 Subject: [PATCH 10/21] make : add train-text-from-scratch (#1850) * make finetuning example accessible * fixed: targed was in wrong line * fixed: name of executable was wrong * fixed: naming of binary * fixed: model path was wrong * fixed clean target * Update examples/train-text-from-scratch/README.md --------- Co-authored-by: Georgi Gerganov --- .gitignore | 1 + Makefile | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9b6905ed4..4b0422cd9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ models/* /result /perplexity /embedding +/train-text-from-scratch /benchmark-matmult /vdot /Pipfile diff --git a/Makefile b/Makefile index 66509cc33..09c8834f5 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -259,7 +259,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h + rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h # # Examples @@ -289,6 +289,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml. server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) +train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! cmp -s $@.tmp $@; then \ From 69b34a0e80300bfb3e996983ac3ea075f5526675 Mon Sep 17 00:00:00 2001 From: Frederik Vogel Date: Fri, 16 Jun 2023 02:47:04 +0900 Subject: [PATCH 11/21] swift : Package compile breaks due to ggml-metal.metal (#1831) * Ignore metal file in spm * Add ggml.h to spm public Headers --------- Co-authored-by: Vogel Frederik --- Package.swift | 1 + spm-headers/ggml.h | 1 + 2 files changed, 2 insertions(+) create mode 120000 spm-headers/ggml.h diff --git a/Package.swift b/Package.swift index 2c2c147ba..73d027c70 100644 --- a/Package.swift +++ b/Package.swift @@ -11,6 +11,7 @@ let package = Package( .target( name: "llama", path: ".", + exclude: ["ggml-metal.metal"], sources: ["ggml.c", "llama.cpp"], publicHeadersPath: "spm-headers", cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")], diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h new file mode 120000 index 000000000..39215298f --- /dev/null +++ b/spm-headers/ggml.h @@ -0,0 +1 @@ +../ggml.h \ No newline at end of file From 3559433fecedf365e7aba2fe3d5f89d9abb817c1 Mon Sep 17 00:00:00 2001 From: Igor Okulist Date: Thu, 15 Jun 2023 12:51:26 -0500 Subject: [PATCH 12/21] cmake : set include path for OpenBlas (#1830) --- CMakeLists.txt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 19cd42dd2..de01e55ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,12 +163,24 @@ if (LLAMA_BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. + # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + ) + + + message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) - message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}") - include_directories(${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -408,7 +420,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_EXTRA} ) -target_include_directories(ggml PUBLIC .) +target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) From c36e81da62ebfe09a768201cc44fa8d712dd00ed Mon Sep 17 00:00:00 2001 From: yangli2 Date: Thu, 15 Jun 2023 11:05:53 -0700 Subject: [PATCH 13/21] examples : add chat-vicuna.sh (#1854) Co-authored-by: Yang Li --- examples/chat-vicuna.sh | 41 +++++++++++++++++++++++++++++++++++++++++ llama.h | 6 +++--- 2 files changed, 44 insertions(+), 3 deletions(-) create mode 100755 examples/chat-vicuna.sh diff --git a/examples/chat-vicuna.sh b/examples/chat-vicuna.sh new file mode 100755 index 000000000..8c7b7bef4 --- /dev/null +++ b/examples/chat-vicuna.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -e + +cd "$(dirname "$0")/.." || exit + +MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}" +PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt} +USER_NAME="### Human" +AI_NAME="### Assistant" + +# Adjust to the number of CPU cores you want to use. +N_THREAD="${N_THREAD:-8}" +# Number of tokens to predict (made it larger than default because we want a long interaction) +N_PREDICTS="${N_PREDICTS:-2048}" + +# Note: you can also override the generation options by specifying them on the command line: +# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 +GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" + +DATE_TIME=$(date +%H:%M) +DATE_YEAR=$(date +%Y) + +PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt) + +sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ + -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \ + -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \ + -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \ + $PROMPT_TEMPLATE > $PROMPT_FILE + +# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS +./bin/main $GEN_OPTIONS \ + --model "$MODEL" \ + --threads "$N_THREAD" \ + --n_predict "$N_PREDICTS" \ + --color --interactive \ + --file ${PROMPT_FILE} \ + --reverse-prompt "### Human:" \ + --in-prefix ' ' \ + "$@" diff --git a/llama.h b/llama.h index 64292265c..1241ba6c0 100644 --- a/llama.h +++ b/llama.h @@ -244,9 +244,9 @@ extern "C" { LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); // Special tokens - LLAMA_API llama_token llama_token_bos(); - LLAMA_API llama_token llama_token_eos(); - LLAMA_API llama_token llama_token_nl(); + LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(); // end-of-sentence + LLAMA_API llama_token llama_token_nl(); // next-line // Sampling functions From bed92756172d4514b23aaf9744cf8e2dc892fc7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 15 Jun 2023 21:56:50 +0300 Subject: [PATCH 14/21] cmake : remove whitespaces --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de01e55ec..ea9f80b80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,7 +173,6 @@ if (LLAMA_BLAS) /usr/include/openblas ) - message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) From a09f9195be39afb4b023b646c0a6ec8a86915174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 15 Jun 2023 21:49:08 +0200 Subject: [PATCH 15/21] Fixed CUDA runtime version check (#1879) --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0873e3e51..bd89d0a1f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } \ } while (0) -#if CUDART_VERSION >= 12 +#if CUDART_VERSION >= 12000 #define CUBLAS_CHECK(err) \ do { \ cublasStatus_t err_ = (err); \ From 602c748863e15270d80d74aa2c3bf86ab8139e07 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Fri, 16 Jun 2023 09:58:11 +0300 Subject: [PATCH 16/21] gitignore : add several entries specific to Visual Studio (#1888) --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 4b0422cd9..b3ff6526c 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ build-metal/ build-no-accel/ build-sanitize-addr/ build-sanitize-thread/ +out/ models/* *.bin @@ -41,6 +42,7 @@ models/* build-info.h arm_neon.h compile_commands.json +CMakeSettings.json __pycache__ From 3d0112261042b356621e93db3fa4c6798a5d098f Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 16 Jun 2023 20:08:44 +0300 Subject: [PATCH 17/21] CUDA : faster k-quant dot kernels (#1862) * cuda : faster k-quant dot kernels * Imrove Q2_K dot kernel on older GPUs We now have a K_QUANTS_PER_ITERATION macro, which should be set to 1 on older and to 2 on newer GPUs. With this, we preserve the performance of the original PR on RTX-4080, and are faster compared to master on GTX-1660. * Imrove Q6_K dot kernel on older GPUs Using the same K_QUANTS_PER_ITERATION macro as last commit, we preserve performance on RTX-4080 and speed up Q6_K on a GTX-1660. * Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile Allowed values are 1 or 2. 2 gives the best performance on modern GPUs and is set as default. On older GPUs 1 may work better. * PR comments --------- Co-authored-by: Iwan Kawrakow --- CMakeLists.txt | 2 + Makefile | 5 + ggml-cuda.cu | 599 +++++++++++++++++++++++++++++++------------------ 3 files changed, 385 insertions(+), 221 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea9f80b80..dbbc0b5d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") +set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) @@ -201,6 +202,7 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) if (LLAMA_STATIC) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) diff --git a/Makefile b/Makefile index 09c8834f5..b24caf8dd 100644 --- a/Makefile +++ b/Makefile @@ -171,6 +171,11 @@ ifdef LLAMA_CUDA_DMMV_Y else NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1 endif # LLAMA_CUDA_DMMV_Y +ifdef LLAMA_CUDA_KQUANTS_ITER + NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) +else + NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ endif # LLAMA_CUBLAS diff --git a/ggml-cuda.cu b/ggml-cuda.cu index bd89d0a1f..7edd1a9f8 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -167,6 +167,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define GGML_CUDA_DMMV_Y 1 #endif +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -326,37 +332,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { } -static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q2_K * x = (const block_q2_K *) vx; - - // if n is 0, we want to do the lower 128, else the upper 128, - // covering y[l+0], y[l+32], y[l+64], y[l+96] and - // y[l+16], y[l+48], y[l+80], y[l+112] - int n = iqs/128; // 0 or 1 - int r = iqs - 128*n; // 0...120 in steps of 8 - int l = r/8; // 0...15 in steps of 1 - - const float * y = yy + 128*n + l; - const uint8_t * q = x[ib].qs + 32*n + l; - const uint8_t * s = x[ib].scales + 8*n; - - const float dall = x[ib].d; - const float dmin = x[ib].dmin; - - float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4)) - + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4)) - + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4)) - + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4)) - + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4)) - + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4)) - + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4)) - + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4)); - - result = sum; - -} - static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { int r = threadIdx.x/4; @@ -388,51 +363,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { } -static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q3_K * x = (const block_q3_K *) vx; - - const uint32_t kmask1 = 0x03030303; - const uint32_t kmask2 = 0x0f0f0f0f; - - uint32_t aux[3]; - uint32_t utmp[4]; - - // if n is 0, we want to do the lower 128, else the upper 128, - // covering y[l+0], y[l+32], y[l+64], y[l+96] and - // y[l+16], y[l+48], y[l+80], y[l+112] - int n = iqs/128; // 0 or 1 - int r = iqs - 128*n; // 0...120 in steps of 8 - int l = r/8; // 0...15 in steps of 1 - - const float * y = yy + 128*n + l; - const uint8_t * q = x[ib].qs + 32*n + l; - const uint8_t * hm = x[ib].hmask + l; - const int8_t * s = (const int8_t *)utmp + 8*n; - - memcpy(aux, x[ib].scales, 12); - utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); - utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); - utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); - utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); - - const float dall = x[ib].d; - - const uint8_t m = 1 << (4*n); - - float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4)) - + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4)) - + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4)) - + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4)) - + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4)) - + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4)) - + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4)) - + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4)); - - result = sum * dall; - -} - static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { d = q[j] & 63; m = q[j + 4] & 63; @@ -479,38 +409,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { } } -static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q4_K * x = (const block_q4_K *) vx; - - // iqs is in 0...248 in steps of 8 => - const int j = iqs / 64; // j is in 0...3 - const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 - const int is = 2*j; // is is in 0...6 in steps of 2 - - const float * y = yy + 64*j + ir; - const uint8_t * q = x[ib].qs + 32*j + ir; - - const float dall = x[ib].d; - const float dmin = x[ib].dmin; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[ib].scales, sc, m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[ib].scales, sc, m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - float sum = 0; - for (int k = 0; k < 4; ++k) { - sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1); - sum += y[k + 32] * (d2 * (q[k] >> 4) - m2); - } - result = sum; - -} - static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { const block_q5_K * x = (const block_q5_K *) vx; @@ -544,43 +442,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; } -static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { - - const block_q5_K * x = (const block_q5_K *) vx; - - // iqs is in 0...248 in steps of 8 => - const int j = iqs / 64; // j is in 0...3 - const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4 - const int is = 2*j; // is is in 0...6 in steps of 2 - - const float * y = yy + 64*j + ir; - const uint8_t * ql = x[ib].qs + 32*j + ir; - const uint8_t * qh = x[ib].qh + ir; - - const float dall = x[ib].d; - const float dmin = x[ib].dmin; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[ib].scales, sc, m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[ib].scales, sc, m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - uint8_t hm = 1 << is; - float sum = 0; - for (int k = 0; k < 4; ++k) { - sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1); - } - hm <<= 1; - for (int k = 0; k < 4; ++k) { - sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2); - } - result = sum; - -} - static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { const block_q6_K * x = (const block_q6_K *) vx; @@ -606,31 +467,376 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); } -static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) { +static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { - const block_q6_K * x = (const block_q6_K *) vx; + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int ip = iqs / 128; // 0 or 1 - const int il = (iqs - 128*ip)/8; // 0...15 - const int is = 8*ip; + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; - const float * y = yy + 128*ip + il; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; - const float d = x[ib].d; + const block_q2_K * x = (const block_q2_K *)vx + ib0; - const uint8_t * ql = x[ib].ql + 64*ip + il; - const uint8_t * qh = x[ib].qh + 32*ip + il; - const int8_t * sc = x[ib].scales + is; + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 - result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32) - + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32) - + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32) - + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32) - + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32) - + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32) - + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32) - + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32); + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...14 in steps of 4 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].d; + const float dmin = x[i].dmin; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) { + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; // 0, 1 + + const int n = 2; // iterations in the inner loop + const int im = tid/8; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - 8*im; // 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...28 in steps of 4 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 4; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * q1 = x[i].qs + q_offset; + const uint8_t * q2 = q1 + 64; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].d; + const float dmin = x[i].dmin; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < n; ++l) { + s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); + s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { + + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + //const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 4; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * ql2 = ql1 + 64; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].d; + const float dmin = x[i].dmin; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0)); + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + + // sum up partial sums and write back result + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + if (tid == 0) { + dst[row] = tmp; + } } static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){ @@ -712,46 +918,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, } } -template -static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) { - const int row = blockIdx.y*blockDim.y + threadIdx.y; - - if (row >= nrows) { - return; - } - - const int tid = threadIdx.x; - - const int iter_stride = QK_K; - const int vals_per_iter = iter_stride / n_thread; - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - float tmp = 0; // partial sum for thread in warp - - for (int i = 0; i < ncols; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = ib0 + col/QK_K; // x block index - const int iqs = col%QK_K; // x quant index - const int iybs = col - col%QK_K; // y block start index - - float v; - dot_kernel(vx, ib, iqs, y + iybs, v); - tmp += v; - } - - // sum up partial sums and write back result - __syncthreads(); -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); - } - - if (tid == 0) { - dst[row] = tmp; - } -} - static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) { const half * x = (half *) vx; @@ -1094,43 +1260,34 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<>>(vx, y, dst, ncols, nrows); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<>>(vx, y, dst, ncols, nrows); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(1, block_num_y, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<>>(vx, y, dst, ncols, nrows); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); } static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; + const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { From 9cbf50c041a525d781c7764f493a5443924e4e38 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Fri, 16 Jun 2023 21:23:53 +0300 Subject: [PATCH 18/21] build : fix and ignore MSVC warnings (#1889) --- examples/baby-llama/baby-llama.cpp | 6 +++- examples/benchmark/benchmark-matmult.cpp | 10 ++++-- examples/common.cpp | 6 +++- examples/embedding/embedding.cpp | 4 +++ examples/main/main.cpp | 6 +++- examples/perplexity/perplexity.cpp | 4 +++ examples/quantize-stats/quantize-stats.cpp | 4 +++ examples/save-load-state/save-load-state.cpp | 2 +- .../train-text-from-scratch.cpp | 18 ++++++----- ggml.c | 6 ++++ llama.cpp | 4 +++ pocs/vdot/vdot.cpp | 4 +++ tests/test-quantize-fns.cpp | 13 +++++--- tests/test-quantize-perf.cpp | 4 +++ tests/test-sampling.cpp | 32 +++++++++---------- tests/test-tokenizer-0.cpp | 2 +- 16 files changed, 88 insertions(+), 37 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 0add6adc0..50e14c4ac 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -4,6 +4,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + float frand() { return (float)rand()/(float)RAND_MAX; } @@ -1470,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te } struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - const float eps = 1e-3; + const float eps = 1e-3f; return ggml_sum(ctx, ggml_neg(ctx, diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 9f9ed9db0..39d15caeb 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -16,6 +16,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) { } void tensor_dump(const ggml_tensor * tensor, const char * name) { - printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name, + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name, tensor->type, ggml_type_name(tensor->type), - (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); float sum = tensor_sum_elements(tensor); printf("Sum of tensor %s is %6.2f\n", name, sum); } @@ -120,7 +124,7 @@ int main(int argc, char ** argv) { ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += 1024*1024*16; - printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, diff --git a/examples/common.cpp b/examples/common.cpp index b47f06273..055383bef 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -28,6 +28,10 @@ #include #endif +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -373,7 +377,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else { throw std::exception(); } - } catch (const std::exception &e) { + } catch (const std::exception&) { invalid_param = true; break; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 03603b10f..860f99f67 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -4,6 +4,10 @@ #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + int main(int argc, char ** argv) { gpt_params params; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index efa913e16..ef9e75fab 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -28,6 +28,10 @@ #include #endif +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + static console_state con_st; static llama_context ** g_ctx; @@ -348,7 +352,7 @@ int main(int argc, char ** argv) { if ((int)embd.size() > max_embd_size) { auto skipped_tokens = embd.size() - max_embd_size; console_set_color(con_st, CONSOLE_COLOR_ERROR); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console_set_color(con_st, CONSOLE_COLOR_DEFAULT); fflush(stdout); embd.resize(max_embd_size); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index e19c6825f..ae8cfe0af 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -5,6 +5,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6e4f7e1e0..6b8018ee2 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -19,6 +19,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; bool verbose = false; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 91f04b6c7..da4d37ad0 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -37,7 +37,7 @@ int main(int argc, char ** argv) { // init auto ctx = llama_init_from_file(params.model.c_str(), lparams); auto tokens = std::vector(params.n_ctx); - auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true); + auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); if (n_prompt_tokens < 1) { fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 51271b497..7ec85951a 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -12,6 +12,9 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif struct random_normal_distribution { std::mt19937 gen; @@ -20,7 +23,6 @@ struct random_normal_distribution { float max; }; - struct random_uniform_distribution { std::mt19937 gen; std::uniform_real_distribution rd; @@ -2366,7 +2368,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->write_u32(0); file->write_u32(0); file->write_u32(GGML_TYPE_F32); - file->seek(-file->tell() & 31, SEEK_CUR); + file->seek(0-file->tell() & 31, SEEK_CUR); return; } const char * name = ggml_get_name(tensor); @@ -2381,7 +2383,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { file->write_u32(tensor->type); file->write_raw(ne, sizeof(ne[0]) * nd); file->write_raw(name, name_len); - file->seek(-file->tell() & 31, SEEK_CUR); + file->seek(0-file->tell() & 31, SEEK_CUR); file->write_raw(tensor->data, ggml_nbytes(tensor)); } @@ -2402,7 +2404,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { std::string name = file->read_string(name_len); GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - file->seek(-file->tell() & 31, SEEK_CUR); + file->seek(0-file->tell() & 31, SEEK_CUR); file->read_raw(tensor->data, ggml_nbytes(tensor)); } @@ -2756,8 +2758,8 @@ struct train_params get_default_train_params() { params.lbfgs_n_iter = 16; params.adam_n_iter = 16; - params.adam_alpha = 1e-3; - params.adam_decay = 1e-3; + params.adam_alpha = 1e-3f; + params.adam_decay = 1e-3f; params.mem_model_gb = 2; params.mem_compute_gb = 24; @@ -3331,8 +3333,8 @@ int main(int argc, char ** argv) { int n_gen = params.n_predict; int sample_ctx = n_tokens - n_tokens/8; - sampler.params.temp = 0.2; - sampler.params.repeat_penalty = 1.1; + sampler.params.temp = 0.2f; + sampler.params.repeat_penalty = 1.1f; sampler.params.mirostat = 2; init_sampler(&sampler, lctx); diff --git a/ggml.c b/ggml.c index c0efa1977..0eda7f338 100644 --- a/ggml.c +++ b/ggml.c @@ -35,6 +35,12 @@ #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#if defined(_MSC_VER) +// disable "possible loss of data" to avoid hundreds of casts +// we should just be careful :) +#pragma warning(disable: 4244 4267) +#endif + #if defined(_WIN32) #include diff --git a/llama.cpp b/llama.cpp index b8bc0d821..a90438844 100644 --- a/llama.cpp +++ b/llama.cpp @@ -40,6 +40,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 26bf50c9a..7b18090d6 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -10,6 +10,10 @@ #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + constexpr int kVecSize = 1 << 18; float drawFromGaussianPdf(std::mt19937& rndm) { diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 728460b5e..c40f1b29c 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -9,12 +9,15 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif -const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001; -const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002; -const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075; -const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040; -const float MAX_DOT_PRODUCT_ERROR = 0.02; +const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; +const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; +const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; +const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; +const float MAX_DOT_PRODUCT_ERROR = 0.02f; const char* RESULT_STR[] = {"ok", "FAILED"}; diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index d5514455d..600375771 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -13,6 +13,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define MAX_ALIGNMENT 64 #define QK 32 #define WARMUP 5 diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 0e675127f..5d693f7b5 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -176,27 +176,27 @@ void test_frequency_presence_penalty( int main(void) { ggml_time_init(); - test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1); - test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); - test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0); - test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7); - test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); - test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25); - test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75); - test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); - test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5); - test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5); + test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); + test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); - test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0); - test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0); - test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0); + test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f); + test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); + test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f); - test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0); - test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0); - test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0); + test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f); + test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f); + test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f); printf("OK\n"); } diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index b08984571..ab1538a0c 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -53,7 +53,7 @@ int main(int argc, char **argv) { for (const auto & test_kv : k_tests()) { std::vector res(test_kv.first.size()); - const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true); + const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true); res.resize(n); bool correct = res.size() == test_kv.second.size(); From 5b9ccaf104cc1054d4f8f17bc8a4b8dc949e5527 Mon Sep 17 00:00:00 2001 From: FrankHB Date: Sat, 17 Jun 2023 02:25:01 +0800 Subject: [PATCH 19/21] Fixed possible macro redefinition (#1892) MinGW libstdc++ may define `NOMINMAX` unconditionally. This fixes the case when it is already defined. --- examples/main/main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ef9e75fab..a051fcbc5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -23,7 +23,9 @@ #include #elif defined (_WIN32) #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX #define NOMINMAX +#endif #include #include #endif From ac3b8869538c7fbdb48ff141d78c4dea091789f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 16 Jun 2023 20:25:51 +0200 Subject: [PATCH 20/21] llama : fix embd when offloading non-repeating layers (#1891) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a90438844..81f047ed2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1658,7 +1658,7 @@ static bool llama_eval_internal( // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.norm); - offload_func_nr(cur); + // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); embeddings = cur; From 13fe9d2d84f30cab613c960bf66ac83916006694 Mon Sep 17 00:00:00 2001 From: Zenix Date: Sat, 17 Jun 2023 03:53:04 +0900 Subject: [PATCH 21/21] cmake : add auto detection of BLAS_INCLUDE_DIRS (#1886) --- CMakeLists.txt | 56 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dbbc0b5d3..935fba838 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,23 +159,59 @@ if (LLAMA_BLAS) if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22) set(BLA_SIZEOF_INTEGER 8) endif() + set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) find_package(BLAS) + if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. - # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - ) + if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. + # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${LLAMA_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS REQUIRED blas) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS") + pkg_check_modules(DepBLAS REQUIRED openblas) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME") + pkg_check_modules(DepBLAS REQUIRED blis) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS REQUIRED blas-atlas) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS REQUIRED flexiblas_api) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") + # this doesn't provide pkg-config + # suggest to assign BLAS_INCLUDE_DIRS on your own + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Better to set NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") + endif() + endif() + if (DepBLAS_FOUND) + set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" + " detected by pkgconfig, trying to find cblas.h from possible paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() + endif() message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})