diff --git a/README.md b/README.md index 9675ce1e7..ec7b58943 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics -- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ +- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401) +- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \ **Devs should become familiar with the new API** - Local Falcon 180B inference on Mac Studio @@ -92,7 +93,8 @@ as the main playground for developing new features for the [ggml](https://github - [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft)) - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) -- [X] Mistral AI v0.1 +- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) +- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) **Bindings:** @@ -662,6 +664,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). +For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. + ### Instruction mode with Alpaca 1. First, download the `ggml` Alpaca model into the `./models` folder diff --git a/convert.py b/convert.py index 4ac5030db..8bb6c7e41 100755 --- a/convert.py +++ b/convert.py @@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: - n_head //= n_head_kv + n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2) .reshape(weights.shape)) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index b61165fb7..8ca1874da 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -626,7 +626,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - { + ggml_allocr_alloc(alloc, KQ_pos); + if (!ggml_allocr_is_measure(alloc)) { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { data[i] = n_past + i; @@ -786,6 +787,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); ggml_allocr_alloc(alloc, t36->grad); + // KQ_pos + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); // make sure base model tensors data cannot be used in viewable operations ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one)); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 5043f32d0..be693b3ac 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -334,7 +334,8 @@ static struct ggml_tensor * llama_build_train_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - { + ggml_allocr_alloc(alloc, KQ_pos); + if (!ggml_allocr_is_measure(alloc)) { int * data = (int *) KQ_pos->data; for (int i = 0; i < N; ++i) { data[i] = n_past + i; diff --git a/ggml.c b/ggml.c index 3dda7547f..820fe2e74 100644 --- a/ggml.c +++ b/ggml.c @@ -11583,6 +11583,7 @@ static void ggml_compute_forward_mul_mat( struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; diff --git a/llama.cpp b/llama.cpp index f7bd2ed9d..b78abee76 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6491,7 +6491,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - llama_model_loader ml(fname_inp, /*use_mmap*/ false); + // mmap consistently increases speed Linux, and also increases speed on Windows with + // hot cache. It may cause a slowdown on macOS, possibly related to free memory. +#if defined(__linux__) || defined(_WIN32) + constexpr bool use_mmap = true; +#else + constexpr bool use_mmap = false; +#endif + + llama_model_loader ml(fname_inp, use_mmap); + if (ml.use_mmap) { + ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa())); + } llama_model model; llm_load_arch(ml, model); @@ -6569,10 +6580,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(tensor); - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", diff --git a/llama.h b/llama.h index 96ff1f09c..fde4d6eca 100644 --- a/llama.h +++ b/llama.h @@ -167,18 +167,18 @@ extern "C" { struct llama_context_params { uint32_t seed; // RNG seed, -1 for random - uint32_t n_ctx; // text context - uint32_t n_batch; // prompt processing batch size + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model // Keep the booleans together to avoid misalignment during copy-by-value. bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache + bool f16_kv; // use fp16 for KV cache, fp32 otherwise bool logits_all; // the llama_eval() call computes all logits, not just the last one bool embedding; // embedding mode only };