From 569550df20c1ede59ff195a6b6e900957ad84d16 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Fri, 29 Sep 2023 07:15:57 -0400
Subject: [PATCH 1/5] readme : add link to grammars app (#3388)

* Add link to grammars app per @ggerganov suggestion

Adding a sentence in the Grammars section of README to point to the grammar app, per
https://github.com/ggerganov/llama.cpp/discussions/2494#discussioncomment-7138211

* Update README.md
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 9675ce1e7..8cdfb04e0 100644
--- a/README.md
+++ b/README.md
@@ -662,6 +662,8 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+
 ### Instruction mode with Alpaca
 
 1. First, download the `ggml` Alpaca model into the `./models` folder

From 0a4a4a098261ddd26480371eaccfe90d1bf6488a Mon Sep 17 00:00:00 2001
From: BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
Date: Fri, 29 Sep 2023 08:50:35 -0400
Subject: [PATCH 2/5] readme : update hot topics + model links (#3399)

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8cdfb04e0..75b6075f2 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
+- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
   **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
 
@@ -92,7 +92,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
-- [X] Mistral AI v0.1
+- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 
 **Bindings:**
 

From 2777a84be429401a2b7d33c2b6a4ada1f0776f1b Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Fri, 29 Sep 2023 09:48:45 -0400
Subject: [PATCH 3/5] llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)

* llama : enable mmap in quantize on Linux -> 31% faster

* also enable mmap on Windows

---------

Co-authored-by: Georgi Gerganov
---
 llama.cpp | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 666acc212..bff17135b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
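The speedup in the patch above comes from mapping the input model instead of copying each tensor into `read_data`: pages are faulted in lazily and reused from the OS page cache on subsequent runs. As a rough, standalone illustration of the read-only mapping pattern that `llama_mmap` wraps on POSIX systems (not part of the patch; the file argument and minimal error handling are purely illustrative):

```cpp
// Illustrative sketch only: read-only mmap of a model file on POSIX systems.
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <model-file>\n", argv[0]);
        return 1;
    }

    const int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { std::perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { std::perror("fstat"); close(fd); return 1; }

    // Nothing is copied here: pages are faulted in lazily as the data is first touched,
    // and stay in the OS page cache across runs ("hot cache" in the patch comment).
    void * addr = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED) { std::perror("mmap"); close(fd); return 1; }

    // ... a quantizer would read tensor bytes directly out of `addr` here ...

    munmap(addr, (size_t) st.st_size);
    close(fd);
    return 0;
}
```

With prefetch disabled (the `/* prefetch */ 0` argument in the patch), setting up the mapping is cheap and only the pages actually read during quantization are brought in.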
From bc34dd4f5b5a7c10ae3ed85a265ce6f2ed2fab79 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 29 Sep 2023 19:05:18 +0300
Subject: [PATCH 4/5] train : fix KQ_pos allocation (#3392)

* train : fix KQ_pos allocation

* make sure KQ_pos is not reallocated in finetune

---------

Co-authored-by: xaedes
---
 examples/finetune/finetune.cpp                                | 5 ++++-
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index b61165fb7..8ca1874da 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -626,7 +626,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(alloc)) {
         int * data = (int *) KQ_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
@@ -786,6 +787,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
     GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
     ggml_allocr_alloc(alloc, t36->grad);
+    // KQ_pos
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
 
     // make sure base model tensors data cannot be used in viewable operations
     ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 5043f32d0..be693b3ac 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -334,7 +334,8 @@ static struct ggml_tensor * llama_build_train_graphs(
 
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
+    if (!ggml_allocr_is_measure(alloc)) {
         int * data = (int *) KQ_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
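The fix above reflects how these training graphs are built: first with a measure allocator, where `ggml_allocr_is_measure()` is true and tensors have no real backing memory (so `KQ_pos->data` must not be written), and then with a real allocator backed by a buffer of the measured size. The toy sketch below illustrates that two-pass pattern; `toy_allocr` and its methods are deliberately made up and are not the ggml-alloc API.

```cpp
// Toy illustration (no ggml): the "measure, then allocate for real" pattern.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct toy_allocr {
    uint8_t * base   = nullptr; // nullptr => measure mode: only sizes are tracked
    size_t    offset = 0;

    bool   is_measure() const { return base == nullptr; }
    void * alloc(size_t size) {
        void * ptr = is_measure() ? nullptr : base + offset;
        offset += size;  // both modes advance the offset, so the measure pass reports total usage
        return ptr;
    }
};

int main() {
    const int N = 32, n_past = 0;

    // Pass 1: measure. No memory backs the "tensor", so its data must not be touched.
    toy_allocr measure;
    (void) measure.alloc(N * sizeof(int32_t));
    const size_t needed = measure.offset;

    // Pass 2: allocate from a buffer of the measured size; now writing is safe.
    std::vector<uint8_t> buf(needed);
    toy_allocr allocr{buf.data(), 0};
    int32_t * KQ_pos = static_cast<int32_t *>(allocr.alloc(N * sizeof(int32_t)));

    if (!allocr.is_measure()) {          // mirrors the ggml_allocr_is_measure() check in the patch
        for (int i = 0; i < N; ++i) {
            KQ_pos[i] = n_past + i;      // fill positions only when memory really exists
        }
    }

    assert(KQ_pos[N - 1] == n_past + N - 1);
    return 0;
}
```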
From 40e07a60f9ce06e79f3ccd4c903eba300fb31b5e Mon Sep 17 00:00:00 2001
From: slaren
Date: Fri, 29 Sep 2023 18:42:32 +0200
Subject: [PATCH 5/5] llama.cpp : add documentation about rope_freq_base and scale values (#3401)

* llama.cpp : add documentation about rope_freq_base and scale values

* add notice to hot topics
---
 README.md |  1 +
 llama.h   | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 75b6075f2..ec7b58943 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
 - Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
   **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
diff --git a/llama.h b/llama.h
index 96ff1f09c..fde4d6eca 100644
--- a/llama.h
+++ b/llama.h
@@ -167,18 +167,18 @@ extern "C" {
 
     struct llama_context_params {
         uint32_t seed;              // RNG seed, -1 for random
-        uint32_t n_ctx;             // text context
-        uint32_t n_batch;           // prompt processing batch size
+        uint32_t n_ctx;             // text context, 0 = from model
+        uint32_t n_batch;           // prompt processing maximum batch size
         uint32_t n_threads;         // number of threads to use for generation
         uint32_t n_threads_batch;   // number of threads to use for batch processing
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float    rope_freq_base;    // RoPE base frequency
-        float    rope_freq_scale;   // RoPE frequency scaling factor
+        float    rope_freq_base;    // RoPE base frequency, 0 = from model
+        float    rope_freq_scale;   // RoPE frequency scaling factor, 0 = from model
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
     };
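To pick up the defaults documented above, a caller sets the RoPE fields (and optionally `n_ctx`) to zero so the values stored in the GGUF file are used. Below is a minimal sketch, assuming the `llama.h` API at this point in the series (`llama_model_default_params`, `llama_load_model_from_file`, `llama_context_default_params`, `llama_new_context_with_model`); the model path is just a placeholder.

```cpp
// Sketch: rely on the model's own context length and RoPE parameters (0 = from model).
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(/* numa */ false);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.gguf", mparams);
    if (model == nullptr) { std::fprintf(stderr, "failed to load model\n"); return 1; }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 0;     // 0 = use the model's training context length
    cparams.rope_freq_base  = 0.0f;  // 0 = use the RoPE base frequency stored in the model
    cparams.rope_freq_scale = 0.0f;  // 0 = use the RoPE scale stored in the model

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) { std::fprintf(stderr, "failed to create context\n"); return 1; }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

Passing explicit non-zero values still overrides the model metadata, which remains the escape hatch for context-extension experiments.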