From 800c9635b4a9390126f397870f3a825fc7455bd1 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Wed, 23 Aug 2023 02:27:06 +0800 Subject: [PATCH 01/10] Fix CUDA softmax by subtracting max value before exp (#2665) --- ggml-cuda.cu | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 8ab29bb20..4fe378c21 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3979,24 +3979,29 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int // the CUDA soft max implementation differs from the CPU implementation // instead of doubles floats are used -// values are also not normalized to the maximum value by subtracting it in the exponential function -// theoretically these changes could cause problems with rounding error and arithmetic overflow but for LLaMa it seems to be fine static __global__ void soft_max_f32(const float * x, float * dst, const int ncols) { const int row = blockDim.x*blockIdx.x + threadIdx.x; const int block_size = blockDim.y; const int tid = threadIdx.y; - float tmp = 0.0; - - for (int block_start = 0; block_start < ncols; block_start += block_size) { - const int col = block_start + tid; - - if (col >= ncols) { - break; - } + float max_val = -INFINITY; + for (int col = tid; col < ncols; col += block_size) { const int i = row*ncols + col; - const float val = expf(x[i]); + max_val = max(max_val, x[i]); + } + + // find the max value in the block +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + max_val = max(max_val, __shfl_xor_sync(0xffffffff, max_val, mask, 32)); + } + + float tmp = 0.f; + + for (int col = tid; col < ncols; col += block_size) { + const int i = row*ncols + col; + const float val = expf(x[i] - max_val); tmp += val; dst[i] = val; } @@ -4007,15 +4012,11 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } - for (int block_start = 0; block_start < ncols; block_start += block_size) { - const int col = block_start + tid; - - if (col >= ncols) { - break; - } + const float inv_tmp = 1.f / tmp; + for (int col = tid; col < ncols; col += block_size) { const int i = row*ncols + col; - dst[i] /= tmp; + dst[i] *= inv_tmp; } } From 3b6cfe7c927df178ca3c11643c3ec93e143471c9 Mon Sep 17 00:00:00 2001 From: Alex Petenchea Date: Tue, 22 Aug 2023 21:58:16 +0300 Subject: [PATCH 02/10] convert.py : clarifying error message (#2718) --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 71978d671..e720889fd 100644 --- a/convert.py +++ b/convert.py @@ -964,7 +964,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, Sentence path = path3 else: raise FileNotFoundError( - f"Could not find tokenizer.model in {path} or its parent; " + f"Could not find {vocab_file} in {path} or its parent; " "if it's in another directory, pass the directory as --vocab-dir") print(f"Loading vocab file '{path}', type '{vocabtype}'") From c63bb1d16a70c03440671b76954bb767513cead8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 22 Aug 2023 22:47:05 +0200 Subject: [PATCH 03/10] CUDA: use mul_mat_q kernels by default (#2683) --- common/common.cpp | 16 ++++++++-------- common/common.h | 2 +- examples/server/server.cpp | 13 ++++++------- ggml-cuda.cu | 2 +- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1623ba21f..2a83b379e 100644 --- a/common/common.cpp 
+++ b/common/common.cpp @@ -387,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { #else fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); #endif // GGML_USE_CUBLAS - } else if (arg == "--mul-mat-q" || arg == "-mmq") { + } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #ifdef GGML_USE_CUBLAS - params.mul_mat_q = true; + params.mul_mat_q = false; #else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n"); + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--low-vram" || arg == "-lv") { #ifdef GGML_USE_CUBLAS @@ -599,11 +599,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " number of layers to store in VRAM\n"); fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); - fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); - fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); - fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); - fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); + fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); + fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); + fprintf(stdout, " -nommq, --no-mul-mat-q\n"); + fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); #endif fprintf(stdout, " --mtest compute maximum memory usage\n"); fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); diff --git a/common/common.h b/common/common.h index c50a6edfc..18fd951ea 100644 --- a/common/common.h +++ b/common/common.h @@ -68,7 +68,7 @@ struct gpt_params { size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score bool low_vram = false; // if true, reduce VRAM usage at the cost of performance - bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels + bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 39fdf3307..e5bc52cd0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -671,12 +671,11 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stdout, " number of layers to store in VRAM\n"); fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); - fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" ); - fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" ); - fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" ); + fprintf(stdout, " -nommq, --no-mul-mat-q\n"); + fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); + fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); #endif fprintf(stdout, " -m FNAME, --model FNAME\n"); fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); @@ -867,12 +866,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {}); #endif // GGML_USE_CUBLAS } - else if (arg == "--mul-mat-q" || arg == "-mmq") + else if (arg == "--no-mul-mat-q" || arg == "-nommq") { #ifdef GGML_USE_CUBLAS - params.mul_mat_q = true; + params.mul_mat_q = false; #else - LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {}); + LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4fe378c21..70a950bb5 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -287,7 +287,7 @@ static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = false; +static bool g_mul_mat_q = true; static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default From 46ef5b5fcf4c366e1fb27726b6394adbbf8fd0ea Mon Sep 17 00:00:00 2001 From: goerch Date: Tue, 22 Aug 2023 23:10:42 +0200 Subject: [PATCH 04/10] llama : fix whitespace escaping in tokenizer (#2724) --- llama.cpp | 13 +++---------- tests/test-tokenizer-0.cpp | 11 ++++++++++- tests/test-tokenizer-1.cpp | 13 +++---------- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6abdc44f2..6c5da1309 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2253,18 +2253,11 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { } static std::string llama_escape_whitespace(const std::string& text) { - std::string result; - bool escaping = false; - result += "\xe2\x96\x81"; + std::string result = "\xe2\x96\x81"; for (size_t offs = 0; offs < text.length(); ++offs) { if (text[offs] == ' ') { - if (!escaping) { - result += "\xe2\x96\x81"; - escaping = true; - } - } - else { - escaping = false; + result += "\xe2\x96\x81"; + } else { result += text[offs]; } } diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 81764565b..f3ee851a3 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -17,6 +17,8 @@ static std::string unescape_whitespace(llama_context* ctx, const std::vector> & k_tests() { static std::map> _k_tests = { { " ", {1, 259, }, }, + { " ", { 1, 1678, }, }, + { " ", { 1, 268, }, }, { "\t", { 1, 29871, 12, }, }, { "\n", { 1, 29871, 13, }, }, { "\t\n", { 1, 29871, 12, 13, }, }, @@ -38,6 +40,12 @@ static 
const std::map> & k_tests() { 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello", { 1, 15043 }, }, + { " Hello", { 1, 29871, 15043 }, }, + { " Hello", { 1, 259, 15043 }, }, + { " Hello", { 1, 1678, 15043 }, }, + { " Hello", { 1, 268, 15043 }, }, + { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, }, }; return _k_tests; @@ -106,7 +114,8 @@ int main(int argc, char **argv) { if (!correct) { fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str()); fprintf(stderr, "%s : expected tokens: ", __func__); for (const auto & t : test_kv.second) { fprintf(stderr, "%6d, ", t); diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp index d8db7cd96..993d17f18 100644 --- a/tests/test-tokenizer-1.cpp +++ b/tests/test-tokenizer-1.cpp @@ -11,18 +11,11 @@ #include static std::string escape_whitespace(const std::string& text) { - std::string result; - bool escaping = false; - result += "\xe2\x96\x81"; + std::string result = "\xe2\x96\x81"; for (size_t offs = 0; offs < text.length(); ++offs) { if (text[offs] == ' ') { - if (!escaping) { - result += "\xe2\x96\x81"; - escaping = true; - } - } - else { - escaping = false; + result += "\xe2\x96\x81"; + } else { result += text[offs]; } } From 777f42ba18b29f25c71ff8de3ecf97b8017304c0 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Tue, 22 Aug 2023 17:39:39 -0600 Subject: [PATCH 05/10] Improve handling of special tokens in GGML to GGUF converter (#2725) * Improve UNK, BOS, EOS token handling when converting without metadata. * Allow importing as a module. * Remove some obsolete code and minor cleanups. * Set default UNK token mapping from -1 to 0 in llama.cpp * Try to handle overflow due to buggy Windows Python with a better error message --- convert-llama-ggmlv3-to-gguf.py | 43 +++++++++++++++++++++++---------- llama.cpp | 2 +- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index fa4a044ca..5b038fc0a 100644 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -1,10 +1,12 @@ -import sys, struct, math, argparse +import sys, struct, math, argparse, warnings from pathlib import Path import numpy as np import gguf +warnings.filterwarnings('error') + # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) @@ -215,15 +217,10 @@ class GGMLToGGUF: if self.vocab_override is not None: vo = self.vocab_override print('* Adding vocab item(s)') - for (idx, vitem) in enumerate(vo.all_tokens()): - if len(vitem) == 3: - tokens.append(vitem[0]) - scores.append(vitem[1]) - toktypes.append(vitem[2]) - else: - # Maybe try to guess the token type here? 
- tokens.append(vitem[0]) - scores.append(vitem[1]) + for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): + tokens.append(vbytes) + scores.append(score) + toktypes.append(ttype) assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}' gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) @@ -231,9 +228,21 @@ class GGMLToGGUF: gguf_writer.add_token_types(toktypes) return print(f'* Adding {hp.n_vocab} vocab item(s)') + assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): tt = 1 # Normal - if len(vbytes) == 0: + # Special handling for UNK, BOS, EOS tokens. + if tokid <= 2: + if tokid == 0: + vbytes = b'' + tt = 2 + elif tokid == 1: + vbytes = b'' + tt = 3 + else: + vbytes = b'' + tt = 3 + elif len(vbytes) == 0: tt = 3 # Control elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1: vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8') @@ -246,6 +255,9 @@ class GGMLToGGUF: gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) + gguf_writer.add_unk_token_id(0) + gguf_writer.add_bos_token_id(1) + gguf_writer.add_eos_token_id(2) def add_tensors(self, gguf_writer): nm = self.name_map @@ -315,7 +327,11 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLV3Model() print('* Scanning GGML input file') - offset = model.load(data, 0) + try: + offset = model.load(data, 0) + except OverflowError: + print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr) + raise print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None @@ -330,4 +346,5 @@ def main(): converter.save() print(f'* Successful completion. Output saved to: {cfg.output}') -main() +if __name__ == '__main__': + main() diff --git a/llama.cpp b/llama.cpp index 6c5da1309..fd8eaa180 100644 --- a/llama.cpp +++ b/llama.cpp @@ -703,7 +703,7 @@ struct llama_vocab { // default LLaMA special tokens id special_bos_id = 1; id special_eos_id = 2; - id special_unk_id = -1; + id special_unk_id = 0; id special_sep_id = -1; id special_pad_id = -1; From f5fe98d11bdf9e7797bcfb05c0c3601ffc4b9d26 Mon Sep 17 00:00:00 2001 From: Evan Jones Date: Tue, 22 Aug 2023 21:01:57 -0400 Subject: [PATCH 06/10] docs : add grammar docs (#2701) * docs : add grammar docs * tweaks to grammar guide * rework GBNF example to be a commented grammar --- README.md | 12 ++++++ examples/main/README.md | 4 ++ grammars/README.md | 91 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 grammars/README.md diff --git a/README.md b/README.md index 82e070ac3..f746c49eb 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Last revision compatible with the old format: [dadbed9](https://github.com/ggerg
   • Memory/Disk Requirements
   • Quantization
   • Interactive mode
+  • Constrained output with grammars
   • Instruction mode with Alpaca
   • Using OpenLLaMA
   • Using GPT4All
  • @@ -604,6 +605,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh ``` +### Constrained output with grammars + +`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: + +```bash +./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +``` + +The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). + ### Instruction mode with Alpaca 1. First, download the `ggml` Alpaca model into the `./models` folder @@ -885,3 +896,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m / - [BLIS](./docs/BLIS.md) - [Performance troubleshooting](./docs/token_generation_performance_tips.md) - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [GBNF grammars](./grammars/README.md) diff --git a/examples/main/README.md b/examples/main/README.md index 60e3907d5..d555afdcc 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -288,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models. - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. +### Grammars + +- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. + ### Quantization For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run). diff --git a/grammars/README.md b/grammars/README.md new file mode 100644 index 000000000..7f3b11ca5 --- /dev/null +++ b/grammars/README.md @@ -0,0 +1,91 @@ +# GBNF Guide + +GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`. + +## Background + +[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features. + +## Basics + +In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`. 
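As a minimal illustration of that format, here is a hypothetical two-rule grammar written for this guide (it is not one of the repository's sample grammars); it constrains output to a single yes/no answer followed by a newline:
```
# answer with exactly one word, then a newline
root   ::= answer "\n"
answer ::= "yes" | "no"
```
The larger chess example below exercises the same ideas, adding repetition and character ranges.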
## Example

Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
```
# `root` specifies the pattern for the overall output
root ::= (
    # it must start with the characters "1. " followed by a sequence
    # of characters that match the `move` rule, followed by a space, followed
    # by another move, and then a newline
    "1. " move " " move "\n"

    # it's followed by one or more subsequent moves, numbered with one or two digits
    ([1-9] [0-9]? ". " move " " move "\n")+
)

# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
# The `[+#]?` denotes the possibility of checking or mate signs after moves
move ::= (pawn | nonpawn | castle) [+#]?

pawn ::= ...
nonpawn ::= ...
castle ::= ...
```

## Non-Terminals and Terminals

Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.

Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.

## Characters and character ranges

Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).

Character ranges can be negated with `^`:
```
single-line ::= [^\n]+ "\n"
```

## Sequences and Alternatives

The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.

Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.

Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optional symbols (below) to a sequence.

## Repetition and Optional Symbols

- `*` after a symbol or sequence means that it can be repeated zero or more times.
- `+` denotes that the symbol or sequence should appear one or more times.
- `?` makes the preceding symbol or sequence optional.

## Comments and newlines

Comments can be specified with `#`:
```
# defines optional whitespace
ws ::= [ \t\n]+
```

Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.

## The root rule

In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.

```
# a grammar for lists
root ::= ("- " item)+
item ::= [^\n]+ "\n"
```

## Next steps

This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars.
You can try them out with: +``` +./main -m --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' +``` From b8ad1b66b23f9b2e6e4531e9a62753323036a556 Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Wed, 23 Aug 2023 02:12:12 -0500 Subject: [PATCH 07/10] server : allow json array in prompt or content for direct token input (#2306) * server: allow json array in prompt or content We accept an array of strings and numbers representing tokens, in addition to the current string valued prompt or content. This allows direct token input, so that any special tokens can be processed and used at the frontend during the construction of the json data, before sending to the server. And the server does not need to know or parse special tokens from textual input. With this, we can use EOS and BOS used in llama-2-chat models. * server: use tokenizePrompt(json) and default "" if empty prompt * server: fix prompt check * server: tokenize endpoint no longer adds BOS --- examples/server/README.md | 2 +- examples/server/server.cpp | 80 ++++++++++++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 4d97db2e4..77997f98d 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -126,7 +126,7 @@ node . `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does. + `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e5bc52cd0..1e6d10c1d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -190,6 +190,7 @@ struct llama_server_context size_t n_past = 0; size_t n_remain = 0; + json prompt; std::vector embd; std::vector last_n_tokens; @@ -267,6 +268,53 @@ struct llama_server_context return true; } + std::vector tokenize(json json_prompt, bool add_bos) + { + // If `add_bos` is true, we only add BOS, when json_prompt is a string, + // or the first element of the json_prompt array is a string. 
+ std::vector prompt_tokens; + + if (json_prompt.is_array()) + { + bool first = true; + for (const auto& p : json_prompt) + { + if (p.is_string()) + { + auto s = p.template get(); + std::vector p; + if (first) + { + s.insert(0, 1, ' '); // add a space if it's the first + p = ::llama_tokenize(ctx, s, add_bos); + first = false; + } + else + { + p = ::llama_tokenize(ctx, s, false); + } + prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + } + else + { + if (first) + { + first = false; + } + prompt_tokens.push_back(p.template get()); + } + } + } + else + { + auto s = json_prompt.template get(); + s.insert(0, 1, ' '); // always add a first space + prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + } + + return prompt_tokens; + } + bool loadGrammar() { if (!params.grammar.empty()) { @@ -294,8 +342,8 @@ struct llama_server_context void loadPrompt() { - params.prompt.insert(0, 1, ' '); // always add a first space - std::vector prompt_tokens = ::llama_tokenize(ctx, params.prompt, true); + auto prompt_tokens = tokenize(prompt, true); // always add BOS + num_prompt_tokens = prompt_tokens.size(); if (params.n_keep < 0) @@ -1016,7 +1064,7 @@ static json format_final_response(llama_server_context &llama, const std::string {"tokens_predicted", llama.num_tokens_predicted}, {"tokens_evaluated", llama.num_prompt_tokens}, {"generation_settings", format_generation_settings(llama)}, - {"prompt", llama.params.prompt}, + {"prompt", llama.prompt}, {"truncated", llama.truncated}, {"stopped_eos", llama.stopped_eos}, {"stopped_word", llama.stopped_word}, @@ -1085,10 +1133,18 @@ static void parse_options_completion(const json &body, llama_server_context &lla llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl); llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); llama.params.seed = json_value(body, "seed", default_params.seed); - llama.params.prompt = json_value(body, "prompt", default_params.prompt); llama.params.grammar = json_value(body, "grammar", default_params.grammar); llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs); + if (body.count("prompt") != 0) + { + llama.prompt = body["prompt"]; + } + else + { + llama.prompt = ""; + } + llama.params.logit_bias.clear(); if (json_value(body, "ignore_eos", false)) { @@ -1345,8 +1401,11 @@ int main(int argc, char **argv) auto lock = llama.lock(); const json body = json::parse(req.body); - const std::string content = json_value(body, "content", ""); - const std::vector tokens = llama_tokenize(llama.ctx, content, false); + std::vector tokens; + if (body.count("content") != 0) + { + tokens = llama.tokenize(body["content"], false); + } const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); @@ -1358,7 +1417,14 @@ int main(int argc, char **argv) llama.rewind(); llama_reset_timings(llama.ctx); - llama.params.prompt = json_value(body, "content", ""); + if (body.count("content") != 0) + { + llama.prompt = body["content"]; + } + else + { + llama.prompt = ""; + } llama.params.n_predict = 0; llama.loadPrompt(); llama.beginCompletion(); From 7f7ddd5002040804e33fcdbde44aa22f8635f57d Mon Sep 17 00:00:00 2001 From: IgnacioFDM Date: Wed, 23 Aug 2023 06:31:09 -0300 Subject: [PATCH 08/10] Fix ggml to gguf conversion on Windows (#2733) This fixes `RuntimeWarning: overflow encountered in long_scalars` Credit: anon (not mine) --- convert-llama-ggmlv3-to-gguf.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git 
a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index 5b038fc0a..86d459680 100644 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -1,12 +1,10 @@ -import sys, struct, math, argparse, warnings +import sys, struct, math, argparse from pathlib import Path import numpy as np import gguf -warnings.filterwarnings('error') - # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) @@ -95,7 +93,7 @@ class Tensor: pad = ((offset + 31) & ~31) - offset offset += pad n_elems = np.prod(self.dims) - n_bytes = (n_elems * tysize) // blksize + n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize) self.start_offset = offset self.len_bytes = n_bytes offset += n_bytes @@ -327,11 +325,7 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLV3Model() print('* Scanning GGML input file') - try: - offset = model.load(data, 0) - except OverflowError: - print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr) - raise + offset = model.load(data, 0) print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None From 62959e740e8759d246ac8d09036950efde09981c Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 23 Aug 2023 12:56:42 +0300 Subject: [PATCH 09/10] Strided perplexity (#2714) * Implementing strided computation of perplexity * Alternative way to output PPL results --------- Co-authored-by: Iwan Kawrakow --- common/common.cpp | 12 +++ common/common.h | 4 + examples/perplexity/perplexity.cpp | 126 ++++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 2a83b379e..88a962ae3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -417,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; + } else if (arg == "--ppl-stride") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.ppl_stride = std::stoi(argv[i]); + } else if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.ppl_output_type = std::stoi(argv[i]); } else if (arg == "--hellaswag") { params.hellaswag = true; } else if (arg == "--hellaswag-tasks") { diff --git a/common/common.h b/common/common.h index 18fd951ea..d68a8ef88 100644 --- a/common/common.h +++ b/common/common.h @@ -64,6 +64,10 @@ struct gpt_params { std::string lora_adapter = ""; // lora adapter path std::string lora_base = ""; // base model path for the lora adapter + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. 
+ int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + // (which is more convenient to use for plotting) + // bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index f3c045aec..e89725efc 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -27,7 +27,121 @@ std::vector softmax(const std::vector& logits) { return probs; } +void perplexity_v2(llama_context * ctx, const gpt_params & params) { + + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + // BOS tokens will be added for each chunk before eval + + if (params.ppl_stride <= 0) { + fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + return; + } + auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + const int calc_chunk = params.n_ctx; + + fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); + + if (int(tokens.size()) <= calc_chunk) { + fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, + tokens.size(), params.n_ctx, params.ppl_stride); + return; + } + + const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; + + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(ctx); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * params.ppl_stride; + const int end = start + calc_chunk; + + const int num_batches = (calc_chunk + n_batch - 1) / n_batch; + //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); + + std::vector logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { + //fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (j == 0) { + tokens[batch_start] = llama_token_bos(ctx); + } + + const auto batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + + if (j == 0) { + tokens[batch_start] = token_org; + } + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if 
(total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + } + + //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); + for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) { + + // Calculate probability of next token, given the previous ones. + const std::vector tok_logits( + logits.begin() + (j + 0) * n_vocab, + logits.begin() + (j + 1) * n_vocab); + + const float prob = softmax(tok_logits)[tokens[start + j + 1]]; + + nll += -std::log(prob); + ++count; + } + // perplexity is e^(average negative log-likelihood) + if (params.ppl_output_type == 0) { + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + } else { + printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + } + fflush(stdout); + } + printf("\n"); +} + void perplexity(llama_context * ctx, const gpt_params & params) { + + if (params.ppl_stride > 0) { + perplexity_v2(ctx, params); + return; + } + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` @@ -116,7 +230,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) { ++count; } // perplexity is e^(average negative log-likelihood) - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + if (params.ppl_output_type == 0) { + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + } else { + printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count)); + } fflush(stdout); } printf("\n"); @@ -369,6 +487,12 @@ int main(int argc, char ** argv) { params.perplexity = true; params.n_batch = std::min(params.n_batch, params.n_ctx); + if (params.ppl_stride > 0) { + fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", + params.n_ctx, params.n_ctx + params.ppl_stride/2); + params.n_ctx += params.ppl_stride/2; + } + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); From 8207214b6a37a46526cee9e72d4c9092b9d1872f Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Wed, 23 Aug 2023 12:57:12 +0300 Subject: [PATCH 10/10] Fix values shown in the quantize tool help (#2735) Co-authored-by: Iwan Kawrakow --- examples/quantize/quantize.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f628d0642..d172f645a 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -14,25 +14,25 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", }, - { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", }, - { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", }, - { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, 
#ifdef GGML_USE_K_QUANTS - { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, - { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", }, - { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, - { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", }, - { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", }, { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, - { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", }, - { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, #endif - { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, };
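As a usage note for the table above: these type names are what the quantize tool accepts on its command line. A minimal sketch (the model paths are placeholders, and the binary is assumed to be built at the repository root as `./quantize`):
```bash
# quantize an f16 GGUF model to Q4_K_M; the last argument is one of the
# type names listed in QUANT_OPTIONS above
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-Q4_K_M.gguf Q4_K_M
```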