From a17a2683d8fdb899ba497d0c28ccafb28c62efb6 Mon Sep 17 00:00:00 2001
From: tslmy
Date: Thu, 6 Jul 2023 09:17:50 -0700
Subject: [PATCH 1/4] alpaca.sh : update model file name (#2074)

The original file name, `ggml-alpaca-7b-q4.bin`, implied the first-generation
GGML format. After the breaking changes (mentioned in
https://github.com/ggerganov/llama.cpp/issues/382), `llama.cpp` now requires
GGML V3, and those model files are named `*ggmlv3*.bin`. The example should
point to a model file that actually works, so that it runs out of the box for
more people and fewer people waste time downloading the old Alpaca model.
---
 examples/alpaca.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/alpaca.sh b/examples/alpaca.sh
index aef207f36..8d2bae691 100755
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
        --color \
        -f ./prompts/alpaca.txt \
        --ctx_size 2048 \

From 36680f6e40e4440c3ec3385d0b7e5ca8bb6c37f7 Mon Sep 17 00:00:00 2001
From: Judd
Date: Fri, 7 Jul 2023 00:23:49 +0800
Subject: [PATCH 2/4] convert : update for baichuan (#2081)

1. guess n_layer;
2. relax warnings on context size;
3. add a note that Baichuan derivatives are also supported.

Co-authored-by: Judd
---
 README.md                          | 2 +-
 convert.py                         | 6 ++++++
 examples/embedding/embedding.cpp   | 2 +-
 examples/main/main.cpp             | 2 +-
 examples/perplexity/perplexity.cpp | 2 +-
 examples/server/README.md          | 2 +-
 6 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 32f17c2d1..863aef123 100644
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
-- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
+- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivatives (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
 
 **Bindings:**
 
diff --git a/convert.py b/convert.py
index 142692776..66509b99c 100644
--- a/convert.py
+++ b/convert.py
@@ -154,9 +154,15 @@ class Params:
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
         n_head=n_embd // 128 # guessed
 
         return Params(
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 2b7eb39c5..03e801c2a 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 3a171925b..0f6391acb 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index dd54ed3c4..fd4b03cb2 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
 
diff --git a/examples/server/README.md b/examples/server/README.md
index c5139c16b..ad9b6bb08 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -7,7 +7,7 @@ Command line options:
 - `--threads N`, `-t N`: Set the number of threads to use during computation.
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, Baichuan models were built with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.

From dfd9fce6d65599bf33df43e616e85aa639bdae4c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 6 Jul 2023 19:41:31 +0300
Subject: [PATCH 3/4] ggml : fix restrict usage

---
 ggml.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/ggml.h b/ggml.h
index 24ca8ae22..d0710c555 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1514,9 +1514,15 @@ extern "C" {
     // Internal types and functions exposed for tests and benchmarks
     //
 
-    typedef void (*ggml_to_float_t)(const void * x, float * y, int k);
-    typedef void (*ggml_from_float_t)(const float * x, void * y, int k);
-    typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y);
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
         ggml_to_float_t   to_float;

From 481f793acc3882a09d45d8d2c3076ad3d1c60cfc Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Jul 2023 11:34:18 +0800
Subject: [PATCH 4/4] Fix OpenCL by wrapping #if-else-endif with \n (#2086)

---
 ggml-opencl.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index fa0bdbefb..eb214a836 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int im = tid/step;     // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 const int is = 0; -#else + +\n#else\n + const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; -#endif + +\n#endif\n + const int ql_offset = 64*im + l0; const int qh_offset = 32*im + l0; const int s_offset = 8*im + is; @@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, const float d = vload_half(0, &x[i].d); -#if K_QUANTS_PER_ITERATION == 1 +\n#if K_QUANTS_PER_ITERATION == 1\n float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) @@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); tmp[16 * ix + tid] += sum; -#else +\n#else\n float sum = 0; for (int l = 0; l < 4; ++l) { sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) @@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); } tmp[16 * ix + tid] += sum; -#endif +\n#endif\n }
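
Background note on [PATCH 3/4] (ggml : fix restrict usage): `restrict` is a C99
keyword with no standard C++ spelling, and ggml.h is included from both C and C++
translation units (hence the `extern "C"` block visible in the hunk above), so the
bare keyword cannot appear in the shared header. The patch routes it through a
macro that expands to `restrict` only when compiling as C. The sketch below is
purely illustrative; EXAMPLE_RESTRICT and example_vec_add are made-up names and
are not part of ggml.h.

/* Minimal sketch of the same pattern with hypothetical names. */
#ifdef __cplusplus
#define EXAMPLE_RESTRICT              /* C++ has no standard 'restrict' keyword */
#else
#define EXAMPLE_RESTRICT restrict     /* C99: promises the pointers do not alias */
#endif

/* The same prototype now compiles in both languages, and C builds keep the
   no-aliasing hint for the optimizer. */
void example_vec_add(int n, float * EXAMPLE_RESTRICT dst,
                     const float * EXAMPLE_RESTRICT a,
                     const float * EXAMPLE_RESTRICT b);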
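
Background note on [PATCH 4/4] (wrapping the directives with \n): OpenCL C follows
the C preprocessor rules, so `#if`, `#else` and `#endif` are only recognized when
they begin on their own line. The kernel source is assembled on the host side into
a single string before it is handed to the OpenCL compiler, and the original line
breaks around these directives can be lost in that process; the explicit `\n`
characters guarantee each directive starts on a fresh line no matter how the
fragments are joined. The host-side C++ sketch below is illustrative only; the
kernel, macro and variable names are invented and are not taken from
ggml-opencl.cpp.

// Hypothetical host-side assembly of an OpenCL kernel string. Without the "\n"
// markers the directives would land mid-line, after the padding of the previous
// fragment, and the program build would fail.
#include <string>

static const std::string example_kernel_src =
    "__kernel void scale(__global float * x) {       "
    "    const int i = get_global_id(0);             "
    "\n#if USE_LARGE_SCALE\n"   // hypothetical macro, e.g. set via "-DUSE_LARGE_SCALE=1" build options
    "    x[i] *= 4.0f;                               "
    "\n#else\n"
    "    x[i] *= 2.0f;                               "
    "\n#endif\n"
    "}                                               ";

Passing example_kernel_src to the standard clCreateProgramWithSource and
clBuildProgram calls would then compile cleanly whether or not the macro is
defined.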