From a5c42c4b13b3be9e58fe8f9adbb6ee60417674a6 Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 29 Mar 2023 16:19:29 +0300
Subject: [PATCH 01/15] Fix typo in llama.h (#593)

---
 llama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.h b/llama.h
index 587d85323..3368de3e0 100644
--- a/llama.h
+++ b/llama.h
@@ -6,7 +6,7 @@
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
-#    ifdef _WIN32 && !defined __MINGW32__
+#    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define LLAMA_API __declspec(dllexport)
 #        else

From 83df5639eb182ed7c122382907691d8baa3c32df Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 29 Mar 2023 16:20:07 +0300
Subject: [PATCH 02/15] Fix GCC warning about binary literal (#595)

0b10101010 -> 0xAA /* 0b10101010 */
---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index efe9316bb..c049f00a9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1962,7 +1962,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         // Compute cross scales for the block
         const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
         const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
-        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
         __m256i bx = bytesFromNibbles( x[i].qs );

From a6956b25a1c783e5e96fe06c9c00438f846ef047 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20L=C3=BCtke?=
Date: Wed, 29 Mar 2023 17:10:24 +0200
Subject: [PATCH 03/15] add example of re-act pattern (#583)

* add example of re-act pattern

* spelling...

* fixed whitespace in reverse prompt issue
---
 examples/reason-act.sh | 17 +++++++++++++++++
 prompts/reason-act.txt | 18 ++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100755 examples/reason-act.sh
 create mode 100644 prompts/reason-act.txt

diff --git a/examples/reason-act.sh b/examples/reason-act.sh
new file mode 100755
index 000000000..e7fe655db
--- /dev/null
+++ b/examples/reason-act.sh
@@ -0,0 +1,17 @@
+
+#!/bin/bash
+
+cd `dirname $0`
+cd ..
+
+# get -m model parameter otherwise defer to default
+if [ "$1" == "-m" ]; then
+  MODEL="-m $2 "
+fi
+
+./main $MODEL --color \
+       -f ./prompts/reason-act.txt \
+       -i --interactive-first \
+       --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
+       -r "Question:" -r "Observation:" --in-prefix " " \
+       -n -1
diff --git a/prompts/reason-act.txt b/prompts/reason-act.txt
new file mode 100644
index 000000000..872016631
--- /dev/null
+++ b/prompts/reason-act.txt
@@ -0,0 +1,18 @@
+You run in a loop of Thought, Action, Observation.
+At the end of the loop either Answer or restate your Thought and Action.
+Use Thought to describe your thoughts about the question you have been asked.
+Use Action to run one of these actions available to you:
+- calculate[python math expression]
+Observation will be the result of running those actions
+
+
+Question: What is 4 * 7 / 3?
+Thought: Do I need to use an action? Yes, I use calculate to do math
+Action: calculate[4 * 7 / 3]
+Observation: 9.3333333333
+Thought: Do I need to use an action? No, have the result
+Answer: The calculate tool says it is 9.3333333333
+Question: What is capital of france?
+Thought: Do I need to use an action? No, I know the answer
+Answer: Paris is the capital of France
+Question:

From 41318d708ed196ff727dce14d263a64b23c7333d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ABl=20Kerbiriou?=
Date: Wed, 29 Mar 2023 18:10:07 +0200
Subject: [PATCH 04/15] llama : use the same threshold for OpenBLAS and ggml thread limiting (#577)

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 2d0279258..aa0c362d9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -856,7 +856,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

From 53635c081c49321d523567112f9fddfbba6b787b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 19:29:26 +0300
Subject: [PATCH 05/15] py : add GPT4All conversion script

For now: copy-paste
Too much time for me to deduplicate the python code
---
 convert-gpt4all-to-ggml.py          | 107 ++++++++++++++++++++++++++++
 convert-unversioned-ggml-to-ggml.py |   2 +-
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 convert-gpt4all-to-ggml.py

diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
new file mode 100644
index 000000000..f1d9d7aef
--- /dev/null
+++ b/convert-gpt4all-to-ggml.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
+    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
+    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    return parser.parse_args()
+
+def read_header(f_in):
+    struct_fmt = "i" * (3 + len(HPARAMS))
+    struct_size = struct.calcsize(struct_fmt)
+    buf = f_in.read(struct_size)
+    return struct.unpack(struct_fmt, buf)
+
+def write_header(f_out, header):
+    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    values = [
+        0x67676d66, # magic: ggml in hex
+        1, # file version
+        vocab_size,
+        dim,
+        multiple_of,
+        n_heads,
+        n_layers,
+        rot,
+        ftype
+    ]
+    f_out.write(struct.pack("i" * len(values), *values))
+
+def write_tokens(fout, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode("utf-8")
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+    # TODO: GPT4All - add extra <pad> token
+    text = "<pad>".encode("utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", 0.0))
+
+def read_tokens(f_in, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        (length,) = struct.unpack("i", len_b)
+        f_in.read(length)
+
+def copy_all_data(f_out, f_in):
+    while True:
+        buf = f_in.read(1024 * 1024)
+        if not buf:
+            break
+        f_out.write(buf)
+
+def convert_one_file(path_in, tokenizer):
+    path_tmp = f"{path_in}.tmp"
+    path_orig= f"{path_in}.orig"
+    print(f"converting {path_in}")
+    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+        write_header(f_out, read_header(f_in))
+        read_tokens(f_in, tokenizer)
+        write_tokens(f_out, tokenizer)
+        copy_all_data(f_out, f_in)
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    args = parse_args()
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    convert_one_file(args.gpt4all_model, tokenizer)
+
+if __name__ == "__main__":
+    main()
diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
index 2457e3181..33b6243bd 100644
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -27,7 +27,7 @@ def write_header(f_out, header):
 
     if magic != 0x67676d6c:
         raise Exception('Invalid file magic. Must be an old style ggml file.')
-    
+
     values = [
         0x67676d66, # magic: ggml in hex
         1, # file version

From 516d88e75c9e768c0001a452dbad212494c586b3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 19:37:20 +0300
Subject: [PATCH 06/15] readme : add GPT4All instructions (close #588)

---
 README.md | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 5675a927b..c2323f40a 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 **Hot topics:**
 
 - [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
-- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
-- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
-- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
+- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
 
 ## Description
 
@@ -37,6 +35,12 @@
 - [X] Windows (via CMake)
 - [X] Docker
 
+Supported models:
+
+- [X] LLaMA
+- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
+- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+
 ---
 
 Here is a typical run using LLaMA-7B:
@@ -222,6 +226,17 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 > ```
 
+### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
+
+- Obtain the `gpt4all-lora-quantized.bin` model
+- It is distributed in the old `ggml` format which is not obsoleted. So you have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
+
+  ```bash
+  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
+  ```
+
+- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models. The original model is stored in the same folder with a suffix `.orig`
+
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
 
 - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**

From b467702b87461543c75013207e9adc6d20dcc01d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 19:38:31 +0300
Subject: [PATCH 07/15] readme : fix typos

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c2323f40a..e30452ee0 100644
--- a/README.md
+++ b/README.md
@@ -229,13 +229,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 
 - Obtain the `gpt4all-lora-quantized.bin` model
-- It is distributed in the old `ggml` format which is not obsoleted. So you have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
+- It is distributed in the old `ggml` format which is now obsoleted
+- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
 
   ```bash
   python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
   ```
 
-- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models. The original model is stored in the same folder with a suffix `.orig`
+- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
+- The original model is saved in the same folder with a suffix `.orig`
 
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

From d9ad104440d84a0cc0734bff47ef0ba41ba740c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9rence?= <13496987+Royalphax@users.noreply.github.com>
Date: Wed, 29 Mar 2023 19:21:09 +0200
Subject: [PATCH 08/15] Create chat-13B.bat (#592)

* Create chat-13B.bat

Same script than chat-13B.sh, but for windows users.
Tested and working on windows 10/11 v 22H2

* Apply suggestions from code review

---------

Co-authored-by: anzz1
---
 examples/chat-13B.bat | 57 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 examples/chat-13B.bat

diff --git a/examples/chat-13B.bat b/examples/chat-13B.bat
new file mode 100644
index 000000000..c5c8ac6ef
--- /dev/null
+++ b/examples/chat-13B.bat
@@ -0,0 +1,57 @@
+@setlocal disabledelayedexpansion enableextensions
+@echo off
+
+cd /d "%~dp0.."
+if not "%errorlevel%"=="0" (
+    echo Unable to change directory.
+    pause
+    exit /b 1
+)
+
+if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
+if not defined USER_NAME set "USER_NAME=User"
+if not defined AI_NAME set "AI_NAME=ChatLLaMa"
+rem Adjust to the number of CPU cores you want to use.
+rem if not defined N_THREAD set "N_THREAD=8"
+rem Number of tokens to predict (made it larger than default because we want a long interaction)
+if not defined N_PREDICTS set "N_PREDICTS=2048"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+
+rem Default main script paths
+set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
+
+rem Get main script path from command line arguments
+set "MAIN_SCRIPT_PATH=%~1"
+
+rem If the main script path was not specified, try the default paths
+if not defined MAIN_SCRIPT_PATH (
+    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
+        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
+    )
+)
+
+rem If the main script path was not found, tell the user how to specify it
+if not defined MAIN_SCRIPT_PATH (
+    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
+    echo %DEFAULT_MAIN_SCRIPT_PATHS%
+    pause
+    exit /b 1
+)
+
+rem Default context, feel free to edit it
+set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
+
+rem Set a temporary variable if N_THREAD is set
+if defined N_THREAD (
+    set "_N_THREAD=--threads %N_THREAD%"
+) else (
+    set "_N_THREAD="
+)
+
+rem Run the script
+echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
+    --model "%MODEL%" ^
+    --n_predict %N_PREDICTS% ^
+    --color --interactive ^
+    --reverse-prompt "%USER_NAME%:" ^
+    --prompt "%PROMPT_TEXT%"

From 61cbfff5c95e45236883b1b60e025f8f6fa8c8a3 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Wed, 29 Mar 2023 20:09:25 +0200
Subject: [PATCH 09/15] rename convert_ggml_to_pth.py -> convert-ggml-to-pth.py (#600)

to match filenames of other converters
---
 convert_ggml_to_pth.py => convert-ggml-to-pth.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename convert_ggml_to_pth.py => convert-ggml-to-pth.py (100%)

diff --git a/convert_ggml_to_pth.py b/convert-ggml-to-pth.py
similarity index 100%
rename from convert_ggml_to_pth.py
rename to convert-ggml-to-pth.py

From 3b44d30d9b618f0f2eb9abcfe912770a4e7d85d4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 21:47:33 +0300
Subject: [PATCH 10/15] ggml : add ARM_NEON ggml_vec_dot_q4_1()

---
 ggml.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/ggml.c b/ggml.c
index c049f00a9..0906cf90e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2008,6 +2008,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
 
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#elif defined(__ARM_NEON)
+    float sum00 = 0.0f;
+    float sum01 = 0.0f;
+    float sum10 = 0.0f;
+    float sum11 = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q4_1 * restrict x0 = &x[i + 0];
+        const block_q4_1 * restrict y0 = &y[i + 0];
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+        // and with 0xf
+        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+        // dot product into uint16x8_t
+        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+        sum00 += x0->m*y0->m;
+        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+    }
+
+    sumf = QK*sum00 + sum01 + sum10 + sum11;
 #else
     // scalar
     for (int i = 0; i < nb; i++) {

From cea1c859483a5cfc7e2b31a06f8561d7a7604870 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 22:03:02 +0300
Subject: [PATCH 11/15] ggml : add ARM_NEON quantize_row_q4_1()

---
 ggml.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/ggml.c b/ggml.c
index 0906cf90e..51cd3b91c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -564,10 +564,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         }
     }
 #elif __ARM_NEON
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
         float32x4_t srcv [8];
         float32x4_t asrcv[8];
         float32x4_t amaxv[8];
@@ -579,7 +576,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
 
-        amax = MAX(
+        // absolute max
+        const float amax = MAX(
                 MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
                 MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
 
@@ -593,11 +591,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
             const int32x4_t vi = vcvtq_s32_f32(vf);
 
-            pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
-            pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #elif defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
@@ -665,7 +661,6 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
 #elif defined(__wasm_simd128__)
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -694,11 +689,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
 
-            pp[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
-            pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
+            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #else
     // scalar
@@ -750,11 +743,11 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
 static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK == 0);
 
-#if defined(__AVX2__)
     const int nb = k / QK;
 
     block_q4_1 * restrict y = vy;
 
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -828,6 +821,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif __ARM_NEON
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv[8];
+        float32x4_t minv[8];
+        float32x4_t maxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+        const float min = vminvq_f32(minv[0]);
+        const float max = vmaxvq_f32(maxv[0]);
+
+        const float d  = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+        y[i].m = min;
+
+        const float32x4_t minv0 = vdupq_n_f32(min);
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+            const int32x4_t vi = vcvtq_s32_f32(v);
+
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+        }
+    }
 #else
     // scalar
     quantize_row_q4_1_reference(x, vy, k);

From f202ada131f60059112a948f660b2e0ac93d049a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 22:10:01 +0300
Subject: [PATCH 12/15] ggml : add ARM_NEON dequantize_row_q4_1()

---
 ggml.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/ggml.c b/ggml.c
index 51cd3b91c..ccdba30e0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1016,6 +1016,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float32x4_t vd = vdupq_n_f32(x[i].d);
+        const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit qs to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Interleave and combine
+            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+            // convert to 2x uint16x8_t
+            const uint16x8_t vi_0 = vmovl_s8(vget_low_u8 (vq));
+            const uint16x8_t vi_1 = vmovl_s8(vget_high_u8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+            // multiply by d and add m
+            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l +  0, r0);
+            vst1q_f32(y + i*QK + l +  4, r1);
+            vst1q_f32(y + i*QK + l +  8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;

From 0ba76c1e73ae21038b80bfb5a746157376c88173 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 22:13:12 +0300
Subject: [PATCH 13/15] llama : fix compile warnings when reading the vocab

---
 llama.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index aa0c362d9..e4998efa2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1444,7 +1444,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             return false;
         }
 
-        std::string word;
+        std::vector<char> word(32);
         vocab.id_to_token.resize(n_vocab);
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -1459,10 +1459,10 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             finp.read ((char *) &score, sizeof(score));
             fout.write((char *) &score, sizeof(score));
 
-            vocab.token_to_id[word] = i;
+            vocab.token_to_id[word.data()] = i;
 
             auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
+            tok_score.tok = word.data();
             tok_score.score = score;
         }
     }

From b51c717d5cf9181c33afcb84554e47f6d539c891 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 29 Mar 2023 22:15:34 +0300
Subject: [PATCH 14/15] ggml : init time on first ggml_init() call

---
 ggml.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml.c b/ggml.c
index ccdba30e0..02675ee67 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2748,6 +2748,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
        // initialize GELU, SILU and EXP F32 tables
        {
            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

From 9cbc404ba6699a9ba4925ea25a60552b13491c7a Mon Sep 17 00:00:00 2001
From: anzz1
Date: Wed, 29 Mar 2023 23:44:39 +0300
Subject: [PATCH 15/15] ci : re-enable AVX512 testing (Windows-MSVC) (#584)

* CI: Re-enable AVX512 testing (Windows-MSVC)

Now with 100% less base64 encoding

* plain __cpuid is enough here
---
 .github/workflows/build.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b5cf71a5e..88e70e495 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -176,7 +176,13 @@
         if: ${{ matrix.build == 'avx512' }}
         continue-on-error: true
         run: |
-          echo "TODO: check avx512f"
+          cd build
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
+          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
+          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
+          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
 
       - name: Test
         id: cmake_test
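A note on the Q4_1 scheme that patches 10-12 vectorize: each block of QK = 32 floats is stored as a scale `d`, a block minimum `m`, and 32 four-bit quants packed two per byte, so a value is recovered as `v = m + d*q`. This also explains the four accumulators in patch 10: writing `x_i = mx + dx*qx_i` and `y_i = my + dy*qy_i`, a block's dot product expands to `QK*mx*my + my*dx*sum(qx) + mx*dy*sum(qy) + dx*dy*sum(qx*qy)`, which are exactly the `sum00`/`sum01`/`sum10`/`sum11` terms combined at the end. Below is a minimal scalar sketch of the format for reference. It is an illustration only, not the exact ggml code: the function names are invented here, and rounding behavior differs slightly between ggml's reference and SIMD paths (e.g. the NEON quantizer truncates after scaling).

    #include <stdint.h>
    #include <math.h>

    #define QK 32 // elements per quantization block, as in ggml.c

    typedef struct {
        float   d;          // scale
        float   m;          // block minimum
        uint8_t qs[QK / 2]; // 32 quants, two 4-bit values per byte
    } block_q4_1;

    // Quantize one block: q = round((x - min) / d), where d spans 16 levels.
    static void quantize_block_q4_1(const float * x, block_q4_1 * y) {
        float min = x[0], max = x[0];
        for (int i = 1; i < QK; i++) {
            if (x[i] < min) min = x[i];
            if (x[i] > max) max = x[i];
        }

        const float d  = (max - min) / ((1 << 4) - 1);
        const float id = d ? 1.0f/d : 0.0f;

        y->d = d;
        y->m = min;

        for (int i = 0; i < QK; i += 2) {
            // earlier element goes into the low nibble, matching the packing
            // in the diffs above: q0 | (q1 << 4)
            const uint8_t q0 = (uint8_t) roundf((x[i + 0] - min)*id);
            const uint8_t q1 = (uint8_t) roundf((x[i + 1] - min)*id);
            y->qs[i/2] = q0 | (q1 << 4);
        }
    }

    // Dequantize one block: v = m + d*q for each packed nibble.
    static void dequantize_block_q4_1(const block_q4_1 * x, float * y) {
        for (int i = 0; i < QK; i += 2) {
            y[i + 0] = x->m + x->d*(x->qs[i/2] & 0x0f); // low nibble
            y[i + 1] = x->m + x->d*(x->qs[i/2] >> 4);   // high nibble
        }
    }

The NEON paths above compute the same thing 16 bytes at a time: the low nibbles are masked out with 0xf and the high nibbles shifted down, after which the min/max tree reduction (patch 11) or the widening multiply-accumulate (patch 10) replaces this per-element loop.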