diff --git a/convert_ggml_to_pth.py b/convert-ggml-to-pth.py
similarity index 100%
rename from convert_ggml_to_pth.py
rename to convert-ggml-to-pth.py
diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
new file mode 100644
index 000000000..f1d9d7aef
--- /dev/null
+++ b/convert-gpt4all-to-ggml.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
+    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
+    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    return parser.parse_args()
+
+def read_header(f_in):
+    struct_fmt = "i" * (3 + len(HPARAMS))
+    struct_size = struct.calcsize(struct_fmt)
+    buf = f_in.read(struct_size)
+    return struct.unpack(struct_fmt, buf)
+
+def write_header(f_out, header):
+    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    values = [
+        0x67676d66, # magic: ggmf in hex
+        1, # file version
+        vocab_size,
+        dim,
+        multiple_of,
+        n_heads,
+        n_layers,
+        rot,
+        ftype
+    ]
+    f_out.write(struct.pack("i" * len(values), *values))
+
+def write_tokens(fout, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode("utf-8")
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+    # TODO: GPT4All - add extra token
+    text = "".encode("utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", 0.0))
+
+def read_tokens(f_in, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        (length,) = struct.unpack("i", len_b)
+        f_in.read(length)
+
+def copy_all_data(f_out, f_in):
+    while True:
+        buf = f_in.read(1024 * 1024)
+        if not buf:
+            break
+        f_out.write(buf)
+
+def convert_one_file(path_in, tokenizer):
+    path_tmp = f"{path_in}.tmp"
+    path_orig = f"{path_in}.orig"
+    print(f"converting {path_in}")
+    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+        write_header(f_out, read_header(f_in))
+        read_tokens(f_in, tokenizer)
+        write_tokens(f_out, tokenizer)
+        copy_all_data(f_out, f_in)
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    args = parse_args()
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    convert_one_file(args.gpt4all_model, tokenizer)
+
+if __name__ == "__main__":
+    main()
diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
index 2457e3181..33b6243bd 100644
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -27,7 +27,7 @@ def write_header(f_out, header):
 
     if magic != 0x67676d6c:
         raise Exception('Invalid file magic. Must be an old style ggml file.')
-    
+
     values = [
         0x67676d66, # magic: ggml in hex
         1, # file version
diff --git a/examples/chat-13B.bat b/examples/chat-13B.bat
new file mode 100644
index 000000000..c5c8ac6ef
--- /dev/null
+++ b/examples/chat-13B.bat
@@ -0,0 +1,57 @@
+@setlocal disabledelayedexpansion enableextensions
+@echo off
+
+cd /d "%~dp0.."
+if not "%errorlevel%"=="0" (
+    echo Unable to change directory.
+    pause
+    exit /b 1
+)
+
+if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
+if not defined USER_NAME set "USER_NAME=User"
+if not defined AI_NAME set "AI_NAME=ChatLLaMa"
+rem Adjust to the number of CPU cores you want to use.
+rem if not defined N_THREAD set "N_THREAD=8"
+rem Number of tokens to predict (made it larger than default because we want a long interaction)
+if not defined N_PREDICTS set "N_PREDICTS=2048"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+
+rem Default main script paths
+set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
+
+rem Get main script path from command line arguments
+set "MAIN_SCRIPT_PATH=%~1"
+
+rem If the main script path was not specified, try the default paths
+if not defined MAIN_SCRIPT_PATH (
+    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
+        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
+    )
+)
+
+rem If the main script path was not found, tell the user how to specify it
+if not defined MAIN_SCRIPT_PATH (
+    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
+    echo %DEFAULT_MAIN_SCRIPT_PATHS%
+    pause
+    exit /b 1
+)
+
+rem Default context, feel free to edit it
+set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
+
+rem Set a temporary variable if N_THREAD is set
+if defined N_THREAD (
+    set "_N_THREAD=--threads %N_THREAD%"
+) else (
+    set "_N_THREAD="
+)
+
+rem Run the script
+"%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
+    --model "%MODEL%" ^
+    --n_predict %N_PREDICTS% ^
+    --color --interactive ^
+    --reverse-prompt "%USER_NAME%:" ^
+    --prompt "%PROMPT_TEXT%"
diff --git a/examples/reason-act.sh b/examples/reason-act.sh
new file mode 100755
index 000000000..e7fe655db
--- /dev/null
+++ b/examples/reason-act.sh
@@ -0,0 +1,17 @@
+
+#!/bin/bash
+
+cd `dirname $0`
+cd ..
+
+# get -m model parameter otherwise defer to default
+if [ "$1" == "-m" ]; then
+  MODEL="-m $2 "
+fi
+
+./main $MODEL --color \
+       -f ./prompts/reason-act.txt \
+       -i --interactive-first \
+       --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
+       -r "Question:" -r "Observation:" --in-prefix " " \
+       -n -1
diff --git a/ggml.c b/ggml.c
index efe9316bb..02675ee67 100644
--- a/ggml.c
+++ b/ggml.c
@@ -564,10 +564,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         }
     }
 #elif __ARM_NEON
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
         float32x4_t srcv [8];
         float32x4_t asrcv[8];
         float32x4_t amaxv[8];
@@ -579,7 +576,8 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
 
-        amax = MAX(
+        // absolute max
+        const float amax = MAX(
                 MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
                 MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
 
@@ -593,11 +591,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
             const int32x4_t vi = vcvtq_s32_f32(vf);
 
-            pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
-            pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #elif defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
@@ -665,7 +661,6 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
 #elif defined(__wasm_simd128__)
-    uint8_t pp[QK/2];
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -694,11 +689,9 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
 
-            pp[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
-            pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
+            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
+            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
         }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
     }
 #else
     // scalar
@@ -750,11 +743,11 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
 static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK == 0);
 
-#if defined(__AVX2__)
     const int nb = k / QK;
 
     block_q4_1 * restrict y = vy;
 
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -828,6 +821,41 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         __m128i res = packNibbles( i0 );
         _mm_storeu_si128( ( __m128i* )y[i].qs, res );
     }
+#elif __ARM_NEON
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv[8];
+        float32x4_t minv[8];
+        float32x4_t maxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+
+        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
+
+        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
+
+        const float min = vminvq_f32(minv[0]);
+        const float max = vmaxvq_f32(maxv[0]);
+
+        const float d  = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+        y[i].m = min;
+
+        const float32x4_t minv0 = vdupq_n_f32(min);
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
+            const int32x4_t   vi = vcvtq_s32_f32(v);
+
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+        }
+    }
 #else
     // scalar
     quantize_row_q4_1_reference(x, vy, k);
@@ -988,6 +1016,50 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float32x4_t vd = vdupq_n_f32(x[i].d);
+        const float32x4_t vm = vdupq_n_f32(x[i].m);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit qs to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Interleave and combine
+            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
+            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
+
+            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
+
+            // convert to 2x uint16x8_t
+            const uint16x8_t vi_0 = vmovl_s8(vget_low_u8 (vq));
+            const uint16x8_t vi_1 = vmovl_s8(vget_high_u8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
+
+            // multiply by d and add m
+            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
+            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
+            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
+            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l +  0, r0);
+            vst1q_f32(y + i*QK + l +  4, r1);
+            vst1q_f32(y + i*QK + l +  8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;
@@ -1962,7 +2034,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         // Compute cross scales for the block
         const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
         const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
-        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0xAA /* 0b10101010 */ );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
         __m256i bx = bytesFromNibbles( x[i].qs );
@@ -2008,6 +2080,45 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
 
+#elif defined(__ARM_NEON)
+    float sum00 = 0.0f;
+    float sum01 = 0.0f;
+    float sum10 = 0.0f;
+    float sum11 = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q4_1 * restrict x0 = &x[i + 0];
+        const block_q4_1 * restrict y0 = &y[i + 0];
+
+        const uint8x16_t m4b = vdupq_n_u8(0xf);
+
+        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
+        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+
+        // and with 0xf
+        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
+        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
+
+        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
+        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
+
+        // dot product into uint16x8_t
+        const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+        const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+
+        const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+        const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+        const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
+        const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+
+        sum00 += x0->m*y0->m;
+        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
+        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
+        sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+    }
+
+    sumf = QK*sum00 + sum01 + sum10 + sum11;
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
@@ -2637,6 +2748,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
+        // initialize time system (required on Windows)
+        ggml_time_init();
+
         // initialize GELU, SILU and EXP F32 tables
         {
             const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
diff --git a/llama.cpp b/llama.cpp
index 29974696d..b2e90ea4c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -863,7 +863,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
@@ -1451,7 +1451,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             return false;
         }
 
-        std::string word;
+        std::vector<char> word(32);
         vocab.id_to_token.resize(n_vocab);
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -1466,10 +1466,10 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             finp.read ((char *) &score, sizeof(score));
             fout.write((char *) &score, sizeof(score));
 
-            vocab.token_to_id[word] = i;
+            vocab.token_to_id[word.data()] = i;
 
             auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
+            tok_score.tok = word.data();
             tok_score.score = score;
         }
     }
diff --git a/llama.h b/llama.h
index 587d85323..3368de3e0 100644
--- a/llama.h
+++ b/llama.h
@@ -6,7 +6,7 @@
 #include 
 
 #ifdef LLAMA_SHARED
-#    ifdef _WIN32 && !defined __MINGW32__
+#    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define LLAMA_API __declspec(dllexport)
 #        else
diff --git a/llamacpp.dll b/llamacpp.dll
index 21981fa9e..033c05c5b 100644
Binary files a/llamacpp.dll and b/llamacpp.dll differ
diff --git a/llamacpp_blas.dll b/llamacpp_blas.dll
index 6aa72fdbc..ced649012 100644
Binary files a/llamacpp_blas.dll and b/llamacpp_blas.dll differ
diff --git a/main.exe b/main.exe
index 557c7343f..6128bdb0f 100644
Binary files a/main.exe and b/main.exe differ
diff --git a/prompts/reason-act.txt b/prompts/reason-act.txt
new file mode 100644
index 000000000..872016631
--- /dev/null
+++ b/prompts/reason-act.txt
@@ -0,0 +1,18 @@
+You run in a loop of Thought, Action, Observation.
+At the end of the loop either Answer or restate your Thought and Action.
+Use Thought to describe your thoughts about the question you have been asked.
+Use Action to run one of these actions available to you:
+- calculate[python math expression]
+Observation will be the result of running those actions
+
+
+Question: What is 4 * 7 / 3?
+Thought: Do I need to use an action? Yes, I use calculate to do math
+Action: calculate[4 * 7 / 3]
+Observation: 9.3333333333
+Thought: Do I need to use an action? No, I have the result
+Answer: The calculate tool says it is 9.3333333333
+Question: What is the capital of France?
+Thought: Do I need to use an action? No, I know the answer
+Answer: Paris is the capital of France
+Question:
diff --git a/quantize.exe b/quantize.exe
index 9562ac9a9..8033911cf 100644
Binary files a/quantize.exe and b/quantize.exe differ