diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile index cd575efa0..2e629f8ce 100644 --- a/.devops/main.Dockerfile +++ b/.devops/main.Dockerfile @@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime COPY --from=build /app/main /main -ENTRYPOINT [ "/main" ] \ No newline at end of file +ENTRYPOINT [ "/main" ] diff --git a/.dockerignore b/.dockerignore index 952990f26..462fac23a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,4 +21,4 @@ models/* arm_neon.h compile_commands.json -Dockerfile \ No newline at end of file +Dockerfile diff --git a/.ecrc b/.ecrc new file mode 100644 index 000000000..b682057dd --- /dev/null +++ b/.ecrc @@ -0,0 +1,5 @@ +{ + "Disable": { + "IndentSize": true + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..df8aaf504 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +# https://EditorConfig.org + +# Top-most EditorConfig file +root = true + +# Unix-style newlines with a newline ending every file, utf-8 charset +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 +indent_style = space +indent_size = 4 + +[Makefile] +indent_style = tab diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md index 0d508802d..8fd955356 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and # Current Behavior -Please provide a detailed written description of what `llama.cpp` did, instead. +Please provide a detailed written description of what `llama.cpp` did, instead. -# Environment and Context +# Environment and Context Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. @@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin. llama_model_load: .......................................................................................... done llama_model_load: model size = 4869.09 MB / num tensors = 723 -system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | +system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | main: prompt: 'Please close your issue when it has been answered.' main: number of tokens in prompt = 11 @@ -166,14 +166,14 @@ main: total time = 246406.42 ms Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.': - 3636882.89 msec task-clock # 14.677 CPUs utilized - 13509 context-switches # 3.714 /sec - 2436 cpu-migrations # 0.670 /sec - 10476679 page-faults # 2.881 K/sec + 3636882.89 msec task-clock # 14.677 CPUs utilized + 13509 context-switches # 3.714 /sec + 2436 cpu-migrations # 0.670 /sec + 10476679 page-faults # 2.881 K/sec 13133115082869 cycles # 3.611 GHz (16.77%) 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%) 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%) - 23479217109614 instructions # 1.79 insn per cycle + 23479217109614 instructions # 1.79 insn per cycle # 0.44 stalled cycles per insn (16.76%) 2353072268027 branches # 647.002 M/sec (16.77%) 1998682780 branch-misses # 0.08% of all branches (16.76%) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f70821de2..28402c933 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -60,4 +60,4 @@ jobs: push: ${{ github.event_name == 'push' }} platforms: linux/amd64,linux/arm64 tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" - file: ${{ matrix.config.dockerfile }} \ No newline at end of file + file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml new file mode 100644 index 000000000..b4e535acf --- /dev/null +++ b/.github/workflows/editorconfig.yml @@ -0,0 +1,17 @@ +name: EditorConfig Checker + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + editorconfig: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: editorconfig-checker/action-editorconfig-checker@main + - run: editorconfig-checker diff --git a/README.md b/README.md index 5ef4318eb..da05ef87a 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ New features will probably be added mostly through community contributions. - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) +- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) **Bindings:** @@ -242,7 +243,7 @@ There 26 letters in the English Alphabet The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis > List 5 words that start with "ca". cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. -> +> ``` ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) @@ -253,17 +254,17 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py): ```bash - python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model + python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin ``` - + - You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models - The original model is saved in the same folder with a suffix `.orig` ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.** -- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. +- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. - The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: @@ -283,7 +284,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202 - GPT-3.5 / InstructGPT / ChatGPT: - [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - + ### Perplexity (Measuring model quality) You can use the `perplexity` example to measure perplexity over the given prompt. For more background, diff --git a/examples/Miku.sh b/examples/Miku.sh index 352478a15..c4cbf80f2 100755 --- a/examples/Miku.sh +++ b/examples/Miku.sh @@ -19,15 +19,15 @@ GEN_OPTIONS=(--batch_size 1024 --top_p 0.5) if [ -n "$N_THREAD" ]; then - GEN_OPTIONS+=(--threads "$N_THREAD") + GEN_OPTIONS+=(--threads "$N_THREAD") fi ./main "${GEN_OPTIONS[@]}" \ - --model "$MODEL" \ - --n_predict "$N_PREDICTS" \ - --color --interactive \ - --reverse-prompt "${USER_NAME}:" \ - --prompt " + --model "$MODEL" \ + --n_predict "$N_PREDICTS" \ + --color --interactive \ + --reverse-prompt "${USER_NAME}:" \ + --prompt " This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer. ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. diff --git a/examples/common.cpp b/examples/common.cpp index 859253b58..cb7b583d5 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -25,9 +25,9 @@ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHand extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID); extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID); -extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, - const wchar_t * lpWideCharStr, int cchWideChar, - char * lpMultiByteStr, int cbMultiByte, +extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, + const wchar_t * lpWideCharStr, int cchWideChar, + char * lpMultiByteStr, int cbMultiByte, const char * lpDefaultChar, bool * lpUsedDefaultChar); #define CP_UTF8 65001 #endif @@ -448,10 +448,10 @@ void win32_console_init(bool enable_color) { // Convert a wide Unicode string to an UTF8 string void win32_utf8_encode(const std::wstring & wstr, std::string & str) { - int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); - std::string strTo(size_needed, 0); - WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); - str = strTo; + int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); + std::string strTo(size_needed, 0); + WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); + str = strTo; } #endif diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 21d8be65f..fe8f5dcc6 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -1,3 +1,3 @@ -# embedding - -TODO +# embedding + +TODO diff --git a/examples/main/README.md b/examples/main/README.md index 4701aa558..f09e7ba97 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -1,3 +1,3 @@ -# main - -TODO +# main + +TODO diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c4090d99f..9252ef3bb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,3 +1,8 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include "common.h" #include "llama.h" @@ -164,7 +169,7 @@ int main(int argc, char ** argv) { const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos); // enable interactive mode if reverse prompt or interactive start is specified - if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) { + if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) { params.interactive = true; } diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index a932275c2..eacfb17c6 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -1,3 +1,3 @@ -# perplexity - -TODO +# perplexity + +TODO diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 680757c6b..5c9e2ad94 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -5,15 +5,15 @@ #include // usage: -// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type +// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // int main(int argc, char ** argv) { ggml_time_init(); if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - fprintf(stderr, " type = 2 - q4_0\n"); - fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); + fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); return 1; } @@ -27,7 +27,7 @@ int main(int argc, char ** argv) { const std::string fname_inp = argv[1]; const std::string fname_out = argv[2]; - const int itype = atoi(argv[3]); + const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]); const int64_t t_main_start_us = ggml_time_us(); @@ -37,7 +37,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) { + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) { fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); return 1; } diff --git a/ggml.c b/ggml.c index 326b8e842..a26b4853f 100644 --- a/ggml.c +++ b/ggml.c @@ -1,4 +1,4 @@ -// Defines CLOCK_MONOTONIC and asprintf on Linux +// Defines CLOCK_MONOTONIC on Linux #define _GNU_SOURCE #include "ggml.h" @@ -26,14 +26,9 @@ #define static_assert(cond, msg) struct global_scope_noop_trick #endif -#if defined _MSC_VER || defined(__MINGW32__) +#if defined(_WIN32) -#if !defined(__MINGW32__) -#include -#else -// ref: https://github.com/ggerganov/whisper.cpp/issues/168 #include -#endif typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; @@ -55,6 +50,7 @@ typedef HANDLE pthread_t; typedef DWORD thread_ret_t; static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { + (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) { @@ -66,6 +62,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void } static int pthread_join(pthread_t thread, void* unused) { + (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } @@ -231,12 +228,12 @@ static inline float fp32_from_bits(uint32_t w) { } static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; } static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { @@ -599,10 +596,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); - // absolute max - const float amax = MAX( - MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)), - MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3))); + const float amax = vmaxvq_f32(amaxv[0]); const float d = amax / ((1 << 3) - 1); const float id = d ? 1.0f/d : 0.0f; @@ -924,7 +918,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int float32x4_t minv[8]; float32x4_t maxv[8]; - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l); for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); @@ -947,7 +941,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int for (int l = 0; l < 8; l++) { const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id); - const int32x4_t vi = vcvtq_s32_f32(v); + const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest + const int32x4_t vi = vcvtq_s32_f32(vf); y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); @@ -1886,7 +1881,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); #endif #else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls)); const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs)); @@ -1951,7 +1946,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - /* Prepare the constants we will need during execution */ + /* Prepare the constants we will need during execution */ const __m256i lowMask = _mm256_set1_epi8( 0xF ); const __m256i offset_8 = _mm256_set1_epi16( 8 ); @@ -1961,61 +1956,59 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest // Main loop for (int i = 0; i < nb; i+=UNROLL_COUNT) { - - // This loop will be unrolled by the compiler + // This loop will be unrolled by the compiler for (int u=0;u we now have a vector of 8 int_32t */ - __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); + /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */ + __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q ); - /* Convert to vectore of 8 int32_t to 8 floats */ - __m256 q = _mm256_cvtepi32_ps( xy_q ); + /* Convert to vectore of 8 int32_t to 8 floats */ + __m256 q = _mm256_cvtepi32_ps( xy_q ); - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( scale, q, acc ); + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( scale, q, acc ); } - - } + } // Return horizontal sum of the acc vector __m128 res = _mm256_extractf128_ps( acc, 1 ); @@ -2076,18 +2069,18 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest float sum1 = 0.0f; for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &px[i + 0]; - const block_q4_0 * restrict y0 = &py[i + 0]; - const block_q4_0 * restrict x1 = &px[i + 1]; - const block_q4_0 * restrict y1 = &py[i + 1]; + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict y0 = &y[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q4_0 * restrict y1 = &y[i + 1]; const v128_t m4b = wasm_u8x16_splat(0xf); const v128_t s8b = wasm_i8x16_splat(0x8); - const v128_t v0_0 = wasm_v128_load(x0.qs); - const v128_t v0_1 = wasm_v128_load(y0.qs); - const v128_t v1_0 = wasm_v128_load(x1.qs); - const v128_t v1_1 = wasm_v128_load(y1.qs); + const v128_t v0_0 = wasm_v128_load(x0->qs); + const v128_t v0_1 = wasm_v128_load(y0->qs); + const v128_t v1_0 = wasm_v128_load(x1->qs); + const v128_t v1_1 = wasm_v128_load(y1->qs); // 4-bit -> 8-bit const v128_t v0_0l = wasm_v128_and(v0_0, m4b); @@ -2567,29 +2560,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - QK, - QK, - 1, - 1, - 1, - 1, - 1, + [GGML_TYPE_F32] = 1, + [GGML_TYPE_F16] = 1, + [GGML_TYPE_Q4_0] = QK, + [GGML_TYPE_Q4_1] = QK, + [GGML_TYPE_I8] = 1, + [GGML_TYPE_I16] = 1, + [GGML_TYPE_I32] = 1, }; - -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - sizeof(block_q4_0), - sizeof(block_q4_1), - sizeof(int8_t ), - sizeof(int16_t), - sizeof(int32_t), - sizeof(ggml_fp16_t), - sizeof(float ), + [GGML_TYPE_F32] = sizeof(float), + [GGML_TYPE_F16] = sizeof(ggml_fp16_t), + [GGML_TYPE_Q4_0] = sizeof(block_q4_0), + [GGML_TYPE_Q4_1] = sizeof(block_q4_1), + [GGML_TYPE_I8] = sizeof(int8_t), + [GGML_TYPE_I16] = sizeof(int16_t), + [GGML_TYPE_I32] = sizeof(int32_t), }; - -// don't forget to update the array above when adding new types -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -2618,6 +2608,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "SCALE", "CPY", + "CONT", "RESHAPE", "VIEW", "PERMUTE", @@ -2633,7 +2624,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "FLASH_FF", }; -static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); +static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2662,6 +2653,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "x*v", "x-\\>y", + "cont(x)", "reshape(x)", "view(x)", "permute(x)", @@ -2677,7 +2669,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_ff(x)", }; -static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35"); +static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4310,6 +4302,41 @@ struct ggml_tensor * ggml_cpy_inplace( return ggml_cpy_impl(ctx, a, b, true); } +// ggml_cont + +struct ggml_tensor * ggml_cont_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_CONT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, true); +} + // ggml_reshape struct ggml_tensor * ggml_reshape( @@ -4852,6 +4879,85 @@ static void ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + if (ggml_is_contiguous(dst)) { + if (src0->nb[0] == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + id++; + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + return; + } + // dst counters int64_t i10 = 0; int64_t i11 = 0; @@ -4946,6 +5052,105 @@ static void ggml_compute_forward_dup_f32( return; } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (ggml_is_contiguous(dst)) { + // TODO: simplify + if (src0->nb[0] == sizeof(float)) { + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + const size_t rs = ne00*nb00; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_ptr = (char *) dst->data + id*rs; + + memcpy(dst_ptr, src0_ptr, rs); + + id++; + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + id++; + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + + return; + } + // dst counters int64_t i10 = 0; int64_t i11 = 0; @@ -5066,14 +5271,18 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { - const int j0 = (n/nth)*ith; - const int j1 = ith == nth - 1 ? n : (n/nth)*(ith + 1); - - for (int j = j0; j < j1; j++) { + for (int j = ith; j < n; j += nth) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd( + (float *) ((char *) src0->data + j*nb01), 1, + (float *) ((char *) src1->data + j*nb11), 1, + (float *) ((char *) dst->data + j*nb1), 1, nc); +#else ggml_vec_add_f32(nc, (float *) ((char *) dst->data + j*nb1), (float *) ((char *) src0->data + j*nb01), (float *) ((char *) src1->data + j*nb11)); +#endif } } else { // src1 is not contiguous @@ -6821,6 +7030,15 @@ static void ggml_compute_forward_cpy( ggml_compute_forward_dup(params, src0, dst); } +// ggml_compute_forward_cont + +static void ggml_compute_forward_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); +} + // ggml_compute_forward_reshape static void ggml_compute_forward_reshape( @@ -8651,6 +8869,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_cpy(params, tensor->src0, tensor); } break; + case GGML_OP_CONT: + { + ggml_compute_forward_cont(params, tensor->src0, tensor); + } break; case GGML_OP_RESHAPE: { ggml_compute_forward_reshape(params, tensor->src0, tensor); @@ -8895,8 +9117,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_impl(ctx, src1->grad, - // TODO: fix transpose, the node will break the graph connections - ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad), + ggml_mul_mat(ctx, + ggml_cont(ctx, ggml_transpose(ctx, src0)), + tensor->grad), inplace); } } break; @@ -8908,6 +9131,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONT: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_RESHAPE: { GGML_ASSERT(false); // TODO: not implemented @@ -9362,6 +9589,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) node->n_tasks = n_threads; } break; case GGML_OP_CPY: + case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: diff --git a/ggml.h b/ggml.h index af16c647c..7d8b7a182 100644 --- a/ggml.h +++ b/ggml.h @@ -198,13 +198,14 @@ struct ggml_object; struct ggml_context; enum ggml_type { - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, + // explicitly numbered values are used in llama.cpp files + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, - GGML_TYPE_F16, - GGML_TYPE_F32, GGML_TYPE_COUNT, }; @@ -236,6 +237,7 @@ enum ggml_op { GGML_OP_SCALE, GGML_OP_CPY, + GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_VIEW, GGML_OP_PERMUTE, @@ -525,6 +527,11 @@ struct ggml_tensor * ggml_cpy( struct ggml_tensor * a, struct ggml_tensor * b); +// make contiguous +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view struct ggml_tensor * ggml_reshape( diff --git a/llama.cpp b/llama.cpp index 203a1adc0..653558be9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,3 +1,8 @@ +// Defines fileno on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include "llama_util.h" #include "llama.h" #include "llama_internal.h" @@ -77,7 +82,7 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; - uint32_t f16 = 1; + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { return memcmp(this, &other, sizeof(llama_hparams)); @@ -427,7 +432,7 @@ struct llama_file_loader { hparams.n_head = file.read_u32(); hparams.n_layer = file.read_u32(); hparams.n_rot = file.read_u32(); - hparams.f16 = file.read_u32(); + hparams.ftype = (enum llama_ftype) file.read_u32(); } void read_vocab() { vocab.id_to_token.resize(hparams.n_vocab); @@ -453,20 +458,21 @@ struct llama_file_loader { llama_load_tensor_shard shard; uint32_t n_dims = file.read_u32(); uint32_t name_len = file.read_u32(); - uint32_t ftype = file.read_u32(); + shard.type = (enum ggml_type) file.read_u32(); shard.ne.resize(n_dims); file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); std::string name = file.read_string(name_len); if (n_dims < 1 || n_dims > 2) { throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); } - switch (ftype) { - case 0: shard.type = GGML_TYPE_F32; break; - case 1: shard.type = GGML_TYPE_F16; break; - case 2: shard.type = GGML_TYPE_Q4_0; break; - case 3: shard.type = GGML_TYPE_Q4_1; break; + switch (shard.type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + break; default: { - throw format("unrecognized ftype %u\n", ftype); + throw format("unrecognized tensor type %u\n", shard.type); } } @@ -497,18 +503,18 @@ struct llama_file_loader { struct llama_file_saver { llama_file file; llama_file_loader * any_file_loader; - llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16) + llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) : file(fname, "wb"), any_file_loader(any_file_loader) { fprintf(stderr, "llama.cpp: saving model to %s\n", fname); write_magic(); - write_hparams(new_f16); + write_hparams(new_ftype); write_vocab(); } void write_magic() { file.write_u32('ggjt'); // magic file.write_u32(1); // version } - void write_hparams(uint32_t new_f16) { + void write_hparams(enum llama_ftype new_ftype) { const llama_hparams & hparams = any_file_loader->hparams; file.write_u32(hparams.n_vocab); file.write_u32(hparams.n_embd); @@ -516,7 +522,7 @@ struct llama_file_saver { file.write_u32(hparams.n_head); file.write_u32(hparams.n_layer); file.write_u32(hparams.n_rot); - file.write_u32(new_f16); + file.write_u32(new_ftype); } void write_vocab() { if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { @@ -531,17 +537,17 @@ struct llama_file_saver { } } void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { - uint32_t ftype; switch (new_type) { - case GGML_TYPE_F32: ftype = 0; break; - case GGML_TYPE_F16: ftype = 1; break; - case GGML_TYPE_Q4_0: ftype = 2; break; - case GGML_TYPE_Q4_1: ftype = 3; break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + break; default: LLAMA_ASSERT(false); } file.write_u32((uint32_t) tensor.ne.size()); file.write_u32((uint32_t) tensor.name.size()); - file.write_u32(ftype); + file.write_u32(new_type); file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); file.write_raw(tensor.name.data(), tensor.name.size()); file.seek(-file.tell() & 31, SEEK_CUR); @@ -815,6 +821,16 @@ static const char *llama_file_version_name(llama_file_version version) { } } +static const char *llama_ftype_name(enum llama_ftype ftype) { + switch (ftype) { + case LLAMA_FTYPE_ALL_F32: return "all F32"; + case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + default: LLAMA_ASSERT(false); + } +} + static const char *llama_model_type_name(e_model type) { switch (type) { case MODEL_7B: return "7B"; @@ -867,7 +883,7 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); @@ -1539,17 +1555,17 @@ static llama_vocab::id llama_sample_top_p_top_k( // quantization // -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { +static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) { ggml_type quantized_type; - switch (itype) { - case 2: quantized_type = GGML_TYPE_Q4_0; break; - case 3: quantized_type = GGML_TYPE_Q4_1; break; - default: throw format("invalid quantization type %d\n", itype); + switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; + case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; + default: throw format("invalid output file type %d\n", ftype); }; std::unique_ptr model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false, /*vocab_only*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype); + llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); size_t total_size_org = 0; size_t total_size_new = 0; @@ -1740,9 +1756,9 @@ void llama_free(struct llama_context * ctx) { int llama_model_quantize( const char * fname_inp, const char * fname_out, - int itype) { + enum llama_ftype ftype) { try { - llama_model_quantize_internal(fname_inp, fname_out, itype); + llama_model_quantize_internal(fname_inp, fname_out, ftype); return 0; } catch (const std::string & err) { fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); diff --git a/llama.h b/llama.h index 42c364c6b..8a0d50fb8 100644 --- a/llama.h +++ b/llama.h @@ -65,6 +65,14 @@ extern "C" { void * progress_callback_user_data; }; + // model file types + enum llama_ftype { + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + }; + LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API bool llama_mmap_supported(); @@ -85,7 +93,7 @@ extern "C" { LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - int itype); + enum llama_ftype ftype); // Returns the KV cache that will contain the context for the // ongoing prediction with the model. diff --git a/llama_util.h b/llama_util.h index d68f49bd2..653bf7138 100755 --- a/llama_util.h +++ b/llama_util.h @@ -26,7 +26,9 @@ #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN - #define NOMINMAX + #ifndef NOMINMAX + #define NOMINMAX + #endif #include #include #include // for _fseeki64 @@ -209,6 +211,7 @@ struct llama_mmap { throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); } + #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 // Advise the kernel to preload the mapped memory WIN32_MEMORY_RANGE_ENTRY range; range.VirtualAddress = addr; @@ -217,6 +220,9 @@ struct llama_mmap { fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", llama_format_win_err(GetLastError()).c_str()); } + #else + #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") + #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 } ~llama_mmap() { @@ -338,8 +344,8 @@ struct llama_mlock { // Hopefully a megabyte is enough overhead: size_t increment = size + 1048576; // The minimum must be <= the maximum, so we need to increase both: - min_ws_size += size; - max_ws_size += size; + min_ws_size += increment; + max_ws_size += increment; if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", llama_format_win_err(GetLastError()).c_str());