commit 84ab887349
12 changed files with 180 additions and 125 deletions
.github/ISSUE_TEMPLATE/custom.md (10 changes, vendored)

@@ -1,7 +1,7 @@
 ---
-name: Custom issue template
-about: Used to report user-related issues with the software
-title: "[User] I encountered a problem .."
+name: Issue and enhancement template
+about: Used to report issues and request enhancements for llama.cpp
+title: "[User] Insert summary of your issue or enhancement.."
 labels: ''
 assignees: ''

@@ -18,11 +18,11 @@ Please answer the following questions for yourself before submitting an issue.

 # Expected Behavior

-Please provide a detailed written description of what you were trying to do, and what you expected `lamma.cpp` to do.
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.

 # Current Behavior

-Please provide a detailed written description of what `lamma.cpp` did, instead.
+Please provide a detailed written description of what `llama.cpp` did, instead.

 # Environment and Context
.github/workflows/build.yml (2 changes, vendored)

@@ -89,7 +89,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake ..
+        cmake -DLLAMA_AVX2=OFF ..
         cmake --build . --config Release
         ctest --output-on-failure
CMakeLists.txt

@@ -217,6 +217,7 @@ add_library(utils OBJECT

 target_include_directories(utils PUBLIC .)
 target_compile_features(utils PUBLIC cxx_std_11) # don't bump
+target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})

 add_library(ggml OBJECT
             ggml.c
@@ -226,12 +227,13 @@ target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})

-add_library(llama OBJECT
+add_library(llama
             llama.cpp
             llama.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})

 #
 # Executables
README.md (35 changes)

@@ -240,6 +240,40 @@ or

 `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS

+### Perplexity (Measuring model quality)
+
+You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background,
+see https://huggingface.co/docs/transformers/perplexity. In general, lower perplexity is better for LLMs.
+
+#### Measurements
+
+https://github.com/ggerganov/llama.cpp/pull/270 is the unofficial tracking page for now. llama.cpp is measuring very well
+compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
+13B at q4_0 beats the 7B f16 model by a significant amount.
+
+All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512-token context).
+Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
+
+```
+Perplexity - model options
+5.5985 - 13B, q4_0
+5.9565 - 7B, f16
+6.3001 - 7B, q4_1
+6.5949 - 7B, q4_0
+6.5995 - 7B, q4_0, --memory_f16
+```
+
+#### How to run
+
+1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+3. Output:
+```
+Calculating perplexity over 655 chunks
+24.43 seconds per pass - ETA 4.45 hours
+[1]4.5970,[2]5.1807,[3]6.0382,...
+```
+And after 4.45 hours, you will have the final perplexity.
+
 ### Android

 You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
@@ -290,7 +324,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models

 ## Limitations

-- We don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
   there is no benefit compared to the ARM_NEON intrinsics implementation. Of course, it's possible that I simply don't
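A note on the Perplexity section added to the README above: the reported number is the standard perplexity, i.e. the exponentiated mean negative log-likelihood over the evaluated tokens (this is what the `nll` accumulation in `perplexity()` in main.cpp computes):

    PPL = \exp\left( -\frac{1}{N} \sum_{i=1}^{N} \log p(x_i \mid x_{<i}) \right)

This is why lower is better, and why a longer context (more conditioning per token) tends to reduce it.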
ggml.c (174 changes)

@@ -1,3 +1,6 @@
+// Defines CLOCK_MONOTONIC on Linux
+#define _POSIX_C_SOURCE 199309L
+
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -400,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes )
 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)

+// reference implementation for deterministic creation of model files
+static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
+    assert(k % QK == 0);
+    const int nb = k / QK;
+
+    const size_t bs = sizeof(float) + QK/2;
+
+    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));
+
+    uint8_t pp[QK/2];
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int l = 0; l < QK; l++) {
+            const float v = x[i*QK + l];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 3) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        *(float *)pd = d;
+        pd += bs;
+
+        for (int l = 0; l < QK; l += 2) {
+            const float v0 = x[i*QK + l + 0]*id;
+            const float v1 = x[i*QK + l + 1]*id;
+
+            const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
+            const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
+
+            assert(vi0 >= 0 && vi0 < 16);
+            assert(vi1 >= 0 && vi1 < 16);
+
+            pp[l/2] = vi0 | (vi1 << 4);
+        }
+
+        memcpy(pb, pp, sizeof(pp));
+        pb += bs;
+    }
+}
+
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);

+#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;

@@ -410,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));

     uint8_t pp[QK/2];
+#endif

 #if __ARM_NEON
 #if QK == 32
@@ -566,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
 #endif
 #else
     // scalar
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int l = 0; l < QK; l++) {
-            const float v = x[i*QK + l];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 3) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        *(float *)pd = d;
-        pd += bs;
-
-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = x[i*QK + l + 0]*id;
-            const float v1 = x[i*QK + l + 1]*id;
-
-            const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-            const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-            assert(vi0 >= 0 && vi0 < 16);
-            assert(vi1 >= 0 && vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
-        }
-
-        memcpy(pb, pp, sizeof(pp));
-        pb += bs;
-    }
+    quantize_row_q4_0_reference(x, y, k);
 #endif
 }
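As a reading aid for the q4_0 layout used above (one float delta `d` per block of QK values, followed by QK/2 bytes holding two 4-bit codes each), here is a minimal dequantization sketch. It is illustrative only and not part of this commit; it inverts `quantize_row_q4_0_reference` for a single row, assuming the same `bs = sizeof(float) + QK/2` block stride and the includes/`QK` already present in ggml.c.

    // Illustrative sketch (not part of this commit): decode one q4_0 row written
    // by quantize_row_q4_0_reference. A 4-bit code v in [0, 16) decodes to (v - 8)*d.
    static void dequantize_row_q4_0_sketch(const void * x, float * y, int k) {
        assert(k % QK == 0);
        const int nb = k / QK;
        const size_t bs = sizeof(float) + QK/2;

        for (int i = 0; i < nb; i++) {
            const uint8_t * block = (const uint8_t *)x + i*bs;
            const float d = *(const float *)block;       // per-block scale (delta)
            const uint8_t * pb = block + sizeof(float);  // QK/2 packed bytes

            for (int l = 0; l < QK; l += 2) {
                const uint8_t vi = pb[l/2];
                y[i*QK + l + 0] = ((int8_t)(vi & 0xF) - 8)*d; // low nibble: even element
                y[i*QK + l + 1] = ((int8_t)(vi >> 4) - 8)*d;  // high nibble: odd element
            }
        }
    }

Round-tripping a row through the reference quantizer and this decoder reproduces each value up to the per-block step `d`.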
@@ -10702,119 +10723,60 @@ enum ggml_opt_result ggml_opt(

 ////////////////////////////////////////////////////////////////////////////////

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
     const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
     const size_t row_size = nb*bs;

     assert(k % qk == 0);

-    const size_t pp_size = qk / 2;
-    uint8_t * pp = (uint8_t *) alloca(pp_size);
-
     char * pdst = (char *) dst;

     for (int j = 0; j < n; j += k) {
         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

+        quantize_row_q4_0_reference(src + j, pd, k);
+
         for (int i = 0; i < nb; i++) {
-            float amax = 0.0f; // absolute max
-
-            {
-                for (int l = 0; l < qk; l++) {
-                    const float v = src[j + i*qk + l];
-                    amax = MAX(amax, fabsf(v));
-                }
-
-                const float d = amax / ((1 << 3) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                *(float *) pd = d;
-                pd += bs;
-
-                for (int l = 0; l < qk; l += 2) {
-                    const float v0 = (src[j + i*qk + l + 0])*id;
-                    const float v1 = (src[j + i*qk + l + 1])*id;
-
-                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
-                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    hist[vi0]++;
-                    hist[vi1]++;
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb, pp, pp_size);
-                pb += bs;
-            }
+            for (int l = 0; l < qk; l += 2) {
+                const uint8_t vi0 = pb[l/2] & 0xF;
+                const uint8_t vi1 = pb[l/2] >> 4;
+
+                hist[vi0]++;
+                hist[vi1]++;
+            }
+            pb += bs;
         }
     }

     return (n/k)*row_size;
 }

-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
     const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
     const size_t row_size = nb*bs;

     assert(k % qk == 0);

-    const size_t pp_size = qk / 2;
-    uint8_t * pp = (uint8_t *) alloca(pp_size);
-
     char * pdst = (char *) dst;

     for (int j = 0; j < n; j += k) {
         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
-        uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

-        //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
+        quantize_row_q4_1(src + j, pd, k);

         for (int i = 0; i < nb; i++) {
-            float min = FLT_MAX;
-            float max = -FLT_MAX;
-
-            {
-                for (int l = 0; l < qk; l++) {
-                    const float v = src[j + i*qk + l];
-                    if (v < min) min = v;
-                    if (v > max) max = v;
-                }
-
-                const float d = (max - min) / ((1 << 4) - 1);
-                const float id = d ? 1.0f/d : 0.0f;
-
-                *(float *) pd = d;
-                *(float *) pm = min;
-                pd += bs;
-                pm += bs;
-
-                for (int l = 0; l < qk; l += 2) {
-                    const float v0 = (src[j + i*qk + l + 0] - min)*id;
-                    const float v1 = (src[j + i*qk + l + 1] - min)*id;
-
-                    const uint8_t vi0 = round(v0);
-                    const uint8_t vi1 = round(v1);
-
-                    assert(vi0 >= 0 && vi0 < 16);
-                    assert(vi1 >= 0 && vi1 < 16);
-
-                    hist[vi0]++;
-                    hist[vi1]++;
-
-                    pp[l/2] = vi0 | (vi1 << 4);
-                }
-
-                memcpy(pb, pp, pp_size);
-                pb += bs;
-            }
+            for (int l = 0; l < qk; l += 2) {
+                const uint8_t vi0 = pb[l/2] & 0xF;
+                const uint8_t vi1 = pb[l/2] >> 4;
+
+                hist[vi0]++;
+                hist[vi1]++;
+            }
+            pb += bs;
         }
     }
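The sizes these functions return follow directly from the block layouts: a quick standalone check (illustrative only, mirroring the asserts in the new tests/test-quantize.c below) for qk = 32, where a q4_0 block is the 4-byte delta plus 32/2 packed bytes and a q4_1 block carries an extra 4-byte minimum.

    // Illustrative size check (not part of this commit): per-block byte counts
    // for qk = 32, matching the row_size arithmetic in ggml_quantize_q4_0/q4_1.
    #include <assert.h>
    #include <stddef.h>

    int main(void) {
        const size_t qk = 32;
        const size_t q4_0_block = sizeof(float) + qk/2;   // delta + nibbles      -> 20
        const size_t q4_1_block = 2*sizeof(float) + qk/2; // delta + min + nibbles -> 24
        assert(q4_0_block == 20);
        assert(q4_1_block == 24);
        return 0;
    }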
ggml.h (4 changes)

@@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //

-size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);

 //
 // system info
llama.cpp

@@ -9,6 +9,7 @@
 #include <queue>
 #include <regex>
 #include <cassert>
+#include <cstring>

 // determine number of model parts based on the dimension
 static const std::unordered_map<int, int> LLAMA_N_PARTS = {
main.cpp (8 changes)

@@ -85,7 +85,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
-    auto tokens = ::llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

     int count = 0;
     double nll = 0.0;
@@ -254,6 +254,10 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }

+    if (params.interactive_start) {
+        params.interactive = true;
+    }
+
     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
     fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
@@ -297,7 +301,7 @@ int main(int argc, char ** argv) {
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = true;
+        is_interacting = params.interactive_start;
     }

     int input_consumed = 0;
tests/CMakeLists.txt

@@ -1,4 +1,9 @@
-set(TEST_TARGET test-tokenizer-0)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
-add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+function(llama_add_test source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    add_executable(${TEST_TARGET} ${source})
+    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+endfunction()
+
+llama_add_test(test-quantize.c)
+llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
tests/test-quantize.c (new file, 42 additions)

@@ -0,0 +1,42 @@
+#include "ggml.h"
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+
+int main(void) {
+    #define QK 32
+    float src[QK];
+    uint8_t dst[24];
+    int64_t hist[16];
+
+    for (int i = 0; i < QK; i++) {
+        src[i] = (float)(i + 1);
+    }
+
+    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
+    assert(size == 20);
+    float max_result = ((float *)dst)[0];
+    float max_expected = src[31] / ((1 << 3) - 1);
+    assert(max_result == max_expected);
+    for (int i = 0; i < QK; i++) {
+        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
+        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
+        assert(q4_result == q4_expected);
+    }
+
+    size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
+    assert(size == 24);
+    float delta_result = ((float *)dst)[0];
+    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
+    assert(delta_result == delta_expected);
+    float min_result = ((float *)dst)[1];
+    float min_expected = src[0];
+    assert(min_result == min_expected);
+    for (int i = 0; i < QK; i++) {
+        uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
+        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
+        assert(q4_result == q4_expected);
+    }
+
+    return 0;
+}
utils.cpp

@@ -67,6 +67,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.embedding = true;
         } else if (arg == "--interactive-start") {
             params.interactive = true;
+        } else if (arg == "--interactive-first") {
             params.interactive_start = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
@@ -101,9 +102,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
     fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        in interactive mode, poll user input upon seeing PROMPT (can be\n");
+    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
     fprintf(stderr, "                        specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
@@ -151,8 +153,10 @@ std::string gpt_random_prompt(std::mt19937 & rng) {

 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    std::vector<llama_token> res(8096);
+    // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int)add_bos);
     int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    assert(n >= 0);
     res.resize(n);

     return res;
utils.h (4 changes)

@@ -39,8 +39,10 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+
     bool embedding = false; // get only sentence embedding
-    bool interactive_start = false; // reverse prompt immediately
+    bool interactive_start = false; // wait for user input immediately
+
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos
     bool perplexity = false; // compute perplexity over the prompt