Merge branch 'master' into llama-v2-70b

Georgi Gerganov 2023-07-23 15:08:48 +03:00 committed by GitHub
commit 3ed2553fcd
9 changed files with 216 additions and 149 deletions

Makefile

@@ -235,13 +235,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_CUBLAS
 ifdef LLAMA_CLBLAST
-    CFLAGS += -DGGML_USE_CLBLAST
-    CXXFLAGS += -DGGML_USE_CLBLAST
+    CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+    CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
     # Mac provides OpenCL as a framework
     ifeq ($(UNAME_S),Darwin)
         LDFLAGS += -lclblast -framework OpenCL
     else
-        LDFLAGS += -lclblast -lOpenCL
+        LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
     endif
     OBJS += ggml-opencl.o

README.md

@@ -242,6 +242,23 @@ In order to build llama.cpp you have three different options.
     zig build -Doptimize=ReleaseFast
     ```
+
+- Using `gmake` (FreeBSD):
+
+    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
+    2. Add your user to the **video** group
+    3. Install compilation dependencies.
+
+        ```bash
+        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
+            opencl clblast openblas
+
+        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        ```
+
+    **Notes:** With these packages you can build llama.cpp with OpenBLAS and
+    CLBlast support, i.e. with OpenCL GPU acceleration, on FreeBSD. Please read
+    the instructions further down in this document on how to enable and use these options.
 ### Metal Build
 Using Metal allows the computation to be executed on the GPU for Apple devices:

examples/common.cpp

@@ -464,92 +464,92 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, " -h, --help show this help message and exit\n");
-    fprintf(stderr, " -i, --interactive run in interactive mode\n");
-    fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
-    fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
-    fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stderr, " (can be specified more than once for multiple prompts).\n");
-    fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, " prompt to start generation with (default: empty)\n");
-    fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
-    fprintf(stderr, " not supported with --interactive or other interactive options\n");
-    fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
-    fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
-    fprintf(stderr, " -f FNAME, --file FNAME\n");
-    fprintf(stderr, " prompt file to start generation.\n");
-    fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stderr, " --mirostat N use Mirostat sampling.\n");
-    fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
-    fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
-    fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, " --perplexity compute perplexity over each ctx window of the prompt\n");
-    fprintf(stderr, " --perplexity-lines compute perplexity over each line of the prompt\n");
-    fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, " -h, --help show this help message and exit\n");
+    fprintf(stdout, " -i, --interactive run in interactive mode\n");
+    fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n");
+    fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+    fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
+    fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stdout, " (can be specified more than once for multiple prompts).\n");
+    fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
+    fprintf(stdout, " prompt to start generation with (default: empty)\n");
+    fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
+    fprintf(stdout, " not supported with --interactive or other interactive options\n");
+    fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
+    fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
+    fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
+    fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
+    fprintf(stdout, " -f FNAME, --file FNAME\n");
+    fprintf(stdout, " prompt file to start generation.\n");
+    fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
+    fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
+    fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
+    fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
+    fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n");
+    fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
-        fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
-    fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
+    fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, " number of layers to store in VRAM\n");
-    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
-    fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
+    fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, " number of layers to store in VRAM\n");
+    fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
+    fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
 #endif
-    fprintf(stderr, " --mtest compute maximum memory usage\n");
-    fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, " --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, " -m FNAME, --model FNAME\n");
-    fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
+    fprintf(stdout, " --mtest compute maximum memory usage\n");
+    fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
+    fprintf(stdout, " --verbose-prompt print prompt before generation\n");
+    fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, " -m FNAME, --model FNAME\n");
+    fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "\n");
 }

 std::string gpt_random_prompt(std::mt19937 & rng) {
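
Aside: the hunk above changes nothing but the output stream of the help text. A minimal C++ sketch of the convention it adopts, with illustrative names that are not taken from this commit: help that the user explicitly asked for goes to stdout so it can be piped or paged, while usage printed as part of an error stays on stderr.

```cpp
#include <cstdio>
#include <cstring>

// Hypothetical example (not code from the commit): route --help to stdout,
// but keep usage-on-error and other diagnostics on stderr.
static void print_usage(FILE * out, const char * prog) {
    fprintf(out, "usage: %s [options]\n", prog);
}

int main(int argc, char ** argv) {
    if (argc > 1 && strcmp(argv[1], "--help") == 0) {
        print_usage(stdout, argv[0]);   // requested output -> stdout, pipeable
        return 0;
    }
    print_usage(stderr, argv[0]);       // unexpected invocation -> stderr
    return 1;
}
```

With the change, something like `./main --help | grep ctx-size` now sees the option list, which previously went to stderr and bypassed the pipe.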

examples/llm.vim

@@ -2,57 +2,22 @@ function! Llm()
   let url = "http://127.0.0.1:8080/completion"

-  " Save the current cursor position
-  let save_cursor = getpos('.')
-
-  silent! %s/\n/\\n/g
-  silent! %s/\t/\\t/g
-  silent! %s/\\n$//
-
   " Get the content of the current buffer
   let buffer_content = join(getline(1, '$'), "\n")

-  " Replace true newlines with "\n"
-  let buffer_content = substitute(buffer_content, '\n', '\\n', 'g')
-
-  " Trim leading/trailing whitespace
-  let buffer_content = substitute(buffer_content, '^\s\+', '', '')
-  let buffer_content = substitute(buffer_content, '\s\+$', '', '')
-
   " Create the JSON payload
-  " can't escape backslash, \n gets replaced as \\n
-  let json_payload = '{"prompt":"' . escape(buffer_content, '"/') . '","temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream":false}'
-
-  let prompt_tmpfile = tempname()
-  let response_tmpfile = tempname()
-  call writefile([json_payload], prompt_tmpfile)
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+  let json_payload.prompt = buffer_content

   " Define the curl command
-  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -o ' . shellescape(response_tmpfile) . ' -d @' . shellescape(prompt_tmpfile) . ' ' . url
-  silent execute '!'.curl_command
-
-  let response = join(readfile(response_tmpfile), '')
-
-  let start_marker = '{"content":"'
-  let end_marker = '","generation_settings'
-
-  let content_start = stridx(response, start_marker) + len(start_marker)
-  let content_end = stridx(response, end_marker, content_start)
+  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
+  let response = system(curl_command, json_encode(json_payload))

   " Extract the content field from the response
-  let content = strpart(response, content_start, content_end - content_start)
+  let content = json_decode(response).content

   " Insert the content at the cursor position
   call setline(line('.'), getline('.') . content)
-
-  " Replace newline "\n" strings with actual newlines in the content
-  silent! %s/\\n/\r/g
-
-  " and tabs
-  silent! %s/\\t/\t/g
-
-  " and quote marks for C sources
-  silent! %s/\\"/\"/g
-
-  " Remove the temporary file
-  call delete(prompt_tmpfile)
-  call delete(response_tmpfile)
 endfunction

 command! Llm call Llm()

flake.nix

@@ -7,7 +7,8 @@
     flake-utils.lib.eachDefaultSystem (system:
       let
         inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
-        osSpecific = with pkgs; [ openmpi ] ++
+        buildInputs = with pkgs; [ openmpi ];
+        osSpecific = with pkgs; buildInputs ++
         (
           if isAarch64 && isDarwin then
             with pkgs.darwin.apple_sdk_11_0.frameworks; [
@@ -29,18 +30,24 @@
         nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
         llama-python =
          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
-      in {
-        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
         postPatch = ''
          substituteInPlace ./ggml-metal.m \
            --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
          substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
         '';
+        postInstall = ''
+          mv $out/bin/main $out/bin/llama
+          mv $out/bin/server $out/bin/llama-server
+        '';
+        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
+      in {
+        packages.default = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          postPatch = postPatch;
           nativeBuildInputs = nativeBuildInputs;
           buildInputs = osSpecific;
-          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
+          cmakeFlags = cmakeFlags
             ++ (if isAarch64 && isDarwin then [
               "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
               "-DLLAMA_METAL=ON"
@@ -48,10 +55,19 @@
               "-DLLAMA_BLAS=ON"
               "-DLLAMA_BLAS_VENDOR=OpenBLAS"
             ]);
-          postInstall = ''
-            mv $out/bin/main $out/bin/llama
-            mv $out/bin/server $out/bin/llama-server
-          '';
+          postInstall = postInstall;
           meta.mainProgram = "llama";
         };
+        packages.opencl = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          postPatch = postPatch;
+          nativeBuildInputs = nativeBuildInputs;
+          buildInputs = with pkgs; buildInputs ++ [ clblast ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_CLBLAST=ON"
+          ];
+          postInstall = postInstall;
+          meta.mainProgram = "llama";
+        };
         apps.llama-server = {

ggml-cuda.cu

@@ -935,12 +935,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;

+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp

     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
@@ -953,14 +959,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif

     }
 #else
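
The K_QUANTS_PER_ITERATION == 2 path above unpacks eight 4-bit quants at a time with two 32-bit masks instead of masking and shifting each byte. A host-side C++ sketch of the trick, meant only as a reading aid and not code from the commit: masking with 0x0f0f0f0f leaves the low nibbles as ready-to-use bytes, while 0xf0f0f0f0 leaves the high nibbles still multiplied by 16, which is why the corresponding partial sums are later scaled by 1.f/16.f through the sc[1] and sc[5] terms.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t packed = 0x9C5A7E31u;  // four bytes, each packing two 4-bit quants

    uint32_t q32[2];
    q32[0] = packed & 0x0f0f0f0f;         // low nibbles, already in 0..15
    q32[1] = packed & 0xf0f0f0f0;         // high nibbles, still shifted left by 4 (value * 16)

    const uint8_t * q4 = (const uint8_t *) q32;
    for (int l = 0; l < 4; ++l) {
        printf("byte %d: low = %2d, high*16 = %3d, high = %2d\n",
               l, q4[l], q4[l + 4], q4[l + 4] >> 4);
    }
    return 0;
}
```

Folding the missing 1/16 into the per-block scales costs two extra multiplies per block, so the scale, rather than the packed data, absorbs the shift.
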
@@ -1521,7 +1554,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;

-    const int bq8_offset = QR4_K * (iqs / QI8_1);
+    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6

     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
@@ -1531,11 +1564,20 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);

-    for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
-
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {

         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
@@ -1543,8 +1585,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
         const int vi = (v >> (4*i)) & 0x0F0F0F0F;

-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }

     return d*sumf_d - dmin*sumf_m;
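
The __dp4a(vi, ui, acc) intrinsic used in the dot product above treats its first two operands as four packed 8-bit lanes, multiplies them lane-wise, and adds the four products to the 32-bit accumulator. A plain C++ reference of the signed variant's semantics, as a reading aid only and not code from the commit:

```cpp
#include <cstdint>

// Host-side reference for the signed __dp4a(a, b, c):
// returns c + sum over the four byte lanes of a[k] * b[k].
int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
    for (int k = 0; k < 4; ++k) {
        const int8_t av = (int8_t)((uint32_t)a >> (8 * k));  // lane k of a
        const int8_t bv = (int8_t)((uint32_t)b >> (8 * k));  // lane k of b
        c += (int32_t)av * (int32_t)bv;
    }
    return c;
}
```

In the kernel, vi carries four 4-bit quants widened to bytes and ui carries four signed q8_1 values, so one __dp4a call replaces four scalar multiply-adds.
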
@@ -2505,7 +2547,9 @@ static size_t g_scratch_offset = 0;
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2528,7 +2572,9 @@ void ggml_init_cublas() {
             g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
+#ifndef GGML_CUDA_FORCE_DMMV
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
         }
         for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;

ggml-metal.m

@@ -42,6 +42,7 @@ struct ggml_metal_context {
     id<MTLComputePipelineState> pipeline_##name

     GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(mul);
     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(scale);
@@ -157,6 +158,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
         GGML_METAL_ADD_KERNEL(add);
+        GGML_METAL_ADD_KERNEL(add_row);
         GGML_METAL_ADD_KERNEL(mul);
         GGML_METAL_ADD_KERNEL(mul_row);
         GGML_METAL_ADD_KERNEL(scale);
@@ -464,10 +466,16 @@ void ggml_metal_graph_compute(
                         encoder = [command_buffer computeCommandEncoder];
                     }

+                    if (ggml_nelements(src1) == ne10) {
+                        // src1 is a row
+                        [encoder setComputePipelineState:ctx->pipeline_add_row];
+                    } else {
-                    [encoder setComputePipelineState:ctx->pipeline_add];
+                        [encoder setComputePipelineState:ctx->pipeline_add];
+                    }
                     [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                     [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];

                     const int64_t n = ggml_nelements(dst);
@@ -919,7 +927,9 @@ void ggml_metal_graph_compute(
                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
+            case GGML_OP_DUP:
             case GGML_OP_CPY:
+            case GGML_OP_CONT:
                 {
                     if (encoder == nil) {
                         encoder = [command_buffer computeCommandEncoder];

ggml-metal.metal

@@ -67,6 +67,17 @@ kernel void kernel_add(
     dst[tpig] = src0[tpig] + src1[tpig];
 }

+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        device const float * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+}
+
 kernel void kernel_mul(
         device const float * src0,
         device const float * src1,
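
For reading the new kernel: src1 is a single row of ne00 floats that is broadcast-added across src0, one output element per thread, with the row index wrapped by tpig % ne00. A CPU reference of the same computation, as a sketch rather than code from the commit:

```cpp
#include <cstdint>
#include <vector>

// Reference for kernel_add_row: add the ne00-element row src1 to every
// ne00-wide slice of src0 (src0.size() is assumed to be a multiple of ne00).
std::vector<float> add_row_ref(const std::vector<float> & src0,
                               const std::vector<float> & src1, // length ne00
                               int64_t ne00) {
    std::vector<float> dst(src0.size());
    for (size_t i = 0; i < src0.size(); ++i) {
        dst[i] = src0[i] + src1[i % (size_t) ne00];
    }
    return dst;
}
```

This mirrors the dispatch added in ggml-metal.m above, which selects pipeline_add_row only when ggml_nelements(src1) == ne10, i.e. when src1 really is one row.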

llama.cpp

@@ -2849,7 +2849,7 @@ struct llama_context * llama_new_context_with_model(
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \