diff --git a/Makefile b/Makefile
index 1ea3c4562..e620835ef 100644
--- a/Makefile
+++ b/Makefile
@@ -235,13 +235,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
-	CFLAGS   += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
+
+	CFLAGS   += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+	CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
 	else
-		LDFLAGS += -lclblast -lOpenCL
+		LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
 	endif
 	OBJS    += ggml-opencl.o
 
diff --git a/README.md b/README.md
index f45e4bf08..c9fe6187b 100644
--- a/README.md
+++ b/README.md
@@ -242,6 +242,23 @@ In order to build llama.cpp you have three different options.
     zig build -Doptimize=ReleaseFast
     ```
 
+- Using `gmake` (FreeBSD):
+
+    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
+    2. Add your user to the **video** group
+    3. Install the compilation dependencies:
+
+        ```bash
+        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
+            opencl clblast openblas
+
+        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        ```
+
+    **Notes:** With these packages you can build llama.cpp with OPENBLAS and
+    CLBLAST support, enabling OpenCL GPU acceleration on FreeBSD. Please read
+    the instructions below in this document to enable these options.
+
 ### Metal Build
 
 Using Metal allows the computation to be executed on the GPU for Apple devices:
diff --git a/examples/common.cpp b/examples/common.cpp
index 55e977171..5608ca87f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -464,92 +464,92 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stderr, "                        (can be specified more than once for multiple prompts).\n");
-    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
-    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stderr, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  --cfg-negative-prompt PROMPT \n");
-    fprintf(stderr, "                        negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    fprintf(stderr, "  --perplexity-lines    compute perplexity over each line of the prompt\n");
-    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stdout, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    fprintf(stdout, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stdout, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    fprintf(stdout, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stdout, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stdout, "                        (can be specified more than once for multiple prompts).\n");
+    fprintf(stdout, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stdout, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
+    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    fprintf(stdout, "  -f FNAME, --file FNAME\n");
+    fprintf(stdout, "                        prompt file to start generation.\n");
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stdout, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stdout, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stdout, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stdout, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stdout, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
+    fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stdout, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stdout, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stdout, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
+    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    fprintf(stdout, "  --perplexity-lines    compute perplexity over each line of the prompt\n");
+    fprintf(stdout, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
-    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stdout, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stdout, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
-    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
-    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
-    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
+    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
+    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "\n");
 }
 
 std::string gpt_random_prompt(std::mt19937 & rng) {
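Note on the stderr → stdout switch in `gpt_print_usage`: help text is normal program output, so printing it to stdout lets users pipe and filter it, while diagnostics elsewhere stay on stderr. A minimal C++ sketch of this convention (illustrative, not code from the patch):

```cpp
#include <cstdio>

int main(int /*argc*/, char ** argv) {
    // usage/help goes to stdout so e.g. `./main --help | grep ctx` works
    fprintf(stdout, "usage: %s [options]\n", argv[0]);
    // diagnostics stay on stderr so they remain visible when stdout is redirected
    fprintf(stderr, "%s: warning: example diagnostic\n", argv[0]);
    return 0;
}
```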
diff --git a/examples/llm.vim b/examples/llm.vim
index 16e308c38..efecad0cd 100644
--- a/examples/llm.vim
+++ b/examples/llm.vim
@@ -2,57 +2,22 @@ function! Llm()
 
     let url = "http://127.0.0.1:8080/completion"
 
-    " Save the current cursor position
-    let save_cursor = getpos('.')
-
-    silent! %s/\n/\\n/g
-    silent! %s/\t/\\t/g
-    silent! %s/\\n$//
-
     " Get the content of the current buffer
     let buffer_content = join(getline(1, '$'), "\n")
 
-    " Replace true newlines with "\n"
-    let buffer_content = substitute(buffer_content, '\n', '\\n', 'g')
-
-    " Trim leading/trailing whitespace
-    let buffer_content = substitute(buffer_content, '^\s\+', '', '')
-    let buffer_content = substitute(buffer_content, '\s\+$', '', '')
-
     " Create the JSON payload
-    " can't escape backslash, \n gets replaced as \\n
-    let json_payload = '{"prompt":"' . escape(buffer_content, '"/') . '","temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream":false}'
-
-    let prompt_tmpfile = tempname()
-    let response_tmpfile = tempname()
-    call writefile([json_payload], prompt_tmpfile)
+    let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+    let json_payload.prompt = buffer_content
 
     " Define the curl command
-    let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -o ' . shellescape(response_tmpfile) . ' -d @' . shellescape(prompt_tmpfile) . ' ' . url
-    silent execute '!'.curl_command
-
-    let response = join(readfile(response_tmpfile), '')
-    let start_marker = '{"content":"'
-    let end_marker = '","generation_settings'
-    let content_start = stridx(response, start_marker) + len(start_marker)
-    let content_end = stridx(response, end_marker, content_start)
+    let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
+    let response = system(curl_command, json_encode(json_payload))
 
     " Extract the content field from the response
-    let content = strpart(response, content_start, content_end - content_start)
+    let content = json_decode(response).content
 
     " Insert the content at the cursor position
     call setline(line('.'), getline('.') . content)
-
-    " Replace newline "\n" strings with actual newlines in the content
-    silent! %s/\\n/\r/g
-    " and tabs
-    silent! %s/\\t/\t/g
-    " and quote marks for C sources
-    silent! %s/\\"/\"/g
-
-    " Remove the temporary file
-    call delete(prompt_tmpfile)
-    call delete(response_tmpfile)
 endfunction
 
 command! Llm call Llm()
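The rewritten `llm.vim` drops the temp files by handing the JSON payload to `system()` as stdin, which curl reads via `-d @-`. The same stdin-feeding pattern sketched in C++ with POSIX `popen()` (the endpoint and payload values are illustrative, not part of the patch):

```cpp
#include <cstdio>
#include <string>

int main() {
    // illustrative payload; mirrors the fields built in llm.vim
    const std::string payload =
        R"({"prompt":"Hello","temp":0.72,"top_k":100,"top_p":0.73,"n_predict":10,"stream":false})";

    // `-d @-` makes curl read the request body from stdin, so no temp file is needed
    FILE * pipe = popen("curl -k -s -X POST -H 'Content-Type: application/json' -d @- "
                        "http://127.0.0.1:8080/completion", "w");
    if (!pipe) {
        return 1;
    }
    fwrite(payload.data(), 1, payload.size(), pipe);
    return pclose(pipe);
}
```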
diff --git a/flake.nix b/flake.nix
index 7f148f144..4178e97ff 100644
--- a/flake.nix
+++ b/flake.nix
@@ -7,7 +7,8 @@
     flake-utils.lib.eachDefaultSystem (system:
       let
         inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
-        osSpecific = with pkgs; [ openmpi ] ++
+        buildInputs = with pkgs; [ openmpi ];
+        osSpecific = with pkgs; buildInputs ++
         (
           if isAarch64 && isDarwin then
             with pkgs.darwin.apple_sdk_11_0.frameworks;
             [
@@ -29,18 +30,24 @@
         nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
         llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
+        postPatch = ''
+          substituteInPlace ./ggml-metal.m \
+            --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+          substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
+        '';
+        postInstall = ''
+          mv $out/bin/main $out/bin/llama
+          mv $out/bin/server $out/bin/llama-server
+        '';
+        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
       in {
         packages.default = pkgs.stdenv.mkDerivation {
           name = "llama.cpp";
           src = ./.;
-          postPatch = ''
-            substituteInPlace ./ggml-metal.m \
-              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-            substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
-          '';
+          postPatch = postPatch;
           nativeBuildInputs = nativeBuildInputs;
           buildInputs = osSpecific;
-          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
+          cmakeFlags = cmakeFlags
             ++ (if isAarch64 && isDarwin then [
               "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
               "-DLLAMA_METAL=ON"
@@ -48,10 +55,19 @@
               "-DLLAMA_BLAS=ON"
               "-DLLAMA_BLAS_VENDOR=OpenBLAS"
             ]);
-          postInstall = ''
-            mv $out/bin/main $out/bin/llama
-            mv $out/bin/server $out/bin/llama-server
-          '';
+          postInstall = postInstall;
+          meta.mainProgram = "llama";
+        };
+        packages.opencl = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          postPatch = postPatch;
+          nativeBuildInputs = nativeBuildInputs;
+          buildInputs = with pkgs; buildInputs ++ [ clblast ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_CLBLAST=ON"
+          ];
+          postInstall = postInstall;
           meta.mainProgram = "llama";
         };
         apps.llama-server = {
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b77135233..720447440 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -935,12 +935,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp
 
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float   * y1 = yy + i*QK_K + y_offset;
         const float   * y2 = y1 + 128;
 
@@ -953,14 +959,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
+
     }
 
 #else
@@ -1521,7 +1554,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
-    const int bq8_offset = QR4_K * (iqs / QI8_1);
+    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6
 
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
@@ -1531,11 +1564,20 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
 
-    for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
 
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+    for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
 
@@ -1543,8 +1585,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
         const int vi = (v >> (4*i)) & 0x0F0F0F0F;
 
-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }
 
     return d*sumf_d - dmin*sumf_m;
@@ -2505,7 +2547,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2528,7 +2572,9 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 
+#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
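The q4_K changes above replace per-byte nibble extraction with wide loads and byte masks: masking a 32-bit word with `0x0f0f0f0f` yields four low nibbles at once, and `0xf0f0f0f0` yields the four high nibbles still shifted left by 4, i.e. scaled by 16, which is why the dot product folds a `1.f/16.f` factor into the `sc[1]` and `sc[5]` terms instead of shifting. A host-side C++ sketch of the trick (assumes a little-endian target, as on NVIDIA GPUs; the values are hypothetical):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    // four packed bytes, each holding two 4-bit quants (hypothetical values)
    const uint8_t qs[4] = {0x21, 0x43, 0x65, 0x87};

    uint32_t v;
    std::memcpy(&v, qs, sizeof(v)); // one 32-bit load instead of four byte loads

    uint32_t q32[2];
    q32[0] = v & 0x0f0f0f0f;        // low nibbles, one per byte: 1, 3, 5, 7
    q32[1] = v & 0xf0f0f0f0;        // high nibbles left in place, i.e. scaled by 16

    const uint8_t * q4 = (const uint8_t *)q32;
    assert(q4[0] == 0x1 && q4[1] == 0x3);           // low nibbles come out directly
    assert(q4[4] == 0x2 * 16 && q4[5] == 0x4 * 16); // high nibbles carry the extra *16
    return 0;
}
```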
diff --git a/ggml-metal.m b/ggml-metal.m
index 2810fa2a8..78a3b65f1 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -42,6 +42,7 @@ struct ggml_metal_context {
         id<MTLComputePipelineState> pipeline_##name
 
     GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(mul);
     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(scale);
@@ -157,6 +158,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
 
         GGML_METAL_ADD_KERNEL(add);
+        GGML_METAL_ADD_KERNEL(add_row);
         GGML_METAL_ADD_KERNEL(mul);
         GGML_METAL_ADD_KERNEL(mul_row);
         GGML_METAL_ADD_KERNEL(scale);
@@ -464,10 +466,16 @@ void ggml_metal_graph_compute(
                             encoder = [command_buffer computeCommandEncoder];
                         }
 
-                        [encoder setComputePipelineState:ctx->pipeline_add];
+                        if (ggml_nelements(src1) == ne10) {
+                            // src1 is a row
+                            [encoder setComputePipelineState:ctx->pipeline_add_row];
+                        } else {
+                            [encoder setComputePipelineState:ctx->pipeline_add];
+                        }
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                         [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
 
                         const int64_t n = ggml_nelements(dst);
 
@@ -919,7 +927,9 @@ void ggml_metal_graph_compute(
 
                         [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                     } break;
+                case GGML_OP_DUP:
                 case GGML_OP_CPY:
+                case GGML_OP_CONT:
                     {
                         if (encoder == nil) {
                             encoder = [command_buffer computeCommandEncoder];
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 5a9a6d842..987376d56 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -67,6 +67,17 @@ kernel void kernel_add(
     dst[tpig] = src0[tpig] + src1[tpig];
 }
 
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        device const float * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+}
+
 kernel void kernel_mul(
         device const float * src0,
         device const float * src1,
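`kernel_add_row` broadcasts a single row of length `ne00` across `src0` using a modulo on the flat thread index. Reference semantics in plain C++ (a sketch only, not ggml API; the function name is illustrative):

```cpp
#include <cstdint>

// dst and src0 hold n elements laid out as rows of length ne00;
// src1 holds one row of ne00 elements that is added to every row.
void add_row_ref(const float * src0, const float * src1, float * dst,
                 int64_t n, int64_t ne00) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = src0[i] + src1[i % ne00]; // same indexing as kernel_add_row's tpig % ne00
    }
}
```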
diff --git a/llama.cpp b/llama.cpp
index 896821271..5a8453bec 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2849,7 +2849,7 @@ struct llama_context * llama_new_context_with_model(
 
             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-            printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
     if (!(result)) { \