Merge branch 'master' into llama-v2-70b
commit 3ed2553fcd
9 changed files with 216 additions and 149 deletions
Makefile (8 changed lines)

@@ -235,13 +235,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
-    CFLAGS   += -DGGML_USE_CLBLAST
-    CXXFLAGS += -DGGML_USE_CLBLAST
+    CFLAGS   += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+    CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
 
     # Mac provides OpenCL as a framework
     ifeq ($(UNAME_S),Darwin)
         LDFLAGS += -lclblast -framework OpenCL
     else
-        LDFLAGS += -lclblast -lOpenCL
+        LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
     endif
     OBJS    += ggml-opencl.o
README.md (17 changed lines)

@@ -242,6 +242,23 @@ In order to build llama.cpp you have three different options.
     zig build -Doptimize=ReleaseFast
     ```
 
+- Using `gmake` (FreeBSD):
+
+  1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
+  2. Add your user to the **video** group
+  3. Install compilation dependencies.
+
+     ```bash
+     sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
+         opencl clblast openblas
+
+     gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+     ```
+
+  **Notes:** With these packages you can build llama.cpp with OPENBLAS and
+  CLBLAST support, to use OpenCL GPU acceleration on FreeBSD. Please read
+  the instructions below for how to enable and use these options.
+
 ### Metal Build
 
 Using Metal allows the computation to be executed on the GPU for Apple devices:
examples/common.cpp

@@ -464,92 +464,92 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
+    fprintf(stdout, "\n");
-    fprintf(stderr, "options:\n");
+    fprintf(stdout, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stdout, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    fprintf(stdout, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stdout, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    fprintf(stdout, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stdout, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stdout, "                        halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stderr, "                        (can be specified more than once for multiple prompts).\n");
+    fprintf(stdout, "                        (can be specified more than once for multiple prompts).\n");
-    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stdout, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stderr, "  -e                    process prompt escape sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stdout, "  -e                    process prompt escape sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
-    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
+    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stdout, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        prompt file to start generation.\n");
+    fprintf(stdout, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stderr, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stdout, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stdout, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stdout, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stdout, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stdout, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
+    fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stdout, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stdout, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stdout, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  --cfg-negative-prompt PROMPT\n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
-    fprintf(stderr, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
+    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    fprintf(stderr, "  --perplexity-lines    compute perplexity over each line of the prompt\n");
+    fprintf(stdout, "  --perplexity-lines    compute perplexity over each line of the prompt\n");
-    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
-    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stdout, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    fprintf(stdout, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #endif
-    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
+    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
-    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
+    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
+    fprintf(stdout, "\n");
 }
 
 std::string gpt_random_prompt(std::mt19937 & rng) {
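The block above is a purely mechanical change: every usage line moves from stderr to stdout so that --help output can be captured by pipes and redirection, while real diagnostics stay on stderr. A minimal C++ sketch of that convention (an illustration, not code from this commit):

```cpp
// Sketch: send requested output (--help) to stdout, diagnostics to stderr.
#include <cstdio>
#include <cstring>

static void print_usage(const char * prog) {
    fprintf(stdout, "usage: %s [options]\n", prog); // pipeable output
}

int main(int argc, char ** argv) {
    if (argc > 1 && strcmp(argv[1], "--help") == 0) {
        print_usage(argv[0]);
        return 0;
    }
    fprintf(stderr, "error: unknown arguments\n");  // errors remain on stderr
    return 1;
}
```

With the split, `./main --help | grep gqa` now finds the new -gqa flag; before the change the pipe captured nothing because usage text went to stderr.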
examples/llm.vim

@@ -2,57 +2,22 @@ function! Llm()
 
   let url = "http://127.0.0.1:8080/completion"
 
-  " Save the current cursor position
-  let save_cursor = getpos('.')
-
-  silent! %s/\n/\\n/g
-  silent! %s/\t/\\t/g
-  silent! %s/\\n$//
-
   " Get the content of the current buffer
   let buffer_content = join(getline(1, '$'), "\n")
 
-  " Replace true newlines with "\n"
-  let buffer_content = substitute(buffer_content, '\n', '\\n', 'g')
-
-  " Trim leading/trailing whitespace
-  let buffer_content = substitute(buffer_content, '^\s\+', '', '')
-  let buffer_content = substitute(buffer_content, '\s\+$', '', '')
-
   " Create the JSON payload
-  " can't escape backslash, \n gets replaced as \\n
-  let json_payload = '{"prompt":"' . escape(buffer_content, '"/') . '","temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream":false}'
-
-  let prompt_tmpfile = tempname()
-  let response_tmpfile = tempname()
-  call writefile([json_payload], prompt_tmpfile)
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+  let json_payload.prompt = buffer_content
 
   " Define the curl command
-  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -o ' . shellescape(response_tmpfile) . ' -d @' . shellescape(prompt_tmpfile) . ' ' . url
-  silent execute '!'.curl_command
-
-  let response = join(readfile(response_tmpfile), '')
-  let start_marker = '{"content":"'
-  let end_marker = '","generation_settings'
-  let content_start = stridx(response, start_marker) + len(start_marker)
-  let content_end = stridx(response, end_marker, content_start)
+  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
+  let response = system(curl_command, json_encode(json_payload))
 
   " Extract the content field from the response
-  let content = strpart(response, content_start, content_end - content_start)
+  let content = json_decode(response).content
 
   " Insert the content at the cursor position
   call setline(line('.'), getline('.') . content)
-
-  " Replace newline "\n" strings with actual newlines in the content
-  silent! %s/\\n/\r/g
-  " and tabs
-  silent! %s/\\t/\t/g
-  " and quote marks for C sources
-  silent! %s/\\"/\"/g
-
-  " Remove the temporary file
-  call delete(prompt_tmpfile)
-  call delete(response_tmpfile)
 endfunction
 
 command! Llm call Llm()
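The script rewrite drops the temp-file round trip: the payload is built as a native Vim dict, serialized with json_encode(), streamed to curl's stdin via -d @-, and the reply is parsed with json_decode() instead of marker-based string slicing. A POSIX C++ sketch of the same stdin-streaming pattern (it assumes a llama.cpp server listening on 127.0.0.1:8080 and is an illustration, not part of the commit):

```cpp
// Stream a JSON body to curl's stdin ("-d @-") instead of writing a temp file.
#include <cstdio>
#include <string>

int main() {
    const std::string payload = R"({"prompt":"Hello","n_predict":10,"stream":false})";
    // popen(..., "w") gives us the command's stdin; curl prints the response
    // to our stdout, so no response temp file is needed either.
    FILE * pipe = popen("curl -k -s -X POST -H 'Content-Type: application/json' "
                        "-d @- http://127.0.0.1:8080/completion", "w");
    if (pipe == nullptr) return 1;
    fwrite(payload.data(), 1, payload.size(), pipe);
    return pclose(pipe) == 0 ? 0 : 1;
}
```

Streaming via stdin also sidesteps the escaping bug the old comment mentions: nothing passes through the shell or a file, so backslashes and quotes survive intact.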
flake.nix (36 changed lines)

@@ -7,7 +7,8 @@
     flake-utils.lib.eachDefaultSystem (system:
       let
         inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
-        osSpecific = with pkgs; [ openmpi ] ++
+        buildInputs = with pkgs; [ openmpi ];
+        osSpecific = with pkgs; buildInputs ++
         (
           if isAarch64 && isDarwin then
             with pkgs.darwin.apple_sdk_11_0.frameworks; [

@@ -29,18 +30,24 @@
         nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
         llama-python =
           pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
-      in {
-        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
         postPatch = ''
           substituteInPlace ./ggml-metal.m \
             --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
           substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
         '';
+        postInstall = ''
+          mv $out/bin/main $out/bin/llama
+          mv $out/bin/server $out/bin/llama-server
+        '';
+        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
+      in {
+        packages.default = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          postPatch = postPatch;
           nativeBuildInputs = nativeBuildInputs;
           buildInputs = osSpecific;
-          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
+          cmakeFlags = cmakeFlags
             ++ (if isAarch64 && isDarwin then [
               "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
               "-DLLAMA_METAL=ON"

@@ -48,10 +55,19 @@
             "-DLLAMA_BLAS=ON"
             "-DLLAMA_BLAS_VENDOR=OpenBLAS"
           ]);
-          postInstall = ''
-            mv $out/bin/main $out/bin/llama
-            mv $out/bin/server $out/bin/llama-server
-          '';
+          postInstall = postInstall;
+          meta.mainProgram = "llama";
+        };
+        packages.opencl = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          postPatch = postPatch;
+          nativeBuildInputs = nativeBuildInputs;
+          buildInputs = with pkgs; buildInputs ++ [ clblast ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_CLBLAST=ON"
+          ];
+          postInstall = postInstall;
           meta.mainProgram = "llama";
         };
         apps.llama-server = {
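The flake refactor above hoists postPatch, postInstall, and the base cmakeFlags into let bindings so the new packages.opencl output can reuse them; the OpenCL variant differs from packages.default only by the extra clblast build input and the -DLLAMA_CLBLAST=ON flag, and should be buildable with `nix build .#opencl`.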
ggml-cuda.cu (72 changed lines)
@@ -935,12 +935,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp
 
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float   * y1 = yy + i*QK_K + y_offset;
         const float   * y2 = y1 + 128;

@@ -953,14 +959,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
 
     }
 #else
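The new #if K_QUANTS_PER_ITERATION == 2 path above loads 32 bits of quant data at a time and splits them with two masks instead of unpacking byte by byte. Note that the 0xf0f0f0f0 mask leaves the high nibbles in place, i.e. multiplied by 16, which is why the updated accumulation folds 1.f/16.f into the sc[1] and sc[5] terms. A standalone C++ sketch of the masking trick (an illustration, not the kernel):

```cpp
// One 32-bit AND splits four packed 4-bit pairs into low and high nibbles.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t packed = 0xA1B2C3D4;        // four bytes, each (hi << 4) | lo
    uint32_t q32[2];
    q32[0] = packed & 0x0f0f0f0f;              // low nibble of every byte
    q32[1] = packed & 0xf0f0f0f0;              // high nibbles, still scaled by 16
    const uint8_t * q4 = (const uint8_t *)q32; // view the two words as 8 bytes
    for (int l = 0; l < 4; ++l) {
        printf("lo=%d hi=%d\n", q4[l], q4[l + 4] / 16); // /16 recovers the value
    }
    return 0;
}
```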
@@ -1521,7 +1554,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
-    const int bq8_offset = QR4_K * (iqs / QI8_1);
+    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6
 
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

@@ -1531,11 +1564,20 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
     const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
 
-    for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
 
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+    for (int i = 0; i < QR4_K; ++i) {
 
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
         const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);

@@ -1543,8 +1585,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 
         const int vi = (v >> (4*i)) & 0x0F0F0F0F;
 
-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }
 
     return d*sumf_d - dmin*sumf_m;
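Both accumulation lines rest on __dp4a, the byte-wise dot product with 32-bit accumulate; the edit only swaps the per-iteration get_scale_min_k4() call for scales and mins unpacked once into the sc[] and m[] tables. For reference, a CPU emulation of the intrinsic (a sketch of the signed-byte variant selected by the int operands, not code from this commit):

```cpp
// CPU reference for CUDA's __dp4a(a, b, c): treat a and b as four signed
// bytes each, multiply pairwise, and add the four products to c.
#include <cstdint>
#include <cstdio>

static int dp4a(int a, int b, int c) {
    for (int k = 0; k < 4; ++k) {
        const int8_t ab = int8_t((a >> (8 * k)) & 0xff); // byte k of a
        const int8_t bb = int8_t((b >> (8 * k)) & 0xff); // byte k of b
        c += int(ab) * int(bb);
    }
    return c;
}

int main() {
    const int vi = 0x0F030201;                 // four masked 4-bit quants (0..15)
    const int ui = int(0xFF020304u);           // q8 bytes; 0xFF reads as -1
    printf("%d\n", dp4a(vi, ui, 0));           // 1*4 + 2*3 + 3*2 + 15*(-1) = 1
    return 0;
}
```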
@@ -2505,7 +2547,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

@@ -2528,7 +2572,9 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+#ifndef GGML_CUDA_FORCE_DMMV
         g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
ggml-metal.m (10 changed lines)
@@ -42,6 +42,7 @@ struct ggml_metal_context {
     id<MTLComputePipelineState> pipeline_##name
 
     GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(mul);
     GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
     GGML_METAL_DECL_KERNEL(scale);

@@ -157,6 +158,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
 
         GGML_METAL_ADD_KERNEL(add);
+        GGML_METAL_ADD_KERNEL(add_row);
         GGML_METAL_ADD_KERNEL(mul);
         GGML_METAL_ADD_KERNEL(mul_row);
         GGML_METAL_ADD_KERNEL(scale);

@@ -464,10 +466,16 @@ void ggml_metal_graph_compute(
                             encoder = [command_buffer computeCommandEncoder];
                         }
 
+                        if (ggml_nelements(src1) == ne10) {
+                            // src1 is a row
+                            [encoder setComputePipelineState:ctx->pipeline_add_row];
+                        } else {
                             [encoder setComputePipelineState:ctx->pipeline_add];
+                        }
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                         [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
 
                         const int64_t n = ggml_nelements(dst);

@@ -919,7 +927,9 @@ void ggml_metal_graph_compute(
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                         } break;
+                    case GGML_OP_DUP:
                     case GGML_OP_CPY:
+                    case GGML_OP_CONT:
                         {
                             if (encoder == nil) {
                                 encoder = [command_buffer computeCommandEncoder];
ggml-metal.metal

@@ -67,6 +67,17 @@ kernel void kernel_add(
     dst[tpig] = src0[tpig] + src1[tpig];
 }
 
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_add_row(
+        device const float * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % ne00];
+}
+
 kernel void kernel_mul(
         device const float * src0,
         device const float * src1,
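kernel_add_row exists because the generic add kernel indexes src0 and src1 identically, while here src1 is a single row of ne00 values broadcast across every row of src0; ne00 reaches the kernel through the new setBytes call in ggml-metal.m. A CPU sketch of the same indexing (illustration only):

```cpp
// Broadcast-add a row: dst[i] = src0[i] + src1[i % ne00], one "thread" per element.
#include <cstdio>

int main() {
    const int ne00 = 4, rows = 2;
    const float src0[rows * ne00] = {0, 1, 2, 3, 4, 5, 6, 7};
    const float src1[ne00]        = {10, 20, 30, 40};
    float dst[rows * ne00];
    for (int tpig = 0; tpig < rows * ne00; ++tpig) {
        dst[tpig] = src0[tpig] + src1[tpig % ne00]; // same indexing as the kernel
    }
    for (int i = 0; i < rows * ne00; ++i) {
        printf("%g ", dst[i]); // 10 21 32 43 14 25 36 47
    }
    printf("\n");
    return 0;
}
```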
llama.cpp

@@ -2849,7 +2849,7 @@ struct llama_context * llama_new_context_with_model(
 
             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-            printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
             if (!(result)) { \