Merge branch 'master' into xsn/fix_lora

This commit is contained in:
ngxson 2024-07-15 17:22:40 +02:00
commit 4d9ac0f375
9 changed files with 507 additions and 136 deletions

View file

@ -795,6 +795,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true; params.cont_batching = true;
return true; return true;
} }
if (arg == "-nocb" || arg == "--no-cont-batching") {
params.cont_batching = false;
return true;
}
if (arg == "-fa" || arg == "--flash-attn") { if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true; params.flash_attn = true;
return true; return true;
@ -1536,6 +1540,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
options.push_back({ "multi-modality" }); options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });

View file

@ -9,15 +9,15 @@ Adding a model requires few steps:
After following these steps, you can open PR. After following these steps, you can open PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially: Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main) - [main](/examples/main/)
- [imatrix](../examples/imatrix) - [imatrix](/examples/imatrix/)
- [quantize](../examples/quantize) - [quantize](/examples/quantize/)
- [server](../examples/server) - [server](/examples/server/)
### 1. Convert the model to GGUF ### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
@ -31,7 +31,7 @@ class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK model_arch = gguf.MODEL_ARCH.GROK
``` ```
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py) 2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`. Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
@ -54,7 +54,7 @@ Example for `falcon` model:
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist. As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file. Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it. If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback). Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
## GGUF specification ## GGUF specification

View file

@ -1,7 +1,7 @@
# Token generation performance troubleshooting # Token generation performance troubleshooting
## Verifying that the model is running on the GPU with CUDA ## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell ```shell
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
``` ```

View file

@ -15,69 +15,281 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
**Command line options:** ## Usage
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response. ```
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores). usage: ./llama-server [options]
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` general:
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused -h, --help, --usage print usage and exit
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused --version show version and build info
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused -v, --verbose print verbose information
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. --verbosity N set specific verbosity level (default: 0)
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`. --verbose-prompt print a verbose prompt before generation (default: false)
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. --no-display-prompt don't print prompt at generation (default: false)
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used. -co, --color colorise output to distinguish prompt and user input from generations (default: false)
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. -s, --seed SEED RNG seed (default: -1, use random seed for < 0)
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048` -t, --threads N number of threads to use during generation (default: 8)
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512` -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. -td, --threads-draft N number of threads to use during generation (default: same as --threads)
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems --draft N number of tokens to draft for speculative decoding (default: 5)
- `--numa distribute`: Spread execution evenly over all nodes -ps, --p-split N speculative decoding split probability (default: 0.1)
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on -lcs, --lookup-cache-static FNAME
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437 path to static lookup cache to use for lookup decoding (not updated by generation)
- `--numa`: Attempt optimizations that may help on some NUMA systems. -lcd, --lookup-cache-dynamic FNAME
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. path to dynamic lookup cache to use for lookup decoding (updated by generation)
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600` -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1` -b, --batch-size N logical maximum batch size (default: 2048)
- `--port`: Set the port to listen. Default: `8080` -ub, --ubatch-size N physical maximum batch size (default: 512)
- `--path`: Path from which to serve static files. Default: disabled --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. --chunks N max number of chunks to process (default: -1, -1 = all)
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. -fa, --flash-attn enable Flash Attention (default: disabled)
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled -p, --prompt PROMPT prompt to start generation with
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error. in conversation mode, this will be used as system prompt
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled (default: '')
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) -f, --file FNAME a file containing the prompt (default: none)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. --in-file FNAME an input file (repeat to specify multiple files)
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled. -bf, --binary-file FNAME binary file containing the prompt (default: none)
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512` -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1` --no-escape do not process escape sequences
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. -ptc, --print-token-count N print token count every N tokens (default: -1)
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled --prompt-cache FNAME file to cache prompt state for faster startup (default: none)
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled. --prompt-cache-all if specified, saves user input and generations to cache as well
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) not supported with --interactive or other interactive options
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled --prompt-cache-ro if specified, uses the prompt cache but does not update it
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn` can be specified more than once for multiple prompts
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model) -sp, --special special tokens output enabled (default: false)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25) -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation) if suffix/prefix are not specified, default chat template will be used
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0) (default: false)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0) -i, --interactive run in interactive mode (default: false)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0) -if, --interactive-first run in interactive mode and wait for input right away (default: false)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls` -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled) --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
- `-fa`, `--flash-attn` : enable flash attention (default: disabled). --in-prefix STRING string to prefix user inputs with (default: empty)
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) --in-suffix STRING string to suffix after user inputs with (default: empty)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
sampling:
--samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
--penalize-nl penalize newline tokens (default: false)
--temp N temperature (default: 0.8)
--top-k N top-k sampling (default: 40, 0 = disabled)
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
--mirostat N use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
--cfg-negative-prompt PROMPT
negative prompt to use for guidance (default: '')
--cfg-negative-prompt-file FNAME
negative prompt file to use for guidance
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
grammar:
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
--grammar-file FNAME file to read grammar from
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
embedding:
--pooling {none,mean,cls,last}
pooling type for embeddings, use model default if unspecified
--attention {causal,non-causal}
attention type for embeddings, use model default if unspecified
context hacking:
--rope-scaling {none,linear,yarn}
RoPE frequency scaling method, defaults to linear unless specified by the model
--rope-scale N RoPE context scaling factor, expands context by a factor of N
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
-gan, --grp-attn-n N group-attention factor (default: 1)
-gaw, --grp-attn-w N group-attention width (default: 512.0)
-dkvc, --dump-kv-cache verbose print of the KV cache
-nkvo, --no-kv-offload disable KV offload
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
perplexity:
--all-logits return logits for all tokens in the batch (default: false)
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
--multiple-choice-tasks N
number of tasks to use when computing the multiple choice score (default: 0)
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
--ppl-stride N stride for perplexity calculation (default: 0)
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
parallel:
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
-np, --parallel N number of parallel sequences to decode (default: 1)
-ns, --sequences N number of sequences to decode (default: 1)
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
multi-modality:
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
backend:
--rpc SERVERS comma separated list of RPC servers
--mlock force system to keep model in RAM rather than swapping or compressing
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
--numa TYPE attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
model:
--check-tensors check model tensor data for invalid values (default: false)
--override-kv KEY=TYPE:VALUE
advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
--lora FNAME apply LoRA adapter (implies --no-mmap)
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
--control-vector FNAME add a control vector
note: this argument can be repeated to add multiple control vectors
--control-vector-scaled FNAME SCALE
add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors
--control-vector-layer-range START END
layer range to apply the control vector(s) to, start and end inclusive
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
-mu, --model-url MODEL_URL model download url (default: unused)
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
-hff, --hf-file FILE Hugging Face model file (default: unused)
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
retrieval:
--context-file FNAME file to load context from (repeat to specify multiple files)
--chunk-size N minimum length of embedded text chunks (default: 64)
--chunk-separator STRING
separator between chunks (default: '
')
passkey:
--junk N number of times to repeat the junk text (default: 250)
--pos N position of the passkey in the junk text (default: -1)
imatrix:
-o, --output FNAME output file (default: 'imatrix.dat')
--output-frequency N output the imatrix every N iterations (default: 10)
--save-frequency N save an imatrix copy every N iterations (default: 0)
--process-output collect data for the output tensor (default: false)
--no-ppl do not compute perplexity (default: true)
--chunk N start processing the input from chunk N (default: 0)
bench:
-pps is the prompt shared across parallel sequences (default: false)
-npp n0,n1,... number of prompt tokens
-ntg n0,n1,... number of text generation tokens
-npl n0,n1,... number of parallel prompts
embedding:
--embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
--embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
--embd-separator separator of embendings (default \n) for example "<#sep#>"
server:
--host HOST ip address to listen (default: 127.0.0.1)
--port PORT port to listen (default: 8080)
--path PATH path to serve static files from (default: )
--embedding(s) enable embedding endpoint (default: disabled)
--api-key KEY API key to use for authentication (default: none)
--api-key-file FNAME path to file containing API keys (default: none)
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
--timeout N server read/write timeout in seconds (default: 600)
--threads-http N number of threads used to process HTTP requests (default: -1)
--system-prompt-file FNAME
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
--log-format {text,json}
log output format: json or text (default: json)
--metrics enable prometheus compatible metrics endpoint (default: disabled)
--no-slots disables slots monitoring endpoint (default: enabled)
--slot-save-path PATH path to save slot kv cache (default: disabled)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-sps, --slot-prompt-similarity SIMILARITY
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
logging:
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
--log-test Run simple logging test
--log-disable Disable trace logs
--log-enable Enable trace logs
--log-file FNAME Specify a log filename (without extension)
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
--log-append Don't truncate the old log file.
cvector:
-o, --output FNAME output file (default: 'control_vector.gguf')
--positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt')
--negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt')
--pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100)
--pca-iter N number of iterations used for PCA (default: 1000)
--method {pca,mean} dimensionality reduction method to be used (default: pca)
```
**If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
- `--ssl-cert-file FNAME`: path to file a PEM-encoded SSL certificate
## Build ## Build

View file

@ -14,7 +14,9 @@
#include "ggml-aarch64.h" #include "ggml-aarch64.h"
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings" #pragma GCC diagnostic ignored "-Woverlength-strings"
#endif
#define UNUSED GGML_UNUSED #define UNUSED GGML_UNUSED

View file

@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
dst[i] = x[i] * x[i]; dst[i] = x[i] * x[i];
} }
static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (nidx >= ne0) {
return;
}
// operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (item_ct1.get_group(0) < ne02) { // src0
int offset_src =
nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx + item_ct1.get_group(1) * ne0 +
(item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
dst[offset_dst] = y[offset_src];
}
}
static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
const int nb02, const int nb03, const int ne10, const int ne11, const int nb02, const int nb03, const int ne10, const int ne11,
const int ne12, const int ne13, const float sf0, const float sf1, const int ne12, const int ne13, const float sf0, const float sf1,
@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
}); });
} }
static void concat_f32_sycl(const float *x, const float *y, float *dst,
const int ne0, int ne1, int ne2, int ne02,
queue_ptr stream) {
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
sycl::range<3> gridDim(ne2, ne1, num_blocks);
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32(x, y, dst, ne0, ne02, item_ct1);
});
}
static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01, static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
const int nb02, const int nb03, const int ne10, const int ne11, const int nb02, const int nb03, const int ne10, const int ne11,
const int ne12, const int ne13, const float sf0, const float sf1, const int ne12, const int ne13, const float sf0, const float sf1,
@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
(void) src1_dd; (void) src1_dd;
} }
inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
float *dst_dd,
const queue_ptr &main_stream) {
#pragma message("TODO: generalize concat kernel for dim != 2")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
int dim = dst->op_params[0];
GGML_ASSERT(dim == 2);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
}
(void) src1;
(void) dst;
}
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst, const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd, const float *src0_dd, const float *src1_dd,
@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat);
GGML_SYCL_DEBUG("call %s done\n", __func__);
}
static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale); ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
@ -4101,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
func = ggml_sycl_group_norm; func = ggml_sycl_group_norm;
break; break;
case GGML_OP_CONCAT: case GGML_OP_CONCAT:
func = ggml_sycl_concat; func = ggml_sycl_op_concat;
break; break;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
func = ggml_sycl_upscale; func = ggml_sycl_upscale;

View file

@ -13,6 +13,7 @@
#ifndef GGML_SYCL_BACKEND_HPP #ifndef GGML_SYCL_BACKEND_HPP
#define GGML_SYCL_BACKEND_HPP #define GGML_SYCL_BACKEND_HPP
#include "concat.hpp"
#include "common.hpp" #include "common.hpp"
#include "convert.hpp" #include "convert.hpp"
#include "dequantize.hpp" #include "dequantize.hpp"

View file

@ -0,0 +1,195 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#include "concat.hpp"
#include "common.hpp"
static void concat_f32_dim0(const float *x, const float *y, float *dst,
const int ne0, const int ne00,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (nidx >= ne0) {
return;
}
// operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (nidx < ne00) { // src0
int offset_src = nidx + item_ct1.get_group(1) * ne00 +
item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1);
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) +
item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1);
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_dim1(const float *x, const float *y, float *dst,
const int ne0, const int ne01,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (nidx >= ne0) {
return;
}
// operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (item_ct1.get_group(1) < ne01) { // src0
int offset_src =
nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx + (item_ct1.get_group(1) - ne01) * ne0 +
item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01);
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_dim2(const float *x, const float *y, float *dst,
const int ne0, const int ne02,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (nidx >= ne0) {
return;
}
// operation
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
if (item_ct1.get_group(0) < ne02) { // src0
int offset_src = nidx + item_ct1.get_group(1) * ne0 +
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
dst[offset_dst] = x[offset_src];
} else {
int offset_src =
nidx + item_ct1.get_group(1) * ne0 +
(item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
dst[offset_dst] = y[offset_src];
}
}
static void concat_f32_sycl(const float *x, const float *y, float *dst,
int ne00, int ne01, int ne02, int ne0, int ne1,
int ne2, int dim, queue_ptr stream) {
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
sycl::range<3> gridDim(ne2, ne1, num_blocks);
switch (dim) {
case 0:
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
});
break;
case 1:
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
});
break;
default:
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
});
break;
}
}
// non-contiguous kernel (slow)
static void concat_f32_sycl_non_cont(
queue_ptr stream, const char *src0, const char *src1, char *dst,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10,
uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1,
int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
uint64_t nb3, int32_t dim) {
sycl::range<3> gridDim(ne3, ne2, ne1);
stream->parallel_for(
sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
[=](sycl::nd_item<3> item_ct1) {
int64_t i3 = item_ct1.get_group(0);
int64_t i2 = item_ct1.get_group(1);
int64_t i1 = item_ct1.get_group(2);
int64_t o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
const float *x;
for (int i0 = item_ct1.get_local_id(2); i0 < ne0;
i0 += item_ct1.get_local_range(2)) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
(i0)*nb00);
} else {
x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
(i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
}
float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
*y = *x;
}
});
}
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst) {
queue_ptr stream = ctx.stream();
const int32_t dim = ((int32_t *)dst->op_params)[0];
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const float *src0_d = (const float *)src0->data;
const float *src1_d = (const float *)src1->data;
float *dst_d = (float *)dst->data;
if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_sycl(
src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
}
} else
concat_f32_sycl_non_cont(
stream, (const char *)src0->data, (const char *)src1->data,
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
}

View file

@ -0,0 +1,21 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_CONCAT_HPP
#define GGML_SYCL_CONCAT_HPP
#include "common.hpp"
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst);
#endif // GGML_SYCL_CONCAT_HPP