Merge branch 'master' into xsn/fix_lora

2024-07-15 17:22:40 +02:00 · 2024-07-15 17:22:40 +02:00 · 4d9ac0f375
commit 4d9ac0f375
parent b1c4069502 8fac431b06
9 changed files with 507 additions and 136 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -795,6 +795,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cont_batching = true;
        return true;
    }
    if (arg == "-nocb" || arg == "--no-cont-batching") {
        params.cont_batching = false;
        return true;
    }
    if (arg == "-fa" || arg == "--flash-attn") {
        params.flash_attn = true;
        return true;
@ -1536,6 +1540,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-np,   --parallel N",           "number of parallel sequences to decode (default: %d)", params.n_parallel });
    options.push_back({ "*",           "-ns,   --sequences N",          "number of sequences to decode (default: %d)", params.n_sequences });
    options.push_back({ "*",           "-cb,   --cont-batching",        "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
    options.push_back({ "*",           "-nocb, --no-cont-batching",     "disable continuous batching" });
    options.push_back({ "multi-modality" });
    options.push_back({ "*",           "       --mmproj FILE",          "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@ -9,15 +9,15 @@ Adding a model requires few steps:
 After following these steps, you can open a PR.
 Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
+- [main](/examples/main/)
- [imatrix](../examples/imatrix)
+- [imatrix](/examples/imatrix/)
- [quantize](../examples/quantize)
+- [quantize](/examples/quantize/)
- [server](../examples/server)
+- [server](/examples/server/)
 ### 1. Convert the model to GGUF
 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
@ -31,7 +31,7 @@ class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
 ```
-2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
 Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
@ -54,7 +54,7 @@ Example for `falcon` model:
 As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
-Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
 If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it.
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
 When implementing a new graph, please note that the underlying `ggml` backends might not support them all; support for missing backend operations can be added in another PR.
-Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
 ## GGUF specification
--- a/docs/development/token_generation_performance_tips.md
+++ b/docs/development/token_generation_performance_tips.md
@ -1,7 +1,7 @@
 # Token generation performance troubleshooting
 ## Verifying that the model is running on the GPU with CUDA
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -15,69 +15,281 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
-**Command line options:**
+## Usage
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
+```
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
+usage: ./llama-server [options]
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
+
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
+general:
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
+  -h,    --help, --usage          print usage and exit
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
+         --version                show version and build info
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
+  -v,    --verbose                print verbose information
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
+         --verbosity N            set specific verbosity level (default: 0)
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`.
+         --verbose-prompt         print a verbose prompt before generation (default: false)
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+         --no-display-prompt      don't print prompt at generation (default: false)
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
+  -co,   --color                  colorise output to distinguish prompt and user input from generations (default: false)
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
+  -s,    --seed SEED              RNG seed (default: -1, use random seed for < 0)
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
+  -t,    --threads N              number of threads to use during generation (default: 8)
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
+  -tb,   --threads-batch N        number of threads to use during batch and prompt processing (default: same as --threads)
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
+  -td,   --threads-draft N        number of threads to use during generation (default: same as --threads)
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+  -tbd,  --threads-batch-draft N  number of threads to use during batch and prompt processing (default: same as --threads-draft)
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
+         --draft N                number of tokens to draft for speculative decoding (default: 5)
- `--numa distribute`: Spread execution evenly over all nodes
+  -ps,   --p-split N              speculative decoding split probability (default: 0.1)
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
+  -lcs,  --lookup-cache-static FNAME
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
+                                  path to static lookup cache to use for lookup decoding (not updated by generation)
- `--numa`: Attempt optimizations that may help on some NUMA systems.
+  -lcd,  --lookup-cache-dynamic FNAME
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+                                  path to dynamic lookup cache to use for lookup decoding (updated by generation)
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+  -c,    --ctx-size N             size of the prompt context (default: 0, 0 = loaded from model)
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
+  -n,    --predict N              number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
+  -b,    --batch-size N           logical maximum batch size (default: 2048)
- `--port`: Set the port to listen. Default: `8080`
+  -ub,   --ubatch-size N          physical maximum batch size (default: 512)
- `--path`: Path from which to serve static files. Default: disabled
+         --keep N                 number of tokens to keep from the initial prompt (default: 0, -1 = all)
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
+         --chunks N               max number of chunks to process (default: -1, -1 = all)
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
+  -fa,   --flash-attn             enable Flash Attention (default: disabled)
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
+  -p,    --prompt PROMPT          prompt to start generation with
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
+                                  in conversation mode, this will be used as system prompt
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching).  Default: disabled
+                                  (default: '')
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+  -f,    --file FNAME             a file containing the prompt (default: none)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
+         --in-file FNAME          an input file (repeat to specify multiple files)
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
+  -bf,   --binary-file FNAME      binary file containing the prompt (default: none)
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend.  Used together with group attention factor `--grp-attn-n`. Default: `512`
+  -e,    --escape                 process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
+         --no-escape              do not process escape sequences
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
+  -ptc,  --print-token-count N    print token count every N tokens (default: -1)
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
+         --prompt-cache FNAME     file to cache prompt state for faster startup (default: none)
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
+         --prompt-cache-all       if specified, saves user input and generations to cache as well
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name.  Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+                                  not supported with --interactive or other interactive options
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
+         --prompt-cache-ro        if specified, uses the prompt cache but does not update it
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
+  -r,    --reverse-prompt PROMPT  halt generation at PROMPT, return control in interactive mode
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
+                                  can be specified more than once for multiple prompts
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
+  -sp,   --special                special tokens output enabled (default: false)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
+  -cnv,  --conversation           run in conversation mode, does not print special tokens and suffix/prefix
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
+                                  if suffix/prefix are not specified, default chat template will be used
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
+                                  (default: false)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
+  -i,    --interactive            run in interactive mode (default: false)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
+  -if,   --interactive-first      run in interactive mode and wait for input right away (default: false)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
+  -mli,  --multiline-input        allows you to write or paste multiple lines without ending each in '\'
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
+         --in-prefix-bos          prefix BOS to user inputs, preceding the `--in-prefix` string
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
+         --in-prefix STRING       string to prefix user inputs with (default: empty)
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
+         --in-suffix STRING       string to suffix after user inputs with (default: empty)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
+         --spm-infill             use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
+
 sampling:
         --samplers SAMPLERS      samplers that will be used for generation in the order, separated by ';'
                                  (default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
         --sampling-seq SEQUENCE  simplified sequence for samplers that will be used (default: kfypmt)
         --ignore-eos             ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
         --penalize-nl            penalize newline tokens (default: false)
         --temp N                 temperature (default: 0.8)
         --top-k N                top-k sampling (default: 40, 0 = disabled)
         --top-p N                top-p sampling (default: 0.9, 1.0 = disabled)
         --min-p N                min-p sampling (default: 0.1, 0.0 = disabled)
         --tfs N                  tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
         --typical N              locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
         --repeat-last-n N        last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
         --repeat-penalty N       penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
         --presence-penalty N     repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
         --frequency-penalty N    repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
         --dynatemp-range N       dynamic temperature range (default: 0.0, 0.0 = disabled)
         --dynatemp-exp N         dynamic temperature exponent (default: 1.0)
         --mirostat N             use Mirostat sampling.
                                  Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
                                  (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
         --mirostat-lr N          Mirostat learning rate, parameter eta (default: 0.1)
         --mirostat-ent N         Mirostat target entropy, parameter tau (default: 5.0)
         -l TOKEN_ID(+/-)BIAS     modifies the likelihood of token appearing in the completion,
                                  i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
                                  or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
         --cfg-negative-prompt PROMPT
                                  negative prompt to use for guidance (default: '')
         --cfg-negative-prompt-file FNAME
                                  negative prompt file to use for guidance
         --cfg-scale N            strength of guidance (default: 1.0, 1.0 = disable)
         --chat-template JINJA_TEMPLATE
                                  set custom jinja chat template (default: template taken from model's metadata)
                                  if suffix/prefix are specified, template will be disabled
                                  only commonly used templates are accepted:
                                  https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
 grammar:
         --grammar GRAMMAR        BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
         --grammar-file FNAME     file to read grammar from
  -j,    --json-schema SCHEMA     JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
                                  For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
 embedding:
         --pooling {none,mean,cls,last}
                                  pooling type for embeddings, use model default if unspecified
         --attention {causal,non-causal}
                                  attention type for embeddings, use model default if unspecified
 context hacking:
         --rope-scaling {none,linear,yarn}
                                  RoPE frequency scaling method, defaults to linear unless specified by the model
         --rope-scale N           RoPE context scaling factor, expands context by a factor of N
         --rope-freq-base N       RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
         --rope-freq-scale N      RoPE frequency scaling factor, expands context by a factor of 1/N
         --yarn-orig-ctx N        YaRN: original context size of model (default: 0 = model training context size)
         --yarn-ext-factor N      YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
         --yarn-attn-factor N     YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
         --yarn-beta-slow N       YaRN: high correction dim or alpha (default: 1.0)
         --yarn-beta-fast N       YaRN: low correction dim or beta (default: 32.0)
  -gan,  --grp-attn-n N           group-attention factor (default: 1)
  -gaw,  --grp-attn-w N           group-attention width (default: 512.0)
  -dkvc, --dump-kv-cache          verbose print of the KV cache
  -nkvo, --no-kv-offload          disable KV offload
  -ctk,  --cache-type-k TYPE      KV cache data type for K (default: f16)
  -ctv,  --cache-type-v TYPE      KV cache data type for V (default: f16)
 perplexity:
         --all-logits             return logits for all tokens in the batch (default: false)
         --hellaswag              compute HellaSwag score over random tasks from datafile supplied with -f
         --hellaswag-tasks N      number of tasks to use when computing the HellaSwag score (default: 400)
         --winogrande             compute Winogrande score over random tasks from datafile supplied with -f
         --winogrande-tasks N     number of tasks to use when computing the Winogrande score (default: 0)
         --multiple-choice        compute multiple choice score over random tasks from datafile supplied with -f
         --multiple-choice-tasks N
                                  number of tasks to use when computing the multiple choice score (default: 0)
         --kl-divergence          computes KL-divergence to logits provided via --kl-divergence-base
         --ppl-stride N           stride for perplexity calculation (default: 0)
         --ppl-output-type {0,1}  output type for perplexity calculation (default: 0)
 parallel:
  -dt,   --defrag-thold N         KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
  -np,   --parallel N             number of parallel sequences to decode (default: 1)
  -ns,   --sequences N            number of sequences to decode (default: 1)
  -cb,   --cont-batching          enable continuous batching (a.k.a dynamic batching) (default: enabled)
 multi-modality:
         --mmproj FILE            path to a multimodal projector file for LLaVA. see examples/llava/README.md
         --image FILE             path to an image file. use with multimodal models. Specify multiple times for batching
 backend:
         --rpc SERVERS            comma separated list of RPC servers
         --mlock                  force system to keep model in RAM rather than swapping or compressing
         --no-mmap                do not memory-map model (slower load but may reduce pageouts if not using mlock)
         --numa TYPE              attempt optimizations that help on some NUMA systems
                                    - distribute: spread execution evenly over all nodes
                                    - isolate: only spawn threads on CPUs on the node that execution started on
                                    - numactl: use the CPU map provided by numactl
                                  if run without this previously, it is recommended to drop the system page cache before using this
                                  see https://github.com/ggerganov/llama.cpp/issues/1437
 model:
         --check-tensors          check model tensor data for invalid values (default: false)
         --override-kv KEY=TYPE:VALUE
                                  advanced option to override model metadata by key. may be specified multiple times.
                                  types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
         --lora FNAME             apply LoRA adapter (implies --no-mmap)
         --lora-scaled FNAME S    apply LoRA adapter with user defined scaling S (implies --no-mmap)
         --lora-base FNAME        optional model to use as a base for the layers modified by the LoRA adapter
         --control-vector FNAME   add a control vector
                                  note: this argument can be repeated to add multiple control vectors
         --control-vector-scaled FNAME SCALE
                                  add a control vector with user defined scaling SCALE
                                  note: this argument can be repeated to add multiple scaled control vectors
         --control-vector-layer-range START END
                                  layer range to apply the control vector(s) to, start and end inclusive
  -m,    --model FNAME            model path (default: models/$filename with filename from --hf-file
                                  or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
  -md,   --model-draft FNAME      draft model for speculative decoding (default: unused)
  -mu,   --model-url MODEL_URL    model download url (default: unused)
  -hfr,  --hf-repo REPO           Hugging Face model repository (default: unused)
  -hff,  --hf-file FILE           Hugging Face model file (default: unused)
  -hft,  --hf-token TOKEN         Hugging Face access token (default: value from HF_TOKEN environment variable)
 retrieval:
         --context-file FNAME     file to load context from (repeat to specify multiple files)
         --chunk-size N           minimum length of embedded text chunks (default: 64)
         --chunk-separator STRING 
                                  separator between chunks (default: '
                                  ')
 passkey:
         --junk N                 number of times to repeat the junk text (default: 250)
         --pos N                  position of the passkey in the junk text (default: -1)
 imatrix:
  -o,    --output FNAME           output file (default: 'imatrix.dat')
         --output-frequency N     output the imatrix every N iterations (default: 10)
         --save-frequency N       save an imatrix copy every N iterations (default: 0)
         --process-output         collect data for the output tensor (default: false)
         --no-ppl                 do not compute perplexity (default: true)
         --chunk N                start processing the input from chunk N (default: 0)
 bench:
  -pps                            is the prompt shared across parallel sequences (default: false)
  -npp n0,n1,...                  number of prompt tokens
  -ntg n0,n1,...                  number of text generation tokens
  -npl n0,n1,...                  number of parallel prompts
 embedding:
         --embd-normalize         normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
         --embd-output-format     empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
         --embd-separator         separator of embeddings (default \n) for example "<#sep#>"
 server:
         --host HOST              ip address to listen (default: 127.0.0.1)
         --port PORT              port to listen (default: 8080)
         --path PATH              path to serve static files from (default: )
         --embedding(s)           enable embedding endpoint (default: disabled)
         --api-key KEY            API key to use for authentication (default: none)
         --api-key-file FNAME     path to file containing API keys (default: none)
         --ssl-key-file FNAME     path to file a PEM-encoded SSL private key
         --ssl-cert-file FNAME    path to file a PEM-encoded SSL certificate
         --timeout N              server read/write timeout in seconds (default: 600)
         --threads-http N         number of threads used to process HTTP requests (default: -1)
         --system-prompt-file FNAME
                                  set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
         --log-format {text,json} 
                                  log output format: json or text (default: json)
         --metrics                enable prometheus compatible metrics endpoint (default: disabled)
         --no-slots               disables slots monitoring endpoint (default: enabled)
         --slot-save-path PATH    path to save slot kv cache (default: disabled)
         --chat-template JINJA_TEMPLATE
                                  set custom jinja chat template (default: template taken from model's metadata)
                                  only commonly used templates are accepted:
                                  https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
  -sps,  --slot-prompt-similarity SIMILARITY
                                  how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
 logging:
         --simple-io              use basic IO for better compatibility in subprocesses and limited consoles
  -ld,   --logdir LOGDIR          path under which to save YAML logs (no logging if unset)
         --log-test               Run simple logging test
         --log-disable            Disable trace logs
         --log-enable             Enable trace logs
         --log-file FNAME         Specify a log filename (without extension)
         --log-new                Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
         --log-append             Don't truncate the old log file.
 cvector:
  -o,    --output FNAME           output file (default: 'control_vector.gguf')
         --positive-file FNAME    positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt')
         --negative-file FNAME    negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt')
         --pca-batch N            batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100)
         --pca-iter N             number of iterations used for PCA (default: 1000)
         --method {pca,mean}      dimensionality reduction method to be used (default: pca)
 ```
 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
 - `--ssl-cert-file FNAME`: path to a file containing a PEM-encoded SSL certificate
 ## Build
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@ -14,7 +14,9 @@
 #include "ggml-aarch64.h"
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Woverlength-strings"
 #endif
 #define UNUSED GGML_UNUSED
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
    dst[i] = x[i] * x[i];
 }
 // Device kernel body: writes one float of dst, taking it from x or y.
 // The element index along axis 0 (nidx) comes from work-group axis 2; groups
 // on axis 1 and axis 0 select the row and the slice. Slices whose axis-0 group
 // index is below ne02 are read from x, the remaining ones from y (with the
 // slice index shifted down by ne02).
 //   ne0  - number of elements per row; work-items with nidx >= ne0 do nothing
 //   ne02 - number of leading slices that come from x
 static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
                        const sycl::nd_item<3> &item_ct1) {
    // position of this work-item along the row
    int nidx = item_ct1.get_local_id(2) +
               item_ct1.get_group(2) * item_ct1.get_local_range(2);
    // the last work-group of a row may extend past the row end
    if (nidx >= ne0) {
        return;
    }
    // operation
    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
    if (item_ct1.get_group(0) < ne02) { // src0
        // same linear offset as in dst
        int offset_src =
            nidx + item_ct1.get_group(1) * ne0 +
            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
            dst[offset_dst] = x[offset_src];
    } else {
        // src1: slice index is relative to the first slice taken from y
        int offset_src =
            nidx + item_ct1.get_group(1) * ne0 +
            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
            dst[offset_dst] = y[offset_src];
    }
 }
 static void upscale_f32(const float  *x, float *dst, const int nb00, const int nb01,
                        const int nb02, const int nb03, const int ne10, const int ne11,
                        const int ne12, const int ne13, const float sf0, const float sf1,
@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
        });
 }
 // Submits the concat_f32 kernel to `stream`.
 // The launch uses ne2 x ne1 x num_blocks work-groups of SYCL_CONCAT_BLOCK_SIZE
 // work-items each (along axis 2), where num_blocks is ne0 divided by the block
 // size, rounded up. ne02 is forwarded to the kernel unchanged.
 static void concat_f32_sycl(const float *x, const float *y, float *dst,
                             const int ne0, int ne1, int ne2, int ne02,
                             queue_ptr stream) {
    // ceil(ne0 / SYCL_CONCAT_BLOCK_SIZE)
    int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
    sycl::range<3> gridDim(ne2, ne1, num_blocks);
    stream->parallel_for(
        sycl::nd_range<3>(gridDim *
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
            concat_f32(x, y, dst, ne0, ne02, item_ct1);
        });
 }
 static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
                             const int nb02, const int nb03, const int ne10, const int ne11,
                             const int ne12, const int ne13, const float sf0, const float sf1,
@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
    (void) src1_dd;
 }
 // Concatenates src0 and src1 into dst; only concatenation along dimension 2
 // with F32 tensors is accepted (enforced by the assertions below).
 // src0_dd / src1_dd / dst_dd are the float buffers handed to the kernel and
 // main_stream is the queue the kernels are submitted to. `ctx` is not used in
 // this body.
 inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
                                 float *dst_dd,
                                 const queue_ptr &main_stream) {
 #pragma message("TODO: generalize concat kernel for dim != 2")
 #pragma message("      https://github.com/ggerganov/llama.cpp/pull/7563")
    // concatenation dimension is stored in the first op parameter
    int dim = dst->op_params[0];
    GGML_ASSERT(dim == 2);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    // one kernel launch per index along dimension 3; nb[3] / 4 turns the byte
    // stride into a float-element stride
    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
        concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
    }
    (void) src1;
    (void) dst;
 }
 inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten
    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 // Wrapper for the concat operation: emits a debug trace before and after, and
 // hands ggml_sycl_op_concat to ggml_sycl_op_flatten together with the tensors.
 // NOTE(review): ggml_sycl_op_flatten is defined outside this view; presumably it
 // resolves the device buffers and stream before invoking the op - confirm.
 static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
@ -4101,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
            func = ggml_sycl_group_norm;
            break;
        case GGML_OP_CONCAT:
-            func = ggml_sycl_concat;
+            func = ggml_sycl_op_concat;
            break;
        case GGML_OP_UPSCALE:
            func = ggml_sycl_upscale;
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@ -13,6 +13,7 @@
 #ifndef GGML_SYCL_BACKEND_HPP
 #define GGML_SYCL_BACKEND_HPP
 #include "concat.hpp"
 #include "common.hpp"
 #include "convert.hpp"
 #include "dequantize.hpp"
--- a/ggml/src/ggml-sycl/concat.cpp
+++ b/ggml/src/ggml-sycl/concat.cpp
@ -0,0 +1,195 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #include "concat.hpp"
 #include "common.hpp"
 // Kernel body for concatenation along dimension 0 (within a row).
 // Each work-item copies one float into dst. Offsets are computed from the
 // extents only, i.e. rows are treated as densely packed.
 //   ne0  - row length of dst
 //   ne00 - row length of x; columns [0, ne00) come from x, the rest from y,
 //          whose row length is therefore ne0 - ne00
 // Work-group axis 1 selects the row, axis 0 the plane.
 static void concat_f32_dim0(const float *x, const float *y, float *dst,
                             const int ne0, const int ne00,
                             const sycl::nd_item<3> &item_ct1) {
   const int nidx = item_ct1.get_local_id(2) +
                    item_ct1.get_group(2) * item_ct1.get_local_range(2);
   // the last work-group of a row may reach past the row end
   if (nidx >= ne0) {
     return;
   }
   const auto row = item_ct1.get_group(1);
   const auto plane = item_ct1.get_group(0);
   const auto rows_per_plane = item_ct1.get_group_range(1);
   const int offset_dst = nidx + row * ne0 + plane * ne0 * rows_per_plane;
   if (nidx < ne00) {
     // column belongs to x
     const int offset_src = nidx + row * ne00 + plane * ne00 * rows_per_plane;
     dst[offset_dst] = x[offset_src];
     return;
   }
   // column belongs to y: shift the column index by the width of x
   const int offset_src = nidx - ne00 + row * (ne0 - ne00) +
                          plane * (ne0 - ne00) * rows_per_plane;
   dst[offset_dst] = y[offset_src];
 }
 // Kernel body for concatenation along dimension 1 (rows are stacked).
 // Each work-item copies one float into dst. Offsets are computed from the
 // extents only, i.e. rows are treated as densely packed.
 //   ne0  - row length (shared by x, y and dst)
 //   ne01 - number of rows per plane in x; rows [0, ne01) come from x, the
 //          remaining rows of each plane come from y
 // Work-group axis 1 selects the dst row, axis 0 the plane.
 static void concat_f32_dim1(const float *x, const float *y, float *dst,
                             const int ne0, const int ne01,
                             const sycl::nd_item<3> &item_ct1) {
   const int nidx = item_ct1.get_local_id(2) +
                    item_ct1.get_group(2) * item_ct1.get_local_range(2);
   // the last work-group of a row may reach past the row end
   if (nidx >= ne0) {
     return;
   }
   const auto row = item_ct1.get_group(1);
   const auto plane = item_ct1.get_group(0);
   const auto rows_per_plane = item_ct1.get_group_range(1);
   const int offset_dst = nidx + row * ne0 + plane * ne0 * rows_per_plane;
   if (row < ne01) {
     // row belongs to x, whose planes hold ne01 rows
     const int offset_src = nidx + row * ne0 + plane * ne0 * ne01;
     dst[offset_dst] = x[offset_src];
     return;
   }
   // row belongs to y, whose planes hold the remaining rows
   const int offset_src = nidx + (row - ne01) * ne0 +
                          plane * ne0 * (rows_per_plane - ne01);
   dst[offset_dst] = y[offset_src];
 }
 // Kernel body for concatenation along dimension 2 (planes are stacked).
 // Each work-item copies one float into dst. Offsets are computed from the
 // extents only, i.e. rows are treated as densely packed.
 //   ne0  - row length (shared by x, y and dst)
 //   ne02 - number of planes in x; planes [0, ne02) come from x, later planes
 //          come from y
 // Work-group axis 1 selects the row, axis 0 the dst plane.
 static void concat_f32_dim2(const float *x, const float *y, float *dst,
                             const int ne0, const int ne02,
                             const sycl::nd_item<3> &item_ct1) {
   const int nidx = item_ct1.get_local_id(2) +
                    item_ct1.get_group(2) * item_ct1.get_local_range(2);
   // the last work-group of a row may reach past the row end
   if (nidx >= ne0) {
     return;
   }
   const auto row = item_ct1.get_group(1);
   const auto plane = item_ct1.get_group(0);
   const auto rows_per_plane = item_ct1.get_group_range(1);
   const int offset_dst = nidx + row * ne0 + plane * ne0 * rows_per_plane;
   if (plane < ne02) {
     // plane belongs to x: source and destination offsets coincide
     const int offset_src = nidx + row * ne0 + plane * ne0 * rows_per_plane;
     dst[offset_dst] = x[offset_src];
     return;
   }
   // plane belongs to y: plane index is relative to the first plane of y
   const int offset_src =
       nidx + row * ne0 + (plane - ne02) * ne0 * rows_per_plane;
   dst[offset_dst] = y[offset_src];
 }
 static void concat_f32_sycl(const float *x, const float *y, float *dst,
                            int ne00, int ne01, int ne02, int ne0, int ne1,
                            int ne2, int dim, queue_ptr stream) {
  int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
  switch (dim) {
  case 0:
    stream->parallel_for(
        sycl::nd_range<3>(gridDim *
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
          concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
        });
    break;
  case 1:
    stream->parallel_for(
        sycl::nd_range<3>(gridDim *
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
          concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
        });
    break;
  default:
    stream->parallel_for(
        sycl::nd_range<3>(gridDim *
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
          concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
        });
    break;
  }
 }
 // non-contiguous kernel (slow)
 // Concatenates src0 and src1 into dst along `dim` using explicit byte strides,
 // so none of the buffers has to be densely packed.
 //   src0/src1/dst - base addresses as byte pointers; elements are read and
 //                   written as float
 //   ne0x / nb0x   - extents / byte strides of src0
 //   nb1x          - byte strides of src1 (its extents are not needed)
 //   nex / nbx     - extents / byte strides of dst
 //   dim           - concatenation dimension, used as an index into a 4-entry
 //                   array, so it must be in 0..3
 // One work-group of a single work-item is launched per (i3, i2, i1) row of dst;
 // that work-item walks the whole row along dimension 0.
 static void concat_f32_sycl_non_cont(
     queue_ptr stream, const char *src0, const char *src1, char *dst,
     int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
     uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
     int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10,
     uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1,
     int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
     uint64_t nb3, int32_t dim) {
   sycl::range<3> gridDim(ne3, ne2, ne1);
   stream->parallel_for(
       sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)),
       [=](sycl::nd_item<3> item_ct1) {
         const int64_t i3 = item_ct1.get_group(0);
         const int64_t i2 = item_ct1.get_group(1);
         const int64_t i1 = item_ct1.get_group(2);
         // o[dim] is the extent of src0 along the concatenation dimension;
         // it is subtracted from dst indices to address src1
         int64_t o[4] = {0, 0, 0, 0};
         o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
         // 64-bit index: ne0 and the work-group range are 64-bit quantities, and
         // the sibling indices i1..i3 are int64_t as well
         for (int64_t i0 = item_ct1.get_local_id(2); i0 < ne0;
              i0 += item_ct1.get_local_range(2)) {
           const float *x;
           if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
             // inside the extents of src0
             x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 +
                                 (i0)*nb00);
           } else {
             // otherwise the element comes from src1
             x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 +
                                 (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10);
           }
           float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
           *y = *x;
         }
       });
 }
 // SYCL implementation of the concat operation: dst = concat(src0, src1) along
 // the dimension stored in dst->op_params[0]. Tensor data is accessed as float.
 // Work is submitted to the stream obtained from ctx.
 //  - if either input is not contiguous, the generic strided kernel is used;
 //  - contiguous inputs with dim != 3 launch one kernel per index of dimension 3;
 //  - contiguous inputs with dim == 3 are copied back to back with two memcpy
 //    calls, each waited on before continuing.
 void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst) {
   queue_ptr stream = ctx.stream();
   const int32_t dim = ((int32_t *)dst->op_params)[0];
   if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
     concat_f32_sycl_non_cont(
         stream, (const char *)src0->data, (const char *)src1->data,
         (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
         src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
         src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
         src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
         dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
     return;
   }
   const float *src0_d = (const float *)src0->data;
   const float *src1_d = (const float *)src1->data;
   float *dst_d = (float *)dst->data;
   if (dim == 3) {
     // src1 is placed directly after the bytes of src0
     const size_t size0 = ggml_nbytes(src0);
     const size_t size1 = ggml_nbytes(src1);
     SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
     SYCL_CHECK(CHECK_TRY_ERROR(
         stream->memcpy(dst_d + size0 / sizeof(float), src1_d, size1).wait()));
     return;
   }
   // byte strides along dimension 3 converted to float-element strides
   const size_t src0_step = src0->nb[3] / sizeof(float);
   const size_t src1_step = src1->nb[3] / sizeof(float);
   const size_t dst_step = dst->nb[3] / sizeof(float);
   for (int i3 = 0; i3 < dst->ne[3]; i3++) {
     concat_f32_sycl(src0_d + i3 * src0_step, src1_d + i3 * src1_step,
                     dst_d + i3 * dst_step, src0->ne[0], src0->ne[1],
                     src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim,
                     stream);
   }
 }
--- a/ggml/src/ggml-sycl/concat.hpp
+++ b/ggml/src/ggml-sycl/concat.hpp
@ -0,0 +1,21 @@
 //
 // MIT license
 // Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: MIT
 //
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 #ifndef GGML_SYCL_CONCAT_HPP
 #define GGML_SYCL_CONCAT_HPP
 #include "common.hpp"
 // Concatenates src0 and src1 into dst on the SYCL backend.
 // ctx supplies the SYCL context/stream; the implementation lives in concat.cpp.
 void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst);
 #endif // GGML_SYCL_CONCAT_HPP