diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ff7238aba..63143bc94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -52,7 +52,7 @@ jobs:
id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L 'main|curl' --verbose --timeout 900
- name: Determine tag name
id: tag
@@ -101,7 +101,9 @@ jobs:
sysctl -a
mkdir build
cd build
- cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+ # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+ # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+ cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -209,21 +211,21 @@ jobs:
id: depends
run: |
sudo apt-get update
- sudo apt-get install build-essential
+ sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
mkdir build
cd build
- cmake .. -DLLAMA_FATAL_WARNINGS=ON
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -938,6 +940,12 @@ jobs:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
+ with:
+ path: ./artifact
+
+ - name: Move artifacts
+ id: move_artifacts
+ run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
@@ -956,7 +964,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
- for (let file of await fs.readdirSync('./artifact')) {
+ for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@@ -964,7 +972,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
- data: await fs.readFileSync(`./artifact/${file}`)
+ data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}
diff --git a/.gitignore b/.gitignore
index 9fb5b80c3..fdc5184a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,7 @@ models-mnt
/convert-llama2c-to-ggml
/embd-input-test
/embedding
+/eval-callback
/gguf
/gguf-llama-simple
/gguf-split
diff --git a/Makefile b/Makefile
index 11b31c5c8..7a69ad1b3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/README-sycl.md b/README-sycl.md
index 2a6151072..635aa2dd7 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -8,9 +8,9 @@
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
-- [Known Issue](#known-issue)
-- [Q&A](#q&a)
-- [Todo](#todo)
+- [Known Issues](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)
## Background
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
+| OS | Status | Verified |
+|---------|---------|------------------------------------|
+| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11 |
## Hardware
@@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
**Verified devices**
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550, 1100|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
+| Intel GPU | Status | Verified Model |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series | Support | Max 1550, 1100 |
+| Intel Data Center Flex Series | Support | Flex 170 |
+| Intel Arc Series | Support | Arc 770, 730M |
+| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
+| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
@@ -88,10 +88,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
**Verified devices**
-|Nvidia GPU| Status | Verified Model|
-|-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
+| Nvidia GPU | Status | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series | Support | A100, A4000 |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series |
## Docker
The docker build option is currently limited to *intel GPU* targets.
@@ -161,7 +161,7 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
2. **Install Intel® oneAPI Base toolkit**
@@ -295,10 +295,10 @@ found 6 SYCL devices:
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+| Attribute | Note |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference
@@ -307,10 +307,10 @@ There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter |
+|------------------|----------------------------------------|
+| Single device | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default) |
Examples:
@@ -468,10 +468,10 @@ found 6 SYCL devices:
```
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+| Attribute | Note |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference
@@ -481,10 +481,10 @@ There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter |
+|------------------|----------------------------------------|
+| Single device | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default) |
Examples:
@@ -522,20 +522,20 @@ use 1 SYCL GPUs: [0] with Max compute units:512
#### Build
-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+| Name | Value | Function |
+|--------------------|-----------------------------------|---------------------------------------------|
+| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
+| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
+| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
+| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime
-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer|
+| Name | Value | Function |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
## Known Issues
@@ -567,6 +567,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
-## Todo
+## TODO
- Support row layer split for multiple card runs.
diff --git a/README.md b/README.md
index 462b1b180..00a487fc6 100644
--- a/README.md
+++ b/README.md
@@ -485,14 +485,14 @@ Building the program with BLAS support may lead to some performance improvements
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
- | Option | Legal values | Default | Description |
- |--------------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
- | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+ | Option | Legal values | Default | Description |
+ |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+ | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
+ | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+ | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS
@@ -534,11 +534,11 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
- | Option | Legal values | Default | Description |
- |-------------------------|------------------------|---------|-------------|
- | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
- | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+ | Option | Legal values | Default | Description |
+ |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+ | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
@@ -746,11 +746,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-| 7B | 13 GB | 3.9 GB |
-| 13B | 24 GB | 7.8 GB |
-| 30B | 60 GB | 19.5 GB |
-| 65B | 120 GB | 38.5 GB |
+|------:|--------------:|----------------------:|
+| 7B | 13 GB | 3.9 GB |
+| 13B | 24 GB | 7.8 GB |
+| 30B | 60 GB | 19.5 GB |
+| 65B | 120 GB | 38.5 GB |
### Quantization
@@ -758,7 +758,7 @@ Several quantization methods are supported. They differ in the resulting model d
*(outdated)*
-| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
+| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |
diff --git a/SECURITY.md b/SECURITY.md
index 14504b1bf..f4322c6ee 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability
diff --git a/common/common.cpp b/common/common.cpp
index 98fc8388c..dda514785 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
+ cparams.cb_eval = params.cb_eval;
+ cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2194,7 @@ std::tuple llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
- {
+ if (params.warmup) {
LOG("warming up the model with an empty run\n");
std::vector tmp = { llama_token_bos(model), llama_token_eos(model), };
diff --git a/common/common.h b/common/common.h
index a7f476c1b..65272b0ba 100644
--- a/common/common.h
+++ b/common/common.h
@@ -80,6 +80,9 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
+ ggml_backend_sched_eval_callback cb_eval = nullptr;
+ void * cb_eval_user_data = nullptr;
+
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -156,6 +159,7 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
+ bool warmup = true; // warmup run
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md
index 3581f3e65..a56b78344 100644
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
+Note: to debug the inference graph, you can use [eval-callback](../examples/eval-callback).
+
## GGUF specification
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 76496bf06..f421769cc 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
+ add_subdirectory(eval-callback)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)
diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt
new file mode 100644
index 000000000..c56ba780b
--- /dev/null
+++ b/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Build and install the eval-callback example executable.
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+# Register a ctest that runs the example with --hf-repo/--hf-file pointing at a tiny model.
+# It carries the "eval-callback" and "curl" labels, so it is only selected by a matching `ctest -L`.
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md
new file mode 100644
index 000000000..66a37e878
--- /dev/null
+++ b/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use a callback during inference.
+It simply prints to the console all operations and tensor data.
+
+Usage:
+
+```shell
+eval-callback \
+ --hf-repo ggml-org/models \
+ --hf-file phi-2/ggml-model-q4_0.gguf \
+ --model phi-2-q4_0.gguf \
+ --prompt hello \
+ --seed 42 \
+ -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx = 512
+...
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
+llama_new_context_with_model: graph nodes = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.0181, 0.0272, 0.0272, ...],
+ ],
+ ]
+ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.6989, 1.0636, 1.0636, ...],
+ ],
+ ]
+ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.1800, 0.2817, 0.2632, ...],
+ ],
+ ]
+ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+ [
+ [
+ [ -0.1863, 0.2970, 0.2604, ...],
+ ],
+ ]
+ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+ [
+ [ -1.1238, 1.2876, -1.8086, ...],
+ ],
+ ]
+ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+ ]
+ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ [ -0.3608, 0.5076, -1.8866, ...],
+ [ 1.7643, 0.0273, -2.1065, ...],
+ ...
+ ],
+ ]
+ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+ [
+ [
+ [ -1.1135, 1.4604, -1.9226, ...],
+ [ -0.3608, 0.5076, -1.8866, ...],
+ [ 1.7643, 0.0273, -2.1065, ...],
+ ...
+ ],
+ ]
+```
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
new file mode 100644
index 000000000..05f7d6ab1
--- /dev/null
+++ b/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,185 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <random>
+#include <string>
+#include <tuple>
+#include <vector>
+
+/**
+ * This is the arbitrary data which will be passed to each callback.
+ * Later on we can for example add an operation or tensor name filter from the CLI args,
+ * or a file descriptor to dump the tensor to.
+ */
+struct callback_data {
+    // scratch buffer that receives a tensor's raw bytes when its backend buffer is not host memory
+    std::vector<uint8_t> data;
+};
+
+// Renders the per-dimension element counts of a tensor (t->ne) as a ", "-separated list.
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string dims;
+    int d = 0;
+    while (d < GGML_MAX_DIMS) {
+        dims += std::to_string(t->ne[d]);
+        ++d;
+        // separator goes between entries only, never after the last one
+        if (d < GGML_MAX_DIMS) {
+            dims += ", ";
+        }
+    }
+    return dims;
+}
+
+/**
+ * Prints a truncated preview of a tensor to stdout.
+ *
+ * For every index of dim 3, at most the first n entries along dims 0, 1 and 2 are printed,
+ * followed by the running sum of all values printed so far.
+ *
+ * @param data pointer to the first byte of the tensor data; it is dereferenced here, so it must be host memory
+ * @param type element type; F16, F32, I32, I16 and I8 are handled, any other type trips GGML_ASSERT
+ * @param ne   number of elements per dimension
+ * @param nb   stride per dimension, used as a byte offset into data
+ * @param n    maximum number of entries printed along each of dims 0..2
+ */
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    float sum = 0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        printf(" [\n");
+        for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) {
+            printf(" [\n");
+            for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) {
+                printf(" [");
+                for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) {
+                    // byte offset of element (i0, i1, i2, i3)
+                    const size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                    // take the address at the byte offset first, THEN reinterpret and load;
+                    // `*(T *) data + i` would load element 0 and add the offset to its value
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(const float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(const int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(const int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(const int8_t *) &data[i];
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    printf("%8.4f", v);
+                    sum += v;
+                    if (i0 < ne[0] - 1 && i0 < n - 1) printf(", ");
+                }
+                if (ne[0] > n) printf(", ...");
+                printf("],\n");
+            }
+            if (ne[1] > n) printf(" ...\n");
+            printf(" ],\n");
+        }
+        if (ne[2] > n) printf(" ...\n");
+        printf(" ]\n");
+        printf(" sum = %f\n", sum);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * Prints one header line per tensor (name, type, op, sources and shapes) and, for
+ * non-quantized tensors, a preview of the data via ggml_print_tensor.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ * see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back; must point to a callback_data
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    // bounded formatting: a tensor name plus four 64-bit dimensions may not fit in the buffer,
+    // so truncate instead of writing past its end
+    char src1_str[128] = {0};
+    if (src1) {
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+
+    // src0 is guarded as well so that a node without sources does not crash the callback
+    // NOTE(review): the format carries a '}' after the src1 slot, so the output ends in "}})" or ", })"
+    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+           t->name, ggml_type_name(t->type), ggml_op_desc(t),
+           src0 ? src0->name : "", src0 ? ggml_ne_string(src0).c_str() : "",
+           src1 ? src1_str : "",
+           ggml_ne_string(t).c_str());
+
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    // quantized tensors are not printed: ggml_print_tensor only handles plain element types
+    if (!ggml_is_quantized(t->type)) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+/**
+ * Tokenizes params.prompt and evaluates it with a single llama_decode call.
+ *
+ * @param ctx    context to decode with
+ * @param params parsed CLI parameters; only the prompt is read here
+ * @return true on success, false if llama_decode reports a failure
+ */
+static bool run(llama_context * ctx, const gpt_params & params) {
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    // explicit element type: the target is built as C++11, which cannot deduce it from the initializer
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Entry point: parses the common CLI arguments, installs ggml_debug as the eval callback
+ * and evaluates the prompt once.
+ *
+ * @return 0 on success, 1 if argument parsing, initialization or evaluation fails
+ */
+int main(int argc, char ** argv) {
+
+    callback_data cb_data;
+
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    print_build_info();
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = ggml_debug;
+    params.cb_eval_user_data = &cb_data;
+    // presumably disabled so the callback only sees the real prompt evaluation - TODO confirm
+    params.warmup = false;
+
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        // the backend was already initialized above, so release it before bailing out
+        llama_backend_free();
+        return 1;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+    }
+
+    const bool OK = run(ctx, params);
+    if (OK) {
+        llama_print_timings(ctx);
+    }
+
+    // single teardown path: context, model and backend are released on failure too
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return OK ? 0 : 1;
+}
diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp
index e4c0c1689..091069ffa 100644
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 1bf55f90c..73609d3e6 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -107,9 +107,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
- // take into account that ids is not contiguous!
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
- GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
@@ -597,24 +595,18 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
- llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
- llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
- if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
- return 1;
- }
-
- llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
- cparams.cb_eval = ik_collect_imatrix;
- cparams.cb_eval_user_data = NULL;
+ params.cb_eval = ik_collect_imatrix;
+ params.cb_eval_user_data = NULL;
+ params.warmup = false;
- llama_context * ctx = llama_new_context_with_model(model, cparams);
- if (ctx == NULL) {
- fprintf(stderr, "%s: error: unable to create context\n", __func__);
+ // init
+ llama_model * model;
+ llama_context * ctx;
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ if (model == nullptr || ctx == nullptr) {
+ fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 96b048525..413e433dd 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## Model conversion
-- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 67cb0f22b..d4810d42e 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## LLaVA 1.5
-- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
diff --git a/examples/main/README.md b/examples/main/README.md
index bb696b562..10a589ceb 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -310,7 +310,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### Quantization
-For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
## Additional Options
diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md
index 50e1af011..1a8c0dd64 100644
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -3,19 +3,18 @@
TODO
## Llama 2 70B Scorechart
-Quantization | Model size (GiB) | Perplexity | Delta to fp16
--- | -- | -- | --
-Q4_0 | 36.20 | 3.5550 | 3.61%
-Q4_1 | 40.20 | 3.5125 | 2.37%
-Q5_0 | 44.20 | 3.4744 | 1.26%
-Q2_K | 27.27 | 3.7339 | 8.82%
-Q3_K_S | 27.86 | 3.7019 | 7.89%
-Q3_K_M | 30.83 | 3.5932 | 4.72%
-Q3_K_L | 33.67 | 3.5617 | 3.80%
-Q4_K_S | 36.39 | 3.4852 | 1.57%
-Q4_K_M | 38.54 | 3.4725 | 1.20%
-Q5_K_S | 44.20 | 3.4483 | 0.50%
-Q5_K_M | 45.41 | 3.4451 | 0.40%
-Q6_K | 52.70 | 3.4367 | 0.16%
-fp16 | 128.5 | 3.4313 | -
-
+| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
+|--------------|------------------|------------|---------------|
+| Q4_0 | 36.20 | 3.5550 | 3.61% |
+| Q4_1 | 40.20 | 3.5125 | 2.37% |
+| Q5_0 | 44.20 | 3.4744 | 1.26% |
+| Q2_K | 27.27 | 3.7339 | 8.82% |
+| Q3_K_S | 27.86 | 3.7019 | 7.89% |
+| Q3_K_M | 30.83 | 3.5932 | 4.72% |
+| Q3_K_L | 33.67 | 3.5617 | 3.80% |
+| Q4_K_S | 36.39 | 3.4852 | 1.57% |
+| Q4_K_M | 38.54 | 3.4725 | 1.20% |
+| Q5_K_S | 44.20 | 3.4483 | 0.50% |
+| Q5_K_M | 45.41 | 3.4451 | 0.40% |
+| Q6_K | 52.70 | 3.4367 | 0.16% |
+| fp16 | 128.5 | 3.4313 | - |
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
index c8b9a27a0..8a10365c0 100644
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -4,17 +4,17 @@ TODO
## Llama 2 7B
-Quantization | Bits per Weight (BPW)
--- | --
-Q2_K | 3.35
-Q3_K_S | 3.50
-Q3_K_M | 3.91
-Q3_K_L | 4.27
-Q4_K_S | 4.58
-Q4_K_M | 4.84
-Q5_K_S | 5.52
-Q5_K_M | 5.68
-Q6_K | 6.56
+| Quantization | Bits per Weight (BPW) |
+|--------------|-----------------------|
+| Q2_K | 3.35 |
+| Q3_K_S | 3.50 |
+| Q3_K_M | 3.91 |
+| Q3_K_L | 4.27 |
+| Q4_K_S | 4.58 |
+| Q4_K_M | 4.84 |
+| Q5_K_S | 5.52 |
+| Q5_K_M | 5.68 |
+| Q6_K | 6.56 |
## Llama 2 13B
Quantization | Bits per Weight (BPW)
diff --git a/llama.cpp b/llama.cpp
index e84a99912..8ce319e45 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1638,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
- ggml_backend_cuda_get_device_memory(device, &total, &free);
+ ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
- ggml_backend_sycl_get_device_memory(device, &total, &free);
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
- ggml_backend_vk_get_device_memory(device, &total, &free);
+ ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
@@ -11121,7 +11121,7 @@ struct llm_tokenizer_bpe {
add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
}
- // add the fnished tokens to the final list keeping correct order for next and prev
+ // add the finished tokens to the final list keeping correct order for next and prev
for (auto & sym : symbols) {
if (sym.n > 0) {
sym.prev = final_prev_index;
@@ -11861,7 +11861,9 @@ static void llama_grammar_advance_stack(
std::vector> & new_stacks) {
if (stack.empty()) {
- new_stacks.emplace_back(stack);
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ new_stacks.emplace_back(stack);
+ }
return;
}
@@ -11898,7 +11900,10 @@ static void llama_grammar_advance_stack(
}
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
- new_stacks.emplace_back(stack);
+ if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
+ // only add the stack if it's not a duplicate of one we already have
+ new_stacks.emplace_back(stack);
+ }
break;
default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -11912,12 +11917,13 @@ static void llama_grammar_advance_stack(
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
-std::vector> llama_grammar_accept(
+void llama_grammar_accept(
const std::vector> & rules,
const std::vector> & stacks,
- const uint32_t chr) {
+ const uint32_t chr,
+ std::vector> & new_stacks) {
- std::vector> new_stacks;
+ new_stacks.clear();
for (const auto & stack : stacks) {
if (stack.empty()) {
@@ -11936,8 +11942,6 @@ std::vector> llama_grammar_accept(
llama_grammar_advance_stack(rules, new_stack, new_stacks);
}
}
-
- return new_stacks;
}
static std::vector llama_grammar_reject_candidates(
@@ -11951,6 +11955,7 @@ static std::vector llama_grammar_reject_candidates_for_
const std::vector & candidates) {
std::vector rejects;
+ rejects.reserve(candidates.size());
if (stack.empty()) {
for (const auto & tok : candidates) {
@@ -11964,6 +11969,8 @@ static std::vector llama_grammar_reject_candidates_for_
const llama_grammar_element * stack_pos = stack.back();
std::vector next_candidates;
+ next_candidates.reserve(candidates.size());
+
for (const auto & tok : candidates) {
if (*tok.code_points == 0) {
// reached end of full codepoints in token, reject iff it ended in a partial sequence
@@ -12771,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
const auto & code_points = decoded.first;
+ std::vector> tmp_new_stacks;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
+ grammar->stacks = tmp_new_stacks;
}
grammar->partial_utf8 = decoded.second;
GGML_ASSERT(!grammar->stacks.empty());
diff --git a/llama.h b/llama.h
index b770a275f..b5da686f7 100644
--- a/llama.h
+++ b/llama.h
@@ -1097,10 +1097,11 @@ const std::vector> & llama_internal
struct llama_context * ctx
);
-std::vector> llama_grammar_accept(
+void llama_grammar_accept(
const std::vector> & rules,
const std::vector> & stacks,
- const uint32_t chr);
+ const uint32_t chr,
+ std::vector> & new_stacks);
std::pair, llama_partial_utf8> decode_utf8(
const std::string & src,
diff --git a/scripts/hf.sh b/scripts/hf.sh
index 1e9e5a6ea..58f83d6fe 100755
--- a/scripts/hf.sh
+++ b/scripts/hf.sh
@@ -3,9 +3,9 @@
# Shortcut for downloading HF models
#
# Usage:
-# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
-# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
#
# all logs go to stderr
@@ -14,7 +14,7 @@ function log {
}
function usage {
- log "Usage: $0 [[--url] ] [--repo ] [--file ] [-h|--help]"
+ # print the option synopsis through log, then exit with status 1
+ log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <outdir>] [-h|--help]"
exit 1
}
@@ -26,9 +26,9 @@ function has_cmd {
}
if has_cmd wget; then
- cmd="wget -q --show-progress -c -O %s %s"
+ cmd="wget -q --show-progress -c -O %s/%s %s"
elif has_cmd curl; then
- cmd="curl -C - -f -o %s -L %s"
+ cmd="curl -C - -f --output-dir %s -o %s -L %s"
else
log "[E] curl or wget not found"
exit 1
@@ -37,6 +37,7 @@ fi
url=""
repo=""
file=""
+outdir="."
# parse args
while [[ $# -gt 0 ]]; do
@@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
file="$2"
shift 2
;;
+ --outdir)
+ outdir="$2"
+ shift 2
+ ;;
-h|--help)
usage
;;
@@ -94,10 +99,10 @@ basename=$(basename $url)
log "[+] attempting to download $basename"
if [ -n "$cmd" ]; then
- cmd=$(printf "$cmd" "$basename" "$url")
+ cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
log "[+] $cmd"
if $cmd; then
- echo $basename
+ echo $outdir/$basename
exit 0
fi
fi
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 0a9c3b6f5..2d8f228e3 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -38,7 +38,7 @@ number ::= [0-9]+)""";
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
assert(!grammar->stacks.empty());
}
@@ -138,7 +138,7 @@ ws ::= [ \t\n\r]?)""";
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
++pos;
auto prev_stacks = grammar->stacks;
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
// Expect that each code point will not cause the grammar to fail
if (grammar->stacks.empty()) {
@@ -173,7 +173,7 @@ ws ::= [ \t\n\r]?)""";
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
- grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
parse_failed = true;
break;