Merge branch 'master' into fix_memcpy_crash

This commit is contained in:
Neo Zhang Jianyu 2024-04-12 16:55:03 +08:00 committed by GitHub
commit 7e54166562
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
24 changed files with 469 additions and 152 deletions

View file

@ -52,7 +52,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L 'main|curl' --verbose --timeout 900
- name: Determine tag name
id: tag
@ -101,7 +101,9 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -209,21 +211,21 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@ -938,6 +940,12 @@ jobs:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
with:
path: ./artifact
- name: Move artifacts
id: move_artifacts
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
- name: Create release
id: create_release
@ -956,7 +964,7 @@ jobs:
const path = require('path');
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./artifact')) {
for (let file of await fs.readdirSync('./artifact/release')) {
if (path.extname(file) === '.zip') {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
@ -964,7 +972,7 @@ jobs:
repo: context.repo.repo,
release_id: release_id,
name: file,
data: await fs.readFileSync(`./artifact/${file}`)
data: await fs.readFileSync(`./artifact/release/${file}`)
});
}
}

1
.gitignore vendored
View file

@ -48,6 +48,7 @@ models-mnt
/convert-llama2c-to-ggml
/embd-input-test
/embedding
/eval-callback
/gguf
/gguf-llama-simple
/gguf-split

View file

@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@ -646,7 +646,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
ifndef CUDA_POWER_ARCH
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View file

@ -8,9 +8,9 @@
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
- [Known Issue](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)
## Background
@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|
| OS | Status | Verified |
|---------|---------|------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
| Windows | Support | Windows 11 |
## Hardware
@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
**Verified devices**
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550, 1100|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
| Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
@ -88,10 +88,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
**Verified devices**
|Nvidia GPU| Status | Verified Model|
|-|-|-|
|Ampere Series| Support| A100, A4000|
|Ampere Series *(Mobile)*| Support| RTX 40 Series|
| Nvidia GPU | Status | Verified Model |
|--------------------------|---------|----------------|
| Ampere Series | Support | A100, A4000 |
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
## Docker
The docker build option is currently limited to *intel GPU* targets.
@ -161,7 +161,7 @@ Platform #0: Intel(R) OpenCL HD Graphics
- **Nvidia GPU**
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
2. **Install Intel® oneAPI Base toolkit**
@ -295,10 +295,10 @@ found 6 SYCL devices:
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero driver/runtime, recommended |
|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference
@ -307,10 +307,10 @@ There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
| Device selection | Parameter |
|------------------|----------------------------------------|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
@ -468,10 +468,10 @@ found 6 SYCL devices:
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference
@ -481,10 +481,10 @@ There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|Device selection|Parameter|
|-|-|
|Single device|--split-mode none --main-gpu DEVICE_ID |
|Multiple devices|--split-mode layer (default)|
| Device selection | Parameter |
|------------------|----------------------------------------|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
@ -522,20 +522,20 @@ use 1 SYCL GPUs: [0] with Max compute units:512
#### Build
|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
## Known Issues
@ -567,6 +567,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
## Todo
## TODO
- Support row layer split for multiple card runs.

View file

@ -485,14 +485,14 @@ Building the program with BLAS support may lead to some performance improvements
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
- #### hipBLAS
@ -534,11 +534,11 @@ Building the program with BLAS support may lead to some performance improvements
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
@ -746,11 +746,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (Q4_0) |
|------:|--------------:|-----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
|------:|--------------:|----------------------:|
| 7B | 13 GB | 3.9 GB |
| 13B | 24 GB | 7.8 GB |
| 30B | 60 GB | 19.5 GB |
| 65B | 120 GB | 38.5 GB |
### Quantization
@ -758,7 +758,7 @@ Several quantization methods are supported. They differ in the resulting model d
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
| 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G |

View file

@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your
1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability

View file

@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.defrag_thold = params.defrag_thold;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@ -2192,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
{
if (params.warmup) {
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };

View file

@ -80,6 +80,9 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@ -156,6 +159,7 @@ struct gpt_params {
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V

View file

@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
## GGUF specification
https://github.com/ggerganov/ggml/blob/master/docs/gguf.md

View file

@ -19,6 +19,7 @@ else()
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(gguf-split)

View file

@ -0,0 +1,9 @@
set(TARGET eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View file

@ -0,0 +1,95 @@
# llama.cpp/examples/eval-callback
A simple example which demonstrates how to use callback during the inference.
It simply prints to the console all operations and tensor data.
Usage:
```shell
eval-callback \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \
--prompt hello \
--seed 42 \
-ngl 33
```
Will print:
```shell
llm_load_tensors: offloaded 33/33 layers to GPU
...
llama_new_context_with_model: n_ctx = 512
...
llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
llama_new_context_with_model: graph nodes = 1225
llama_new_context_with_model: graph splits = 2
ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.0181, 0.0272, 0.0272, ...],
],
]
ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -0.6989, 1.0636, 1.0636, ...],
],
]
ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1800, 0.2817, 0.2632, ...],
],
]
ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
[
[
[ -0.1863, 0.2970, 0.2604, ...],
],
]
ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1238, 1.2876, -1.8086, ...],
],
]
ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
],
]
ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
[
[
[ -1.1135, 1.4604, -1.9226, ...],
[ -0.3608, 0.5076, -1.8866, ...],
[ 1.7643, 0.0273, -2.1065, ...],
...
],
]
```

View file

@ -0,0 +1,185 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>
/**
* This the arbitrary data which will be passed to each callback.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
std::vector<uint8_t> data;
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) {
printf(" [\n");
for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) {
printf(" [");
for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
} else if (type == GGML_TYPE_F32) {
v = *(float *) data + i;
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) data + i;
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) data + i;
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) data + i;
} else {
GGML_ASSERT(false);
}
printf("%8.4f", v);
sum += v;
if (i0 < ne[0] - 1 && i0 < n - 1) printf(", ");
}
if (ne[0] > n) printf(", ...");
printf("],\n");
}
if (ne[1] > n) printf(" ...\n");
printf(" ],\n");
}
if (ne[2] > n) printf(" ...\n");
printf(" ]\n");
printf(" sum = %f\n", sum);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
char src1_str[128] = {0};
if (src1) {
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
int main(int argc, char ** argv) {
callback_data cb_data;
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
bool OK = run(ctx, params);
if (!OK) {
return 1;
}
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View file

@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";

View file

@ -107,9 +107,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
@ -597,24 +595,18 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
llama_context_params cparams = llama_context_params_from_gpt_params(params);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
cparams.cb_eval = ik_collect_imatrix;
cparams.cb_eval_user_data = NULL;
params.cb_eval = ik_collect_imatrix;
params.cb_eval_user_data = NULL;
params.warmup = false;
llama_context * ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to create context\n", __func__);
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}

View file

@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## Model conversion
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B

View file

@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:
## LLaVA 1.5
- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b

View file

@ -310,7 +310,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### Quantization
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
## Additional Options

View file

@ -3,19 +3,18 @@
TODO
## Llama 2 70B Scorechart
Quantization | Model size (GiB) | Perplexity | Delta to fp16
-- | -- | -- | --
Q4_0 | 36.20 | 3.5550 | 3.61%
Q4_1 | 40.20 | 3.5125 | 2.37%
Q5_0 | 44.20 | 3.4744 | 1.26%
Q2_K | 27.27 | 3.7339 | 8.82%
Q3_K_S | 27.86 | 3.7019 | 7.89%
Q3_K_M | 30.83 | 3.5932 | 4.72%
Q3_K_L | 33.67 | 3.5617 | 3.80%
Q4_K_S | 36.39 | 3.4852 | 1.57%
Q4_K_M | 38.54 | 3.4725 | 1.20%
Q5_K_S | 44.20 | 3.4483 | 0.50%
Q5_K_M | 45.41 | 3.4451 | 0.40%
Q6_K | 52.70 | 3.4367 | 0.16%
fp16 | 128.5 | 3.4313 | -
| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
|--------------|------------------|------------|---------------|
| Q4_0 | 36.20 | 3.5550 | 3.61% |
| Q4_1 | 40.20 | 3.5125 | 2.37% |
| Q5_0 | 44.20 | 3.4744 | 1.26% |
| Q2_K | 27.27 | 3.7339 | 8.82% |
| Q3_K_S | 27.86 | 3.7019 | 7.89% |
| Q3_K_M | 30.83 | 3.5932 | 4.72% |
| Q3_K_L | 33.67 | 3.5617 | 3.80% |
| Q4_K_S | 36.39 | 3.4852 | 1.57% |
| Q4_K_M | 38.54 | 3.4725 | 1.20% |
| Q5_K_S | 44.20 | 3.4483 | 0.50% |
| Q5_K_M | 45.41 | 3.4451 | 0.40% |
| Q6_K | 52.70 | 3.4367 | 0.16% |
| fp16 | 128.5 | 3.4313 | - |

View file

@ -4,17 +4,17 @@ TODO
## Llama 2 7B
Quantization | Bits per Weight (BPW)
-- | --
Q2_K | 3.35
Q3_K_S | 3.50
Q3_K_M | 3.91
Q3_K_L | 4.27
Q4_K_S | 4.58
Q4_K_M | 4.84
Q5_K_S | 5.52
Q5_K_M | 5.68
Q6_K | 6.56
| Quantization | Bits per Weight (BPW) |
|--------------|-----------------------|
| Q2_K | 3.35 |
| Q3_K_S | 3.50 |
| Q3_K_M | 3.91 |
| Q3_K_L | 4.27 |
| Q4_K_S | 4.58 |
| Q4_K_M | 4.84 |
| Q5_K_S | 5.52 |
| Q5_K_M | 5.68 |
| Q6_K | 6.56 |
## Llama 2 13B
Quantization | Bits per Weight (BPW)

View file

@ -1638,17 +1638,17 @@ static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &total, &free);
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &total, &free);
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &total, &free);
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
@ -11121,7 +11121,7 @@ struct llm_tokenizer_bpe {
add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
}
// add the fnished tokens to the final list keeping correct order for next and prev
// add the finished tokens to the final list keeping correct order for next and prev
for (auto & sym : symbols) {
if (sym.n > 0) {
sym.prev = final_prev_index;
@ -11861,7 +11861,9 @@ static void llama_grammar_advance_stack(
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
if (stack.empty()) {
new_stacks.emplace_back(stack);
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
new_stacks.emplace_back(stack);
}
return;
}
@ -11898,7 +11900,10 @@ static void llama_grammar_advance_stack(
}
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
new_stacks.emplace_back(stack);
if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
// only add the stack if it's not a duplicate of one we already have
new_stacks.emplace_back(stack);
}
break;
default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@ -11912,12 +11917,13 @@ static void llama_grammar_advance_stack(
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr) {
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
new_stacks.clear();
for (const auto & stack : stacks) {
if (stack.empty()) {
@ -11936,8 +11942,6 @@ std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
llama_grammar_advance_stack(rules, new_stack, new_stacks);
}
}
return new_stacks;
}
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
@ -11951,6 +11955,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
const std::vector<llama_grammar_candidate> & candidates) {
std::vector<llama_grammar_candidate> rejects;
rejects.reserve(candidates.size());
if (stack.empty()) {
for (const auto & tok : candidates) {
@ -11964,6 +11969,8 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
const llama_grammar_element * stack_pos = stack.back();
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.reserve(candidates.size());
for (const auto & tok : candidates) {
if (*tok.code_points == 0) {
// reached end of full codepoints in token, reject iff it ended in a partial sequence
@ -12771,8 +12778,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
const auto & code_points = decoded.first;
std::vector<std::vector<const llama_grammar_element *>> tmp_new_stacks;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks);
grammar->stacks = tmp_new_stacks;
}
grammar->partial_utf8 = decoded.second;
GGML_ASSERT(!grammar->stacks.empty());

View file

@ -1097,10 +1097,11 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
struct llama_context * ctx
);
std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr);
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,

View file

@ -3,9 +3,9 @@
# Shortcut for downloading HF models
#
# Usage:
# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
#
# all logs go to stderr
@ -14,7 +14,7 @@ function log {
}
function usage {
log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [-h|--help]"
log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
exit 1
}
@ -26,9 +26,9 @@ function has_cmd {
}
if has_cmd wget; then
cmd="wget -q --show-progress -c -O %s %s"
cmd="wget -q --show-progress -c -O %s/%s %s"
elif has_cmd curl; then
cmd="curl -C - -f -o %s -L %s"
cmd="curl -C - -f --output-dir %s -o %s -L %s"
else
log "[E] curl or wget not found"
exit 1
@ -37,6 +37,7 @@ fi
url=""
repo=""
file=""
outdir="."
# parse args
while [[ $# -gt 0 ]]; do
@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do
file="$2"
shift 2
;;
--outdir)
outdir="$2"
shift 2
;;
-h|--help)
usage
;;
@ -94,10 +99,10 @@ basename=$(basename $url)
log "[+] attempting to download $basename"
if [ -n "$cmd" ]; then
cmd=$(printf "$cmd" "$basename" "$url")
cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
log "[+] $cmd"
if $cmd; then
echo $basename
echo $outdir/$basename
exit 0
fi
fi

View file

@ -38,7 +38,7 @@ number ::= [0-9]+)""";
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
assert(!grammar->stacks.empty());
}
@ -138,7 +138,7 @@ ws ::= [ \t\n\r]?)""";
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
++pos;
auto prev_stacks = grammar->stacks;
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
// Expect that each code point will not cause the grammar to fail
if (grammar->stacks.empty()) {
@ -173,7 +173,7 @@ ws ::= [ \t\n\r]?)""";
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
parse_failed = true;
break;