diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b974e7fac..758796632 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -79,12 +79,18 @@ jobs:
sleep 0.1
done
- - name: Install k6
+ - name: Set up Go
+ uses: actions/setup-go@v5
+ with:
+ go-version: '1.21'
+
+ - name: Install k6 and xk6-sse
id: k6_installation
run: |
cd examples/server/bench
- wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
- tar xzf k6*.tar.gz --strip-components=1
+ go install go.k6.io/xk6/cmd/xk6@latest
+ xk6 build master \
+ --with github.com/phymbert/xk6-sse
- name: Build
id: cmake_build
@@ -118,7 +124,7 @@ jobs:
cd examples/server/bench
source venv/bin/activate
- BENCH_K6_BIN_PATH=./k6 python bench.py \
+ python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
@@ -228,9 +234,9 @@ jobs:
Expand details for performance related PR only
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+ - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+ - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
- ${{ env.BENCH_GRAPH_XLABEL }}
diff --git a/Makefile b/Makefile
index bdd5ef335..11b31c5c8 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ TEST_TARGETS = \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
- tests/test-json-schema-to-grammar
+ tests/test-json-schema-to-grammar tests/test-grammar-integration
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -918,6 +918,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/README.md b/README.md
index 17699c358..9e3cf030c 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [MindMac](https://mindmac.app) (proprietary)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
index 34b343f66..bf951baf7 100644
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ]
+./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
# custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```
## Sample results
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 19674dfd3..1e34de620 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,13 +32,15 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] \n" , argv[0]);
+ printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] \n" , argv[0]);
printf(" , and PL are comma-separated lists of numbers without spaces\n\n");
- printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+ printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
+ int n_batch = 2048;
+ int n_ubatch = 512;
int is_pp_shared = 0;
int n_gpu_layers = 0;
@@ -56,23 +58,31 @@ int main(int argc, char ** argv) {
}
if (argc >= 4) {
- is_pp_shared = std::atoi(argv[3]);
+ n_batch = std::atoi(argv[3]);
}
if (argc >= 5) {
- n_gpu_layers = std::atoi(argv[4]);
+ n_ubatch = std::atoi(argv[4]);
}
if (argc >= 6) {
- n_pp = parse_list(argv[5]);
+ is_pp_shared = std::atoi(argv[5]);
}
if (argc >= 7) {
- n_tg = parse_list(argv[6]);
+ n_gpu_layers = std::atoi(argv[6]);
}
if (argc >= 8) {
- n_pl = parse_list(argv[7]);
+ n_pp = parse_list(argv[7]);
+ }
+
+ if (argc >= 9) {
+ n_tg = parse_list(argv[8]);
+ }
+
+ if (argc >= 10) {
+ n_pl = parse_list(argv[9]);
}
// init LLM
@@ -100,7 +110,8 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
- ctx_params.n_batch = 512;
+ ctx_params.n_batch = n_batch;
+ ctx_params.n_ubatch = n_ubatch;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -158,7 +169,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index a53ad64d7..23a3ec975 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,13 +2,15 @@
Benchmark is using [k6](https://k6.io/).
-##### Install k6
+##### Install k6 and the SSE extension
-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
-Example for ubuntu:
+Example:
```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
```
#### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \
For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
```
The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
```shell
curl http://localhost:8080/metrics
```
+
+### Using the CI Python script
+The `bench.py` script performs several steps:
+- start the server
+- define the right variables for k6
+- run the k6 script
+- extract metrics from prometheus
+
+It is mainly intended for the CI, but you can also run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+ --runner-label local \
+ --name local \
+ --branch `git rev-parse --abbrev-ref HEAD` \
+ --commit `git rev-parse HEAD` \
+ --scenario script.js \
+ --duration 5m \
+ --hf-repo ggml-org/models \
+ --hf-file phi-2/ggml-model-q4_0.gguf \
+ --model-path-prefix models \
+ --parallel 4 \
+ -ngl 33 \
+ --batch-size 2048 \
+ --ubatch-size 256 \
+ --ctx-size 4096 \
+ --n-prompts 200 \
+ --max-prompt-tokens 256 \
+ --max-tokens 256
+```
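+
+In the CI, `bench.py` then reads the k6 JSON results and exports each metric statistic (avg, p(95), ...) for the workflow. Below is a minimal sketch of pulling the same values by hand, assuming k6 was run with `--summary-export=k6-results.json` (the file name is only an example):
+
+```python
+# Minimal sketch: read a k6 summary export and print the statistics that
+# bench.py consumes. The file name and the metric selection are illustrative.
+import json
+
+with open("k6-results.json") as f:
+    summary = json.load(f)
+
+for name in ("http_req_duration",
+             "llamacpp_prompt_processing_second",
+             "llamacpp_tokens_second"):
+    metric = summary["metrics"].get(name, {})
+    print(f"{name}: avg={metric.get('avg')} p(95)={metric.get('p(95)')}")
+```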
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 86eeeccf8..6ca637bdd 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
data['metrics'][metric_name][metric_metric]=value
github_env.write(
f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
- token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
iterations = data['root_group']['checks']['success completion']['passes']
except Exception:
@@ -181,16 +180,16 @@ xychart-beta
bench_results = {
"i": iterations,
"req": {
- "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+ "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
"avg": round(data['metrics']["http_req_duration"]["avg"], 2),
},
"pp": {
- "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
- "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+ "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+ "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
},
"tg": {
- "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+ "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
},
@@ -206,7 +205,7 @@ xychart-beta
def start_benchmark(args):
- k6_path = 'k6'
+ k6_path = './k6'
if 'BENCH_K6_BIN_PATH' in os.environ:
k6_path = os.environ['BENCH_K6_BIN_PATH']
k6_args = [
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index dc41e8d93..c4c486cdf 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
import {check, sleep} from 'k6'
import {SharedArray} from 'k6/data'
import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,62 @@ export default function () {
}
],
"model": model,
- "stream": false,
+ "stream": true,
"seed": 42,
"max_tokens": max_tokens
}
- const body = JSON.stringify(payload)
+ const params = {method: 'POST', body: JSON.stringify(payload)};
- let res = http.post(`${server_url}/chat/completions`, body, {
- headers: {'Content-Type': 'application/json'},
- timeout: '300s'
+ const startTime = new Date()
+ let promptEvalEndTime = null
+ let prompt_tokens = 0
+ let completions_tokens = 0
+ let finish_reason = null
+ const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+ client.on('event', function (event) {
+ if (promptEvalEndTime == null) {
+ promptEvalEndTime = new Date()
+ }
+
+ let chunk = JSON.parse(event.data)
+ let choice = chunk.choices[0]
+ if (choice.finish_reason) {
+ finish_reason = choice.finish_reason
+ }
+
+ if (chunk.usage) {
+ prompt_tokens = chunk.usage.prompt_tokens
+ llamacpp_prompt_tokens.add(prompt_tokens)
+ llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+ completions_tokens = chunk.usage.completion_tokens
+ llamacpp_completion_tokens.add(completions_tokens)
+ llamacpp_completion_tokens_total_counter.add(completions_tokens)
+ }
+ })
+
+ client.on('error', function (e) {
+ console.log('An unexpected error occurred: ', e.error());
+ throw e;
+ })
})
check(res, {'success completion': (r) => r.status === 200})
- if (res.status === 200) {
- const completions = res.json()
+ const endTime = new Date()
- llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
- llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
- llamacpp_completion_tokens.add(completions.usage.completion_tokens)
- llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
- llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
- llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
- llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
- } else {
- console.error(`response: ${res.body} request=${payload}`)
+ const promptEvalTime = promptEvalEndTime - startTime
+ if (promptEvalTime > 0) {
+ llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
}
+ const completion_time = endTime - promptEvalEndTime
+ if (completions_tokens > 0 && completion_time > 0) {
+ llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+ }
+ llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+ llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
sleep(0.3)
}
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 47cc53c27..a8d43ac63 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -567,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
{"model", modelname},
{"object", "chat.completion.chunk"}
};
+ if (!finish_reason.empty()) {
+ int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+ int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+ ret.push_back({"usage", json {
+ {"completion_tokens", num_tokens_predicted},
+ {"prompt_tokens", num_prompt_tokens},
+ {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+ }});
+ }
return std::vector<json>({ret});
}
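Taken together, the script.js and utils.hpp changes above mean a streaming client can time prompt processing from the first SSE event and token generation from the remaining events, then read the token counts from the `usage` object attached to the final chunk. A minimal sketch of the same measurement in Python, assuming a llama.cpp server on `localhost:8080` exposing the OAI-compatible `/v1/chat/completions` route (URL, model name and prompt are placeholders):

```python
# Minimal sketch mirroring the k6 script: stream a chat completion, time the
# prompt-processing and generation phases, and read token counts from the
# `usage` object on the final chunk. Endpoint, model and prompt are assumptions.
import json
import time
import requests

payload = {
    "messages": [{"role": "user", "content": "Say hello."}],
    "model": "my-model",          # placeholder
    "stream": True,
    "max_tokens": 64,
}

start = time.time()
prompt_eval_end = None
usage = None
finish_reason = None

with requests.post("http://localhost:8080/v1/chat/completions",
                   json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line.startswith(b"data: "):
            continue
        data = line[len(b"data: "):]
        if data == b"[DONE]":
            break
        if prompt_eval_end is None:
            prompt_eval_end = time.time()   # first event: prompt evaluation finished
        chunk = json.loads(data)
        if chunk.get("choices"):
            reason = chunk["choices"][0].get("finish_reason")
            if reason:
                finish_reason = reason
        if chunk.get("usage"):              # attached to the final chunk
            usage = chunk["usage"]

end = time.time()
if usage and prompt_eval_end and end > prompt_eval_end:
    pp = usage["prompt_tokens"] / (prompt_eval_end - start)
    tg = usage["completion_tokens"] / (end - prompt_eval_end)
    print(f"finish_reason={finish_reason} pp={pp:.1f} tk/s tg={tg:.1f} tk/s")
```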
diff --git a/ggml-backend.h b/ggml-backend.h
index 422457ab6..744b6a774 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -137,7 +137,7 @@ extern "C" {
/*
Example usage:
- // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 2b0e5f548..db3c24f60 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -1664,24 +1664,6 @@ namespace dpct
const void *alpha, const void *a, int lda, const void *b,
int ldb, const void *beta, void *c, int ldc)
{
-#ifndef __INTEL_MKL__
- GGML_UNUSED(q);
- GGML_UNUSED(a_trans);
- GGML_UNUSED(b_trans);
- GGML_UNUSED(m);
- GGML_UNUSED(n);
- GGML_UNUSED(k);
- GGML_UNUSED(alpha);
- GGML_UNUSED(a);
- GGML_UNUSED(lda);
- GGML_UNUSED(b);
- GGML_UNUSED(ldb);
- GGML_UNUSED(beta);
- GGML_UNUSED(c);
- GGML_UNUSED(ldc);
- throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
- "Project does not support this API.");
-#else
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
auto data_a = get_memory<const Ta>(a);
@@ -1690,7 +1672,6 @@ namespace dpct
oneapi::mkl::blas::column_major::gemm(
q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
data_b, ldb, beta_value, data_c, ldc);
-#endif
}
template
@@ -2330,6 +2311,7 @@ namespace dpct
lda, b, ldb, beta, c, ldc);
break;
}
+#ifdef __INTEL_MKL__
case detail::get_type_combination_id(
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
@@ -2391,6 +2373,7 @@ namespace dpct
q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
break;
}
+#endif // __INTEL_MKL__
default:
throw std::runtime_error("the combination of data type is unsupported");
}
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ce697f714..f6ade5b22 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -24,6 +24,7 @@ class Keys:
ALIGNMENT = "general.alignment"
NAME = "general.name"
AUTHOR = "general.author"
+ VERSION = "general.version"
URL = "general.url"
DESCRIPTION = "general.description"
LICENSE = "general.license"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2ae6c814b..f4c440766 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -296,6 +296,9 @@ class GGUFWriter:
def add_author(self, author: str) -> None:
self.add_string(Keys.General.AUTHOR, author)
+ def add_version(self, version: str) -> None:
+ self.add_string(Keys.General.VERSION, version)
+
def add_tensor_data_layout(self, layout: str) -> None:
self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
@@ -305,6 +308,9 @@ class GGUFWriter:
def add_description(self, description: str) -> None:
self.add_string(Keys.General.DESCRIPTION, description)
+ def add_licence(self, licence: str) -> None:
+ self.add_string(Keys.General.LICENSE, licence)
+
def add_source_url(self, url: str) -> None:
self.add_string(Keys.General.SOURCE_URL, url)
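The two new setters map to the `general.version` and `general.license` keys. A minimal sketch of emitting them with gguf-py, assuming the existing `GGUFWriter` write/close flow (file name, architecture and metadata values are placeholders):

```python
# Minimal sketch: write the new general.version / general.license keys with
# gguf-py. File name, architecture and metadata values are placeholders.
from gguf import GGUFWriter

writer = GGUFWriter("example.gguf", "llama")
writer.add_author("Example Author")
writer.add_version("v1.0")              # general.version
writer.add_licence("MIT")               # general.license
writer.add_description("Metadata-only example file")

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()          # no tensors added in this sketch
writer.close()
```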
diff --git a/llama.cpp b/llama.cpp
index 8e6580877..7347ae655 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -263,6 +263,7 @@ enum llm_kv {
LLM_KV_GENERAL_ALIGNMENT,
LLM_KV_GENERAL_NAME,
LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_VERSION,
LLM_KV_GENERAL_URL,
LLM_KV_GENERAL_DESCRIPTION,
LLM_KV_GENERAL_LICENSE,
@@ -332,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
{ LLM_KV_GENERAL_NAME, "general.name" },
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_VERSION, "general.version" },
{ LLM_KV_GENERAL_URL, "general.url" },
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
{ LLM_KV_GENERAL_LICENSE, "general.license" },
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 7c30162e2..bbe7e3084 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-43a6d4af1971ee2912ff7bc2404011ff327b6a60
+8e413034b42e4fbedc2873166f61193b75f2622a
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a43439aed..b5d7bb59c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -59,6 +59,7 @@ llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 AR
llama_test(test-grammar-parser.cpp)
llama_test(test-llama-grammar.cpp)
+llama_test(test-grammar-integration.cpp)
llama_test(test-grad0.cpp)
# llama_test(test-opt.cpp) # SLOW
llama_test(test-backend-ops.cpp)
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
new file mode 100644
index 000000000..0a9c3b6f5
--- /dev/null
+++ b/tests/test-grammar-integration.cpp
@@ -0,0 +1,243 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#define LLAMA_API_INTERNAL
+
+#include "ggml.h"
+#include "llama.h"
+#include "grammar-parser.h"
+#include "unicode.h"
+#include <cassert>
+#include <string>
+
+static void test_simple_grammar() {
+ // Test case for a simple grammar
+ const std::string grammar_str = R"""(root ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";
+
+ grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+ // Ensure we parsed correctly
+ assert(!parsed_grammar.rules.empty());
+
+ // Ensure we have a root node
+ assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()));
+
+ std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+ llama_grammar* grammar = llama_grammar_init(
+ grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+ std::string input = "123+456";
+
+ auto decoded = decode_utf8(input, {});
+
+ const auto & code_points = decoded.first;
+
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ auto prev_stacks = grammar->stacks;
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ assert(!grammar->stacks.empty());
+ }
+
+ bool completed_grammar = false;
+
+ for (const auto & stack : grammar->stacks) {
+ if (stack.empty()) {
+ completed_grammar = true;
+ break;
+ }
+ }
+
+ assert(completed_grammar);
+
+ // Clean up allocated memory
+ llama_grammar_free(grammar);
+}
+
+static void test_complex_grammar() {
+ // Test case for a more complex grammar, with both failure strings and success strings
+ const std::string grammar_str = R"""(root ::= expression
+expression ::= term ws (("+"|"-") ws term)*
+term ::= factor ws (("*"|"/") ws factor)*
+factor ::= number | variable | "(" expression ")" | function-call
+number ::= [0-9]+
+variable ::= [a-zA-Z_][a-zA-Z0-9_]*
+function-call ::= variable ws "(" (expression ("," ws expression)*)? ")"
+ws ::= [ \t\n\r]?)""";
+
+ grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+ // Ensure we parsed correctly
+ assert(!parsed_grammar.rules.empty());
+
+ // Ensure we have a root node
+ assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()));
+
+ std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+ llama_grammar* grammar = llama_grammar_init(
+ grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+ // Save the original grammar stacks so that we can reset after every new string we want to test
+ auto original_stacks = grammar->stacks;
+
+ // Test a few strings
+ std::vector<std::string> test_strings_pass = {
+ "42",
+ "1*2*3*4*5",
+ "x",
+ "x+10",
+ "x1+y2",
+ "(a+b)*(c-d)",
+ "func()",
+ "func(x,y+2)",
+ "a*(b+c)-d/e",
+ "f(g(x),h(y,z))",
+ "x + 10",
+ "x1 + y2",
+ "(a + b) * (c - d)",
+ "func()",
+ "func(x, y + 2)",
+ "a * (b + c) - d / e",
+ "f(g(x), h(y, z))",
+ "123+456",
+ "123*456*789-123/456+789*123",
+ "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456"
+ };
+
+ std::vector<std::string> test_strings_fail = {
+ "+",
+ "/ 3x",
+ "x + + y",
+ "a * / b",
+ "func(,)",
+ "func(x y)",
+ "(a + b",
+ "x + y)",
+ "a + b * (c - d",
+ "42 +",
+ "x +",
+ "x + 10 +",
+ "(a + b) * (c - d",
+ "func(",
+ "func(x, y + 2",
+ "a * (b + c) - d /",
+ "f(g(x), h(y, z)",
+ "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
+ };
+
+ // Passing strings
+ for (const auto & test_string : test_strings_pass) {
+ auto decoded = decode_utf8(test_string, {});
+
+ const auto & code_points = decoded.first;
+
+ int pos = 0;
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ ++pos;
+ auto prev_stacks = grammar->stacks;
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+
+ // Expect that each code point will not cause the grammar to fail
+ if (grammar->stacks.empty()) {
+ fprintf(stdout, "Error at position %d\n", pos);
+ fprintf(stderr, "Unexpected character '%s'\n", unicode_cpt_to_utf8(*it).c_str());
+ fprintf(stderr, "Input string is %s:\n", test_string.c_str());
+ }
+ assert(!grammar->stacks.empty());
+ }
+
+ bool completed_grammar = false;
+
+ for (const auto & stack : grammar->stacks) {
+ if (stack.empty()) {
+ completed_grammar = true;
+ break;
+ }
+ }
+
+ assert(completed_grammar);
+
+ // Reset the grammar stacks
+ grammar->stacks = original_stacks;
+ }
+
+ // Failing strings
+ for (const auto & test_string : test_strings_fail) {
+ auto decoded = decode_utf8(test_string, {});
+
+ const auto & code_points = decoded.first;
+ bool parse_failed = false;
+
+ for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+ auto prev_stacks = grammar->stacks;
+ grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+ if (grammar->stacks.empty()) {
+ parse_failed = true;
+ break;
+ }
+ assert(!grammar->stacks.empty());
+ }
+
+ bool completed_grammar = false;
+
+ for (const auto & stack : grammar->stacks) {
+ if (stack.empty()) {
+ completed_grammar = true;
+ break;
+ }
+ }
+
+ // Ensure that the grammar is not completed, or that each string failed to match as expected
+ assert((!completed_grammar) || parse_failed);
+
+ // Reset the grammar stacks
+ grammar->stacks = original_stacks;
+ }
+
+ // Clean up allocated memory
+ llama_grammar_free(grammar);
+}
+
+static void test_failure_missing_root() {
+ // Test case for a grammar that is missing a root rule
+ const std::string grammar_str = R"""(rot ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";
+
+ grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+ // Ensure we parsed correctly
+ assert(!parsed_grammar.rules.empty());
+
+ // Ensure we do NOT have a root node
+ assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end());
+}
+
+static void test_failure_missing_reference() {
+ // Test case for a grammar that is missing a referenced rule
+ const std::string grammar_str = R"""(root ::= expr
+expr ::= term ("+" term)*
+term ::= numero
+number ::= [0-9]+)""";
+
+ fprintf(stderr, "Expected error: ");
+
+ grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+ // Ensure we did NOT parse correctly
+ assert(parsed_grammar.rules.empty());
+
+ fprintf(stderr, "End of expected error. Test successful.\n");
+}
+
+int main() {
+ test_simple_grammar();
+ test_complex_grammar();
+ test_failure_missing_root();
+ test_failure_missing_reference();
+ return 0;
+}