From 1b496a745c315022df2d919374052e6004ced8d3 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Fri, 5 Apr 2024 14:35:06 +0100 Subject: [PATCH 1/8] [SYCL] Fixed minor bug when enabling FP16 for non intel targets (#6464) * moved INTEL_MKL guard from gemm_impl to gemm (wrapper) * Update ggml-sycl.cpp Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --------- Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> --- ggml-sycl.cpp | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 2b0e5f548..db3c24f60 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -1664,24 +1664,6 @@ namespace dpct const void *alpha, const void *a, int lda, const void *b, int ldb, const void *beta, void *c, int ldc) { -#ifndef __INTEL_MKL__ - GGML_UNUSED(q); - GGML_UNUSED(a_trans); - GGML_UNUSED(b_trans); - GGML_UNUSED(m); - GGML_UNUSED(n); - GGML_UNUSED(k); - GGML_UNUSED(alpha); - GGML_UNUSED(a); - GGML_UNUSED(lda); - GGML_UNUSED(b); - GGML_UNUSED(ldb); - GGML_UNUSED(beta); - GGML_UNUSED(c); - GGML_UNUSED(ldc); - throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " - "Project does not support this API."); -#else Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); auto data_a = get_memory(a); @@ -1690,7 +1672,6 @@ namespace dpct oneapi::mkl::blas::column_major::gemm( q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, beta_value, data_c, ldc); -#endif } template @@ -2330,6 +2311,7 @@ namespace dpct lda, b, ldb, beta, c, ldc); break; } +#ifdef __INTEL_MKL__ case detail::get_type_combination_id( library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): @@ -2391,6 +2373,7 @@ namespace dpct q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); break; } +#endif // __INTEL_MKL__ default: throw std::runtime_error("the combination of data type is unsupported"); } From 87e21bbacd830437ab653cf03b6f26d45c15395d Mon Sep 17 00:00:00 2001 From: Ting Sun Date: Sat, 6 Apr 2024 01:34:53 +0700 Subject: [PATCH 2/8] bench : make n_batch and n_ubatch configurable in Batched bench (#6500) * bench: make n_batch and n_ubatch configurable * bench: update doc for batched bench --- examples/batched-bench/README.md | 8 +++---- examples/batched-bench/batched-bench.cpp | 29 ++++++++++++++++-------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index 34b343f66..bf951baf7 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] +./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99 +./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99 +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99 # custom set of batches -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32 +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 19674dfd3..1e34de620 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -32,13 +32,15 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] \n" , argv[0]); + printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] \n" , argv[0]); printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); - printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); + printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); return 1 ; } int n_kv_max = 2048; + int n_batch = 2048; + int n_ubatch = 512; int is_pp_shared = 0; int n_gpu_layers = 0; @@ -56,23 +58,31 @@ int main(int argc, char ** argv) { } if (argc >= 4) { - is_pp_shared = std::atoi(argv[3]); + n_batch = std::atoi(argv[3]); } if (argc >= 5) { - n_gpu_layers = std::atoi(argv[4]); + n_ubatch = std::atoi(argv[4]); } if (argc >= 6) { - n_pp = parse_list(argv[5]); + is_pp_shared = std::atoi(argv[5]); } if (argc >= 7) { - n_tg = parse_list(argv[6]); + n_gpu_layers = std::atoi(argv[6]); } if (argc >= 8) { - n_pl = parse_list(argv[7]); + n_pp = parse_list(argv[7]); + } + + if (argc >= 9) { + n_tg = parse_list(argv[8]); + } + + if (argc >= 10) { + n_pl = parse_list(argv[9]); } // init LLM @@ -100,7 +110,8 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_max; - ctx_params.n_batch = 512; + ctx_params.n_batch = n_batch; + ctx_params.n_ubatch = n_ubatch; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -158,7 +169,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); From d0f5deebf898f8186a10148a03a56909ba05fc0b Mon Sep 17 00:00:00 2001 From: Hoang Nguyen Date: Fri, 5 Apr 2024 11:39:43 -0700 Subject: [PATCH 3/8] readme : update UI list (#6503) * Add MindMac to UI list * Update proprietary description Co-authored-by: slaren --------- Co-authored-by: slaren --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bb66b6c68..bd3f9cff5 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) - [Dot](https://github.com/alexpinel/Dot) (GPL) +- [MindMac](https://mindmac.app) (proprietary) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* From a8bd14d55717754a1f48313a846a2b16fa998ad2 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 6 Apr 2024 05:41:38 +1100 Subject: [PATCH 4/8] gguf.py : add licence and version to gguf writer (#6504) --- gguf-py/gguf/constants.py | 1 + gguf-py/gguf/gguf_writer.py | 6 ++++++ llama.cpp | 2 ++ 3 files changed, 9 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5214764a9..c44d8abeb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -24,6 +24,7 @@ class Keys: ALIGNMENT = "general.alignment" NAME = "general.name" AUTHOR = "general.author" + VERSION = "general.version" URL = "general.url" DESCRIPTION = "general.description" LICENSE = "general.license" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 2ae6c814b..f4c440766 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -296,6 +296,9 @@ class GGUFWriter: def add_author(self, author: str) -> None: self.add_string(Keys.General.AUTHOR, author) + def add_version(self, version: str) -> None: + self.add_string(Keys.General.VERSION, version) + def add_tensor_data_layout(self, layout: str) -> None: self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) @@ -305,6 +308,9 @@ class GGUFWriter: def add_description(self, description: str) -> None: self.add_string(Keys.General.DESCRIPTION, description) + def add_licence(self, licence: str) -> None: + self.add_string(Keys.General.LICENSE, licence) + def add_source_url(self, url: str) -> None: self.add_string(Keys.General.SOURCE_URL, url) diff --git a/llama.cpp b/llama.cpp index 9a1c11043..217726184 100644 --- a/llama.cpp +++ b/llama.cpp @@ -261,6 +261,7 @@ enum llm_kv { LLM_KV_GENERAL_ALIGNMENT, LLM_KV_GENERAL_NAME, LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_VERSION, LLM_KV_GENERAL_URL, LLM_KV_GENERAL_DESCRIPTION, LLM_KV_GENERAL_LICENSE, @@ -330,6 +331,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, { LLM_KV_GENERAL_NAME, "general.name" }, { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_VERSION, "general.version" }, { LLM_KV_GENERAL_URL, "general.url" }, { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, { LLM_KV_GENERAL_LICENSE, "general.license" }, From 75cd4c77292034ecec587ecb401366f57338f7c0 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 6 Apr 2024 05:40:47 +0200 Subject: [PATCH 5/8] ci: bench: support sse and fix prompt processing time / server: add tokens usage in stream OAI response (#6495) * ci: bench: support sse and fix prompt processing time server: add tokens usage in stream mode * ci: bench: README.md EOL * ci: bench: remove total pp and tg as it is not accurate * ci: bench: fix case when there is no token generated * ci: bench: change to the 95 percentile for pp and tg as it is closer to what the server exports in metrics * ci: bench: fix finish reason rate --- .github/workflows/bench.yml | 20 ++++++---- examples/server/bench/README.md | 42 +++++++++++++++++--- examples/server/bench/bench.py | 11 +++--- examples/server/bench/script.js | 68 +++++++++++++++++++++++---------- examples/server/utils.hpp | 9 +++++ 5 files changed, 112 insertions(+), 38 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b974e7fac..758796632 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -79,12 +79,18 @@ jobs: sleep 0.1 done - - name: Install k6 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.21' + + - name: Install k6 and xk6-sse id: k6_installation run: | cd examples/server/bench - wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz - tar xzf k6*.tar.gz --strip-components=1 + go install go.k6.io/xk6/cmd/xk6@latest + xk6 build master \ + --with github.com/phymbert/xk6-sse - name: Build id: cmake_build @@ -118,7 +124,7 @@ jobs: cd examples/server/bench source venv/bin/activate - BENCH_K6_BIN_PATH=./k6 python bench.py \ + python bench.py \ --runner-label ${{ env.RUNNER_LABEL }} \ --name ${{ github.job }} \ --branch ${{ github.head_ref || github.ref_name }} \ @@ -228,9 +234,9 @@ jobs: Expand details for performance related PR only - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s** - - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s** + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} + - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s + - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s - ${{ env.BENCH_GRAPH_XLABEL }} diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index a53ad64d7..23a3ec975 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -2,13 +2,15 @@ Benchmark is using [k6](https://k6.io/). -##### Install k6 +##### Install k6 and sse extension -Follow instruction from: https://k6.io/docs/get-started/installation/ +SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension. -Example for ubuntu: +Example: ```shell -snap install k6 +go install go.k6.io/xk6/cmd/xk6@latest +xk6 build master \ +--with github.com/phymbert/xk6-sse ``` #### Download a dataset @@ -46,7 +48,7 @@ server --host localhost --port 8080 \ For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run: ```shell -k6 run script.js --duration 10m --iterations 500 --vus 8 +./k6 run script.js --duration 10m --iterations 500 --vus 8 ``` The benchmark values can be overridden with: @@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with: ```shell curl http://localhost:8080/metrics ``` + +### Using the CI python script +The `bench.py` script does several steps: +- start the server +- define good variable for k6 +- run k6 script +- extract metrics from prometheus + +It aims to be used in the CI, but you can run it manually: + +```shell +LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \ + --runner-label local \ + --name local \ + --branch `git rev-parse --abbrev-ref HEAD` \ + --commit `git rev-parse HEAD` \ + --scenario script.js \ + --duration 5m \ + --hf-repo ggml-org/models \ + --hf-file phi-2/ggml-model-q4_0.gguf \ + --model-path-prefix models \ + --parallel 4 \ + -ngl 33 \ + --batch-size 2048 \ + --ubatch-size 256 \ + --ctx-size 4096 \ + --n-prompts 200 \ + --max-prompt-tokens 256 \ + --max-tokens 256 +``` diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 86eeeccf8..6ca637bdd 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None: data['metrics'][metric_name][metric_metric]=value github_env.write( f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n") - token_seconds = data['metrics']['llamacpp_tokens_second']['avg'] iterations = data['root_group']['checks']['success completion']['passes'] except Exception: @@ -181,16 +180,16 @@ xychart-beta bench_results = { "i": iterations, "req": { - "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2), + "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2), "avg": round(data['metrics']["http_req_duration"]["avg"], 2), }, "pp": { - "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2), - "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2), + "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2), + "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2), "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2), }, "tg": { - "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2), + "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2), "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2), }, @@ -206,7 +205,7 @@ xychart-beta def start_benchmark(args): - k6_path = 'k6' + k6_path = './k6' if 'BENCH_K6_BIN_PATH' in os.environ: k6_path = os.environ['BENCH_K6_BIN_PATH'] k6_args = [ diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index dc41e8d93..c4c486cdf 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -1,4 +1,4 @@ -import http from 'k6/http' +import sse from 'k6/x/sse' import {check, sleep} from 'k6' import {SharedArray} from 'k6/data' import {Counter, Rate, Trend} from 'k6/metrics' @@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () { const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') + const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') +const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -86,36 +88,62 @@ export default function () { } ], "model": model, - "stream": false, + "stream": true, "seed": 42, "max_tokens": max_tokens } - const body = JSON.stringify(payload) + const params = {method: 'POST', body: JSON.stringify(payload)}; - let res = http.post(`${server_url}/chat/completions`, body, { - headers: {'Content-Type': 'application/json'}, - timeout: '300s' + const startTime = new Date() + let promptEvalEndTime = null + let prompt_tokens = 0 + let completions_tokens = 0 + let finish_reason = null + const res = sse.open(`${server_url}/chat/completions`, params, function (client) { + client.on('event', function (event) { + if (promptEvalEndTime == null) { + promptEvalEndTime = new Date() + } + + let chunk = JSON.parse(event.data) + let choice = chunk.choices[0] + if (choice.finish_reason) { + finish_reason = choice.finish_reason + } + + if (chunk.usage) { + prompt_tokens = chunk.usage.prompt_tokens + llamacpp_prompt_tokens.add(prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(prompt_tokens) + + completions_tokens = chunk.usage.completion_tokens + llamacpp_completion_tokens.add(completions_tokens) + llamacpp_completion_tokens_total_counter.add(completions_tokens) + } + }) + + client.on('error', function (e) { + console.log('An unexpected error occurred: ', e.error()); + throw e; + }) }) check(res, {'success completion': (r) => r.status === 200}) - if (res.status === 200) { - const completions = res.json() + const endTime = new Date() - llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) - llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) - - llamacpp_completion_tokens.add(completions.usage.completion_tokens) - llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) - - llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') - llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') - - llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) - } else { - console.error(`response: ${res.body} request=${payload}`) + const promptEvalTime = promptEvalEndTime - startTime + if (promptEvalTime > 0) { + llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3) } + const completion_time = endTime - promptEvalEndTime + if (completions_tokens > 0 && completion_time > 0) { + llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3) + } + llamacpp_completions_truncated_rate.add(finish_reason === 'length') + llamacpp_completions_stop_rate.add(finish_reason === 'stop') + sleep(0.3) } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 47cc53c27..a8d43ac63 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -567,6 +567,15 @@ static std::vector format_partial_response_oaicompat(json result, const st {"model", modelname}, {"object", "chat.completion.chunk"} }; + if (!finish_reason.empty()) { + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + ret.push_back({"usage", json { + {"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens} + }}); + } return std::vector({ret}); } From 57dd02c44b2a0eb79e28f6c5eb8242a5d2d3174d Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Sat, 6 Apr 2024 10:31:33 -0400 Subject: [PATCH 6/8] Tests: Added integration tests for GBNF parser (#6472) * Added integration tests for GBNF parser to validate correctness of parsing, as well as correctness of string matching. Intended for use to pin behavior while working on performance improvements. * Fixing whitespace errors and cleaning error message alert to be clearer. * Removing hacky include to llama.cpp from grammar integration test now that needed functions are available via internal API. * Comment cleanup. * Reorganizing tests for readability. * Cleaning up debug message to make a bit more sense. --- Makefile | 6 +- tests/CMakeLists.txt | 1 + tests/test-grammar-integration.cpp | 243 +++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 tests/test-grammar-integration.cpp diff --git a/Makefile b/Makefile index bdd5ef335..11b31c5c8 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ TEST_TARGETS = \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \ - tests/test-json-schema-to-grammar + tests/test-json-schema-to-grammar tests/test-grammar-integration # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -918,6 +918,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a43439aed..b5d7bb59c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -59,6 +59,7 @@ llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 AR llama_test(test-grammar-parser.cpp) llama_test(test-llama-grammar.cpp) +llama_test(test-grammar-integration.cpp) llama_test(test-grad0.cpp) # llama_test(test-opt.cpp) # SLOW llama_test(test-backend-ops.cpp) diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp new file mode 100644 index 000000000..0a9c3b6f5 --- /dev/null +++ b/tests/test-grammar-integration.cpp @@ -0,0 +1,243 @@ +#ifdef NDEBUG +#undef NDEBUG +#endif + +#define LLAMA_API_INTERNAL + +#include "ggml.h" +#include "llama.h" +#include "grammar-parser.h" +#include "unicode.h" +#include +#include + +static void test_simple_grammar() { + // Test case for a simple grammar + const std::string grammar_str = R"""(root ::= expr +expr ::= term ("+" term)* +term ::= number +number ::= [0-9]+)"""; + + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we parsed correctly + assert(!parsed_grammar.rules.empty()); + + // Ensure we have a root node + assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end())); + + std::vector grammar_rules(parsed_grammar.c_rules()); + llama_grammar* grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + + std::string input = "123+456"; + + auto decoded = decode_utf8(input, {}); + + const auto & code_points = decoded.first; + + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + auto prev_stacks = grammar->stacks; + grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + assert(!grammar->stacks.empty()); + } + + bool completed_grammar = false; + + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + completed_grammar = true; + break; + } + } + + assert(completed_grammar); + + // Clean up allocated memory + llama_grammar_free(grammar); +} + +static void test_complex_grammar() { + // Test case for a more complex grammar, with both failure strings and success strings + const std::string grammar_str = R"""(root ::= expression +expression ::= term ws (("+"|"-") ws term)* +term ::= factor ws (("*"|"/") ws factor)* +factor ::= number | variable | "(" expression ")" | function-call +number ::= [0-9]+ +variable ::= [a-zA-Z_][a-zA-Z0-9_]* +function-call ::= variable ws "(" (expression ("," ws expression)*)? ")" +ws ::= [ \t\n\r]?)"""; + + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we parsed correctly + assert(!parsed_grammar.rules.empty()); + + // Ensure we have a root node + assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end())); + + std::vector grammar_rules(parsed_grammar.c_rules()); + llama_grammar* grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + + // Save the original grammar stacks so that we can reset after every new string we want to test + auto original_stacks = grammar->stacks; + + // Test a few strings + std::vector test_strings_pass = { + "42", + "1*2*3*4*5", + "x", + "x+10", + "x1+y2", + "(a+b)*(c-d)", + "func()", + "func(x,y+2)", + "a*(b+c)-d/e", + "f(g(x),h(y,z))", + "x + 10", + "x1 + y2", + "(a + b) * (c - d)", + "func()", + "func(x, y + 2)", + "a * (b + c) - d / e", + "f(g(x), h(y, z))", + "123+456", + "123*456*789-123/456+789*123", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456" + }; + + std::vector test_strings_fail = { + "+", + "/ 3x", + "x + + y", + "a * / b", + "func(,)", + "func(x y)", + "(a + b", + "x + y)", + "a + b * (c - d", + "42 +", + "x +", + "x + 10 +", + "(a + b) * (c - d", + "func(", + "func(x, y + 2", + "a * (b + c) - d /", + "f(g(x), h(y, z)", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/", + }; + + // Passing strings + for (const auto & test_string : test_strings_pass) { + auto decoded = decode_utf8(test_string, {}); + + const auto & code_points = decoded.first; + + int pos = 0; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + ++pos; + auto prev_stacks = grammar->stacks; + grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + + // Expect that each code point will not cause the grammar to fail + if (grammar->stacks.empty()) { + fprintf(stdout, "Error at position %d\n", pos); + fprintf(stderr, "Unexpected character '%s'\n", unicode_cpt_to_utf8(*it).c_str()); + fprintf(stderr, "Input string is %s:\n", test_string.c_str()); + } + assert(!grammar->stacks.empty()); + } + + bool completed_grammar = false; + + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + completed_grammar = true; + break; + } + } + + assert(completed_grammar); + + // Reset the grammar stacks + grammar->stacks = original_stacks; + } + + // Failing strings + for (const auto & test_string : test_strings_fail) { + auto decoded = decode_utf8(test_string, {}); + + const auto & code_points = decoded.first; + bool parse_failed = false; + + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + auto prev_stacks = grammar->stacks; + grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + if (grammar->stacks.empty()) { + parse_failed = true; + break; + } + assert(!grammar->stacks.empty()); + } + + bool completed_grammar = false; + + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + completed_grammar = true; + break; + } + } + + // Ensure that the grammar is not completed, or that each string failed to match as-expected + assert((!completed_grammar) || parse_failed); + + // Reset the grammar stacks + grammar->stacks = original_stacks; + } + + // Clean up allocated memory + llama_grammar_free(grammar); +} + +static void test_failure_missing_root() { + // Test case for a grammar that is missing a root rule + const std::string grammar_str = R"""(rot ::= expr +expr ::= term ("+" term)* +term ::= number +number ::= [0-9]+)"""; + + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we parsed correctly + assert(!parsed_grammar.rules.empty()); + + // Ensure we do NOT have a root node + assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()); +} + +static void test_failure_missing_reference() { + // Test case for a grammar that is missing a referenced rule + const std::string grammar_str = R"""(root ::= expr +expr ::= term ("+" term)* +term ::= numero +number ::= [0-9]+)"""; + + fprintf(stderr, "Expected error: "); + + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we did NOT parsed correctly + assert(parsed_grammar.rules.empty()); + + fprintf(stderr, "End of expected error. Test successful.\n"); +} + +int main() { + test_simple_grammar(); + test_complex_grammar(); + test_failure_missing_root(); + test_failure_missing_reference(); + return 0; +} From b66aec675c1571a06b3570b858ae711246f96f84 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 3 Apr 2024 22:57:20 +0200 Subject: [PATCH 7/8] backend : fix typo in scheduler documentation (ggml/781) Signed-off-by: Daniel Bevenius --- ggml-backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-backend.h b/ggml-backend.h index 422457ab6..744b6a774 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -137,7 +137,7 @@ extern "C" { /* Example usage: - // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned + // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned // preferrably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); From 54ea0698fbf87e36a5d68a98c95f6bdd0fb91557 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 6 Apr 2024 17:43:15 +0300 Subject: [PATCH 8/8] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7c30162e2..bbe7e3084 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -43a6d4af1971ee2912ff7bc2404011ff327b6a60 +8e413034b42e4fbedc2873166f61193b75f2622a