Merge remote-tracking branch 'origin/master' into hp/model/support-dbrx

Pierrick HYMBERT 2024-04-06 23:53:11 +02:00
commit 69856297b9
17 changed files with 399 additions and 73 deletions

View file

@@ -79,12 +79,18 @@ jobs:
           sleep 0.1
         done

-      - name: Install k6
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
         id: k6_installation
         run: |
           cd examples/server/bench
-          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
-          tar xzf k6*.tar.gz --strip-components=1
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+            --with github.com/phymbert/xk6-sse

       - name: Build
         id: cmake_build
@@ -118,7 +124,7 @@ jobs:
           cd examples/server/bench
           source venv/bin/activate
-          BENCH_K6_BIN_PATH=./k6 python bench.py \
+          python bench.py \
             --runner-label ${{ env.RUNNER_LABEL }} \
             --name ${{ github.job }} \
             --branch ${{ github.head_ref || github.ref_name }} \
@@ -228,9 +234,9 @@ jobs:
             <summary>Expand details for performance related PR only</summary>

             - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
-            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
             - ${{ env.BENCH_GRAPH_XLABEL }}

View file

@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar
+	tests/test-json-schema-to-grammar tests/test-grammar-integration

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -918,6 +918,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View file

@@ -182,6 +182,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [MindMac](https://mindmac.app) (proprietary)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

View file

@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99

 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
 ```

 ## Sample results
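
As a quick sanity check of the `N_KV` arithmetic described above, here is a small sketch (not part of the diff; the values are arbitrary):

```python
# KV cache required by a batched-bench run, following the two modes above.
PP, TG, B = 512, 128, 8   # arbitrary example values

n_kv_not_shared = B * (PP + TG)  # every batch carries its own prompt
n_kv_shared     = PP + B * TG    # one common prompt of size PP

print(n_kv_not_shared)  # 5120 -> N_KV_MAX must be at least this
print(n_kv_shared)      # 1536
```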

View file

@@ -32,13 +32,15 @@ int main(int argc, char ** argv) {
     gpt_params params;

     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
     }

     int n_kv_max = 2048;
+    int n_batch = 2048;
+    int n_ubatch = 512;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
@@ -56,23 +58,31 @@ int main(int argc, char ** argv) {
     }

     if (argc >= 4) {
-        is_pp_shared = std::atoi(argv[3]);
+        n_batch = std::atoi(argv[3]);
     }

     if (argc >= 5) {
-        n_gpu_layers = std::atoi(argv[4]);
+        n_ubatch = std::atoi(argv[4]);
     }

     if (argc >= 6) {
-        n_pp = parse_list(argv[5]);
+        is_pp_shared = std::atoi(argv[5]);
     }

     if (argc >= 7) {
-        n_tg = parse_list(argv[6]);
+        n_gpu_layers = std::atoi(argv[6]);
     }

     if (argc >= 8) {
-        n_pl = parse_list(argv[7]);
+        n_pp = parse_list(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_tg = parse_list(argv[8]);
+    }
+
+    if (argc >= 10) {
+        n_pl = parse_list(argv[9]);
     }

     // init LLM
@@ -100,7 +110,8 @@ int main(int argc, char ** argv) {
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_max;
-    ctx_params.n_batch = 512;
+    ctx_params.n_batch = n_batch;
+    ctx_params.n_ubatch = n_ubatch;

     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -158,7 +169,7 @@ int main(int argc, char ** argv) {
     }

     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");

     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View file

@@ -2,13 +2,15 @@

 Benchmark is using [k6](https://k6.io/).

-##### Install k6
+##### Install k6 and sse extension

-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.

-Example for ubuntu:
+Example:
 ```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
 ```

 #### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \

 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:

 ```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:

 ```shell
 curl http://localhost:8080/metrics
 ```
+
+### Using the CI python script
+The `bench.py` script does several steps:
+- start the server
+- define good variable for k6
+- run k6 script
+- extract metrics from prometheus
+
+It aims to be used in the CI, but you can run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+    --runner-label local \
+    --name local \
+    --branch `git rev-parse --abbrev-ref HEAD` \
+    --commit `git rev-parse HEAD` \
+    --scenario script.js \
+    --duration 5m \
+    --hf-repo ggml-org/models \
+    --hf-file phi-2/ggml-model-q4_0.gguf \
+    --model-path-prefix models \
+    --parallel 4 \
+    -ngl 33 \
+    --batch-size 2048 \
+    --ubatch-size 256 \
+    --ctx-size 4096 \
+    --n-prompts 200 \
+    --max-prompt-tokens 256 \
+    --max-tokens 256
+```

View file

@@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
                     data['metrics'][metric_name][metric_metric]=value
                     github_env.write(
                         f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-        token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
         iterations = data['root_group']['checks']['success completion']['passes']

     except Exception:

@@ -181,16 +180,16 @@ xychart-beta
     bench_results = {
         "i": iterations,
         "req": {
-            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
             "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
         },
         "pp": {
-            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
-            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
         },
         "tg": {
-            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
         },

@@ -206,7 +205,7 @@ xychart-beta

 def start_benchmark(args):
-    k6_path = 'k6'
+    k6_path = './k6'
     if 'BENCH_K6_BIN_PATH' in os.environ:
         k6_path = os.environ['BENCH_K6_BIN_PATH']
     k6_args = [
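
The `start_benchmark` change above just flips the default k6 path to the locally built binary; the resolution logic is equivalent to this sketch:

```python
import os

# Default to the ./k6 binary produced by `xk6 build` in the bench directory,
# but let BENCH_K6_BIN_PATH point at any other k6 build.
k6_path = os.environ.get('BENCH_K6_BIN_PATH', './k6')
```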

View file

@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'

@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

@@ -86,36 +88,62 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }

-    const body = JSON.stringify(payload)
-
-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
+    const params = {method: 'POST', body: JSON.stringify(payload)};
+
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
+
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
     })

     check(res, {'success completion': (r) => r.status === 200})

-    if (res.status === 200) {
-        const completions = res.json()
-
-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const endTime = new Date()
+
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
     }

+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
     sleep(0.3)
 }
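
The reworked script derives its two throughput metrics from three timestamps: stream start, first SSE event, and stream end. The same math as a standalone Python sketch (function and variable names are mine; times in milliseconds):

```python
def throughputs(prompt_tokens, completion_tokens, t_start, t_first_event, t_end):
    # prompt processing: prompt tokens are consumed before the first chunk arrives
    pp_tok_per_s = prompt_tokens / (t_first_event - t_start) * 1e3
    # token generation: completion tokens are produced between first chunk and stream end
    tg_tok_per_s = completion_tokens / (t_end - t_first_event) * 1e3
    return pp_tok_per_s, tg_tok_per_s

# e.g. 45 prompt tokens processed in 150 ms, 128 tokens generated over the next 3200 ms
print(throughputs(45, 128, 0, 150, 3350))  # (300.0, 40.0)
```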

View file

@@ -567,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
         {"model", modelname},
         {"object", "chat.completion.chunk"}
     };
+    if (!finish_reason.empty()) {
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
+        ret.push_back({"usage", json {
+            {"completion_tokens", num_tokens_predicted},
+            {"prompt_tokens", num_prompt_tokens},
+            {"total_tokens", num_tokens_predicted + num_prompt_tokens}
+        }});
+    }

     return std::vector<json>({ret});
 }
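
With this change the last streamed chunk, the one carrying a non-empty `finish_reason`, also reports token usage; this is what lets the k6 script above read `chunk.usage` instead of a final non-streamed response. An illustrative sketch of the added fields (values are hypothetical, and only the fields added by the diff are shown):

```python
# Shape of the usage object appended to the final chat.completion.chunk.
tokens_predicted = 128  # hypothetical counts
tokens_evaluated = 45

final_chunk_extra = {
    "usage": {
        "completion_tokens": tokens_predicted,
        "prompt_tokens": tokens_evaluated,
        "total_tokens": tokens_predicted + tokens_evaluated,  # 173
    }
}
```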

View file

@@ -137,7 +137,7 @@ extern "C" {
     /*
       Example usage:

-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
         // preferrably to run on the same backend as the buffer
         ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

View file

@@ -1664,24 +1664,6 @@ namespace dpct
                       const void *alpha, const void *a, int lda, const void *b,
                       int ldb, const void *beta, void *c, int ldc)
     {
-#ifndef __INTEL_MKL__
-        GGML_UNUSED(q);
-        GGML_UNUSED(a_trans);
-        GGML_UNUSED(b_trans);
-        GGML_UNUSED(m);
-        GGML_UNUSED(n);
-        GGML_UNUSED(k);
-        GGML_UNUSED(alpha);
-        GGML_UNUSED(a);
-        GGML_UNUSED(lda);
-        GGML_UNUSED(b);
-        GGML_UNUSED(ldb);
-        GGML_UNUSED(beta);
-        GGML_UNUSED(c);
-        GGML_UNUSED(ldc);
-        throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                                 "Project does not support this API.");
-#else
         Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
         Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
         auto data_a = get_memory<const Ta>(a);

@@ -1690,7 +1672,6 @@ namespace dpct
         oneapi::mkl::blas::column_major::gemm(
             q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
             data_b, ldb, beta_value, data_c, ldc);
-#endif
     }

     template <typename VecT, class BinaryOperation, class = void>

@@ -2330,6 +2311,7 @@ namespace dpct
             lda, b, ldb, beta, c, ldc);
             break;
         }
+#ifdef __INTEL_MKL__
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):

@@ -2391,6 +2373,7 @@ namespace dpct
             q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
             break;
         }
+#endif // __INTEL_MKL__
         default:
             throw std::runtime_error("the combination of data type is unsupported");
         }

View file

@@ -24,6 +24,7 @@ class Keys:
         ALIGNMENT = "general.alignment"
         NAME = "general.name"
         AUTHOR = "general.author"
+        VERSION = "general.version"
         URL = "general.url"
         DESCRIPTION = "general.description"
         LICENSE = "general.license"

View file

@@ -296,6 +296,9 @@ class GGUFWriter:
     def add_author(self, author: str) -> None:
         self.add_string(Keys.General.AUTHOR, author)

+    def add_version(self, version: str) -> None:
+        self.add_string(Keys.General.VERSION, version)
+
     def add_tensor_data_layout(self, layout: str) -> None:
         self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

@@ -305,6 +308,9 @@ class GGUFWriter:
     def add_description(self, description: str) -> None:
         self.add_string(Keys.General.DESCRIPTION, description)

+    def add_licence(self, licence: str) -> None:
+        self.add_string(Keys.General.LICENSE, licence)
+
     def add_source_url(self, url: str) -> None:
         self.add_string(Keys.General.SOURCE_URL, url)
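
A hypothetical usage sketch of the two new setters; the `GGUFWriter` constructor arguments shown here are assumptions, not taken from this diff:

```python
from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", "llama")  # (path, architecture) assumed
writer.add_author("Example Author")
writer.add_version("1.0")    # stored under general.version
writer.add_licence("MIT")    # stored under general.license
```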

View file

@@ -263,6 +263,7 @@ enum llm_kv {
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_VERSION,
     LLM_KV_GENERAL_URL,
     LLM_KV_GENERAL_DESCRIPTION,
     LLM_KV_GENERAL_LICENSE,

@@ -332,6 +333,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
     { LLM_KV_GENERAL_NAME, "general.name" },
     { LLM_KV_GENERAL_AUTHOR, "general.author" },
+    { LLM_KV_GENERAL_VERSION, "general.version" },
     { LLM_KV_GENERAL_URL, "general.url" },
     { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
     { LLM_KV_GENERAL_LICENSE, "general.license" },

View file

@@ -1 +1 @@
-43a6d4af1971ee2912ff7bc2404011ff327b6a60
+8e413034b42e4fbedc2873166f61193b75f2622a

View file

@@ -59,6 +59,7 @@ llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 AR
 llama_test(test-grammar-parser.cpp)
 llama_test(test-llama-grammar.cpp)
+llama_test(test-grammar-integration.cpp)
 llama_test(test-grad0.cpp)
 # llama_test(test-opt.cpp)  # SLOW
 llama_test(test-backend-ops.cpp)

View file

@@ -0,0 +1,243 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#define LLAMA_API_INTERNAL
+
+#include "ggml.h"
+#include "llama.h"
+#include "grammar-parser.h"
+#include "unicode.h"
+#include <cassert>
+#include <string>
+
+static void test_simple_grammar() {
+    // Test case for a simple grammar
+    const std::string grammar_str = R"""(root ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we parsed correctly
+    assert(!parsed_grammar.rules.empty());
+
+    // Ensure we have a root node
+    assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()));
+
+    std::vector<const llama_grammar_element*> grammar_rules(parsed_grammar.c_rules());
+    llama_grammar* grammar = llama_grammar_init(
+        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    std::string input = "123+456";
+
+    auto decoded = decode_utf8(input, {});
+
+    const auto & code_points = decoded.first;
+
+    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+        auto prev_stacks = grammar->stacks;
+        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        assert(!grammar->stacks.empty());
+    }
+
+    bool completed_grammar = false;
+
+    for (const auto & stack : grammar->stacks) {
+        if (stack.empty()) {
+            completed_grammar = true;
+            break;
+        }
+    }
+
+    assert(completed_grammar);
+
+    // Clean up allocated memory
+    llama_grammar_free(grammar);
+}
+
+static void test_complex_grammar() {
+    // Test case for a more complex grammar, with both failure strings and success strings
+    const std::string grammar_str = R"""(root ::= expression
+expression ::= term ws (("+"|"-") ws term)*
+term ::= factor ws (("*"|"/") ws factor)*
+factor ::= number | variable | "(" expression ")" | function-call
+number ::= [0-9]+
+variable ::= [a-zA-Z_][a-zA-Z0-9_]*
+function-call ::= variable ws "(" (expression ("," ws expression)*)? ")"
+ws ::= [ \t\n\r]?)""";
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we parsed correctly
+    assert(!parsed_grammar.rules.empty());
+
+    // Ensure we have a root node
+    assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()));
+
+    std::vector<const llama_grammar_element*> grammar_rules(parsed_grammar.c_rules());
+    llama_grammar* grammar = llama_grammar_init(
+        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    // Save the original grammar stacks so that we can reset after every new string we want to test
+    auto original_stacks = grammar->stacks;
+
+    // Test a few strings
+    std::vector<std::string> test_strings_pass = {
+        "42",
+        "1*2*3*4*5",
+        "x",
+        "x+10",
+        "x1+y2",
+        "(a+b)*(c-d)",
+        "func()",
+        "func(x,y+2)",
+        "a*(b+c)-d/e",
+        "f(g(x),h(y,z))",
+        "x + 10",
+        "x1 + y2",
+        "(a + b) * (c - d)",
+        "func()",
+        "func(x, y + 2)",
+        "a * (b + c) - d / e",
+        "f(g(x), h(y, z))",
+        "123+456",
+        "123*456*789-123/456+789*123",
+        "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456"
+    };
+
+    std::vector<std::string> test_strings_fail = {
+        "+",
+        "/ 3x",
+        "x + + y",
+        "a * / b",
+        "func(,)",
+        "func(x y)",
+        "(a + b",
+        "x + y)",
+        "a + b * (c - d",
+        "42 +",
+        "x +",
+        "x + 10 +",
+        "(a + b) * (c - d",
+        "func(",
+        "func(x, y + 2",
+        "a * (b + c) - d /",
+        "f(g(x), h(y, z)",
+        "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
+    };
+
+    // Passing strings
+    for (const auto & test_string : test_strings_pass) {
+        auto decoded = decode_utf8(test_string, {});
+        const auto & code_points = decoded.first;
+
+        int pos = 0;
+        for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+            ++pos;
+            auto prev_stacks = grammar->stacks;
+            grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+
+            // Expect that each code point will not cause the grammar to fail
+            if (grammar->stacks.empty()) {
+                fprintf(stdout, "Error at position %d\n", pos);
+                fprintf(stderr, "Unexpected character '%s'\n", unicode_cpt_to_utf8(*it).c_str());
+                fprintf(stderr, "Input string is %s:\n", test_string.c_str());
+            }
+            assert(!grammar->stacks.empty());
+        }
+
+        bool completed_grammar = false;
+
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                completed_grammar = true;
+                break;
+            }
+        }
+
+        assert(completed_grammar);
+
+        // Reset the grammar stacks
+        grammar->stacks = original_stacks;
+    }
+
+    // Failing strings
+    for (const auto & test_string : test_strings_fail) {
+        auto decoded = decode_utf8(test_string, {});
+        const auto & code_points = decoded.first;
+        bool parse_failed = false;
+
+        for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+            auto prev_stacks = grammar->stacks;
+            grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+            if (grammar->stacks.empty()) {
+                parse_failed = true;
+                break;
+            }
+            assert(!grammar->stacks.empty());
+        }
+
+        bool completed_grammar = false;
+
+        for (const auto & stack : grammar->stacks) {
+            if (stack.empty()) {
+                completed_grammar = true;
+                break;
+            }
+        }
+
+        // Ensure that the grammar is not completed, or that each string failed to match as-expected
+        assert((!completed_grammar) || parse_failed);
+
+        // Reset the grammar stacks
+        grammar->stacks = original_stacks;
+    }
+
+    // Clean up allocated memory
+    llama_grammar_free(grammar);
+}
+
+static void test_failure_missing_root() {
+    // Test case for a grammar that is missing a root rule
+    const std::string grammar_str = R"""(rot ::= expr
+expr ::= term ("+" term)*
+term ::= number
+number ::= [0-9]+)""";
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we parsed correctly
+    assert(!parsed_grammar.rules.empty());
+
+    // Ensure we do NOT have a root node
+    assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end());
+}
+
+static void test_failure_missing_reference() {
+    // Test case for a grammar that is missing a referenced rule
+    const std::string grammar_str = R"""(root ::= expr
+expr ::= term ("+" term)*
+term ::= numero
+number ::= [0-9]+)""";
+
+    fprintf(stderr, "Expected error: ");
+
+    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str());
+
+    // Ensure we did NOT parsed correctly
+    assert(parsed_grammar.rules.empty());
+
+    fprintf(stderr, "End of expected error. Test successful.\n");
+}
+
+int main() {
+    test_simple_grammar();
+    test_complex_grammar();
+    test_failure_missing_root();
+    test_failure_missing_reference();
+    return 0;
+}