llama : llama_perf + option to disable timings during decode (#9355)

* llama : llama_perf + option to disable timings during decode

ggml-ci

* common : add llama_arg

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* perf : separate functions in the API

ggml-ci

* perf : safer pointer handling + naming update

ggml-ci

* minor : better local var name

* perf : abort on invalid sampler pointer

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
This commit is contained in:
Georgi Gerganov 2024-09-13 09:53:38 +03:00 committed by GitHub
parent bd35cb0ae3
commit 0abc6a2c25
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 135 additions and 91 deletions

View file

@ -200,8 +200,8 @@ let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
llama_perf_sampler_print(smpl)
llama_perf_context_print(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count