Merge branch 'master' into sycl_async_data_load

This commit is contained in:
OuadiElfarouki 2024-10-16 18:20:51 +01:00
commit f3141d563c
34 changed files with 1471 additions and 757 deletions

View file

@ -31,7 +31,7 @@ variety of hardware - locally and in the cloud.
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures - AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
- Vulkan and SYCL backend support - Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@ -130,6 +130,7 @@ Typically finetunes of the base models below are supported as well.
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
**UI:** **UI:**
@ -413,7 +414,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
| [BLAS](./docs/build.md#blas-build) | All | | [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All | | [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads GPU | | [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU | | [Vulkan](./docs/build.md#vulkan) | GPU |

View file

@ -119,32 +119,6 @@ std::string common_arg::to_string() {
// utils // utils
// //
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
static void common_params_handle_model_default(common_params & params) { static void common_params_handle_model_default(common_params & params) {
if (!params.hf_repo.empty()) { if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model // short-hand to avoid specifying --hf-file -> default it to --model
@ -199,7 +173,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
continue; continue;
} }
} catch (std::exception & e) { } catch (std::exception & e) {
throw std::invalid_argument(format( throw std::invalid_argument(string_format(
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
} }
} }
@ -220,7 +194,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
std::replace(arg.begin(), arg.end(), '_', '-'); std::replace(arg.begin(), arg.end(), '_', '-');
} }
if (arg_to_options.find(arg) == arg_to_options.end()) { if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
} }
auto opt = *arg_to_options[arg]; auto opt = *arg_to_options[arg];
if (opt.has_value_from_env()) { if (opt.has_value_from_env()) {
@ -252,7 +226,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
continue; continue;
} }
} catch (std::exception & e) { } catch (std::exception & e) {
throw std::invalid_argument(format( throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n" "error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h", "usage:\n%s\n\nto show complete usage, run with -h",
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
@ -391,28 +365,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--verbose-prompt"}, {"--verbose-prompt"},
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.verbose_prompt = true; params.verbose_prompt = true;
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--no-display-prompt"}, {"--no-display-prompt"},
format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.display_prompt = false; params.display_prompt = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"-co", "--color"}, {"-co", "--color"},
format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.use_color = true; params.use_color = true;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg( add_opt(common_arg(
{"-t", "--threads"}, "N", {"-t", "--threads"}, "N",
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
[](common_params & params, int value) { [](common_params & params, int value) {
params.cpuparams.n_threads = value; params.cpuparams.n_threads = value;
if (params.cpuparams.n_threads <= 0) { if (params.cpuparams.n_threads <= 0) {
@ -472,14 +446,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--cpu-strict"}, "<0|1>", {"--cpu-strict"}, "<0|1>",
format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.cpuparams.strict_cpu = std::stoul(value); params.cpuparams.strict_cpu = std::stoul(value);
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--prio"}, "N", {"--prio"}, "N",
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
[](common_params & params, int prio) { [](common_params & params, int prio) {
if (prio < 0 || prio > 3) { if (prio < 0 || prio > 3) {
throw std::invalid_argument("invalid value"); throw std::invalid_argument("invalid value");
@ -489,7 +463,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--poll"}, "<0...100>", {"--poll"}, "<0...100>",
format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.cpuparams.poll = std::stoul(value); params.cpuparams.poll = std::stoul(value);
} }
@ -523,7 +497,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--prio-batch"}, "N", {"--prio-batch"}, "N",
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
[](common_params & params, int prio) { [](common_params & params, int prio) {
if (prio < 0 || prio > 3) { if (prio < 0 || prio > 3) {
throw std::invalid_argument("invalid value"); throw std::invalid_argument("invalid value");
@ -567,7 +541,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg( add_opt(common_arg(
{"--prio-draft"}, "N", {"--prio-draft"}, "N",
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
[](common_params & params, int prio) { [](common_params & params, int prio) {
if (prio < 0 || prio > 3) { if (prio < 0 || prio > 3) {
throw std::invalid_argument("invalid value"); throw std::invalid_argument("invalid value");
@ -611,7 +585,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg( add_opt(common_arg(
{"--prio-batch-draft"}, "N", {"--prio-batch-draft"}, "N",
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
[](common_params & params, int prio) { [](common_params & params, int prio) {
if (prio < 0 || prio > 3) { if (prio < 0 || prio > 3) {
throw std::invalid_argument("invalid value"); throw std::invalid_argument("invalid value");
@ -628,14 +602,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
add_opt(common_arg( add_opt(common_arg(
{"--draft"}, "N", {"--draft"}, "N",
format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_draft = value; params.n_draft = value;
} }
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg( add_opt(common_arg(
{"-ps", "--p-split"}, "N", {"-ps", "--p-split"}, "N",
format("speculative decoding split probability (default: %.1f)", (double)params.p_split), string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.p_split = std::stof(value); params.p_split = std::stof(value);
} }
@ -656,56 +630,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_LOOKUP})); ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg( add_opt(common_arg(
{"-c", "--ctx-size"}, "N", {"-c", "--ctx-size"}, "N",
format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_ctx = value; params.n_ctx = value;
} }
).set_env("LLAMA_ARG_CTX_SIZE")); ).set_env("LLAMA_ARG_CTX_SIZE"));
add_opt(common_arg( add_opt(common_arg(
{"-n", "--predict", "--n-predict"}, "N", {"-n", "--predict", "--n-predict"}, "N",
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_predict = value; params.n_predict = value;
} }
).set_env("LLAMA_ARG_N_PREDICT")); ).set_env("LLAMA_ARG_N_PREDICT"));
add_opt(common_arg( add_opt(common_arg(
{"-b", "--batch-size"}, "N", {"-b", "--batch-size"}, "N",
format("logical maximum batch size (default: %d)", params.n_batch), string_format("logical maximum batch size (default: %d)", params.n_batch),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_batch = value; params.n_batch = value;
} }
).set_env("LLAMA_ARG_BATCH")); ).set_env("LLAMA_ARG_BATCH"));
add_opt(common_arg( add_opt(common_arg(
{"-ub", "--ubatch-size"}, "N", {"-ub", "--ubatch-size"}, "N",
format("physical maximum batch size (default: %d)", params.n_ubatch), string_format("physical maximum batch size (default: %d)", params.n_ubatch),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_ubatch = value; params.n_ubatch = value;
} }
).set_env("LLAMA_ARG_UBATCH")); ).set_env("LLAMA_ARG_UBATCH"));
add_opt(common_arg( add_opt(common_arg(
{"--keep"}, "N", {"--keep"}, "N",
format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_keep = value; params.n_keep = value;
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--no-context-shift"}, {"--no-context-shift"},
format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
[](common_params & params) { [](common_params & params) {
params.ctx_shift = false; params.ctx_shift = false;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg( add_opt(common_arg(
{"--chunks"}, "N", {"--chunks"}, "N",
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_chunks = value; params.n_chunks = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg( add_opt(common_arg(
{"-fa", "--flash-attn"}, {"-fa", "--flash-attn"},
format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.flash_attn = true; params.flash_attn = true;
} }
@ -721,7 +695,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--no-perf"}, {"--no-perf"},
format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.no_perf = true; params.no_perf = true;
params.sparams.no_perf = true; params.sparams.no_perf = true;
@ -733,7 +707,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
std::ifstream file(value); std::ifstream file(value);
if (!file) { if (!file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
} }
// store the external file name in params // store the external file name in params
params.prompt_file = value; params.prompt_file = value;
@ -749,7 +723,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
std::ifstream file(value); std::ifstream file(value);
if (!file) { if (!file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
} }
params.in_files.push_back(value); params.in_files.push_back(value);
} }
@ -760,7 +734,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
std::ifstream file(value, std::ios::binary); std::ifstream file(value, std::ios::binary);
if (!file) { if (!file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
} }
// store the external file name in params // store the external file name in params
params.prompt_file = value; params.prompt_file = value;
@ -772,7 +746,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"-e", "--escape"}, {"-e", "--escape"},
format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.escape = true; params.escape = true;
} }
@ -786,7 +760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"-ptc", "--print-token-count"}, "N", {"-ptc", "--print-token-count"}, "N",
format("print token count every N tokens (default: %d)", params.n_print), string_format("print token count every N tokens (default: %d)", params.n_print),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_print = value; params.n_print = value;
} }
@ -821,14 +795,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"-sp", "--special"}, {"-sp", "--special"},
format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.special = true; params.special = true;
} }
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-cnv", "--conversation"}, {"-cnv", "--conversation"},
format( string_format(
"run in conversation mode:\n" "run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n" "- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n" "- interactive mode is also enabled\n"
@ -841,14 +815,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"-i", "--interactive"}, {"-i", "--interactive"},
format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.interactive = true; params.interactive = true;
} }
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"-if", "--interactive-first"}, {"-if", "--interactive-first"},
format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.interactive_first = true; params.interactive_first = true;
} }
@ -893,7 +867,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN})); ).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"--spm-infill"}, {"--spm-infill"},
format( string_format(
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
params.spm_infill ? "enabled" : "disabled" params.spm_infill ? "enabled" : "disabled"
), ),
@ -903,7 +877,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
add_opt(common_arg( add_opt(common_arg(
{"--samplers"}, "SAMPLERS", {"--samplers"}, "SAMPLERS",
format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
const auto sampler_names = string_split(value, ';'); const auto sampler_names = string_split(value, ';');
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
@ -911,14 +885,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"-s", "--seed"}, "SEED", {"-s", "--seed"}, "SEED",
format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.seed = std::stoul(value); params.sparams.seed = std::stoul(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--sampling-seq"}, "SEQUENCE", {"--sampling-seq"}, "SEQUENCE",
format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.samplers = common_sampler_types_from_chars(value); params.sparams.samplers = common_sampler_types_from_chars(value);
} }
@ -932,14 +906,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--penalize-nl"}, {"--penalize-nl"},
format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.sparams.penalize_nl = true; params.sparams.penalize_nl = true;
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--temp"}, "N", {"--temp"}, "N",
format("temperature (default: %.1f)", (double)params.sparams.temp), string_format("temperature (default: %.1f)", (double)params.sparams.temp),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.temp = std::stof(value); params.sparams.temp = std::stof(value);
params.sparams.temp = std::max(params.sparams.temp, 0.0f); params.sparams.temp = std::max(params.sparams.temp, 0.0f);
@ -947,42 +921,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--top-k"}, "N", {"--top-k"}, "N",
format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
[](common_params & params, int value) { [](common_params & params, int value) {
params.sparams.top_k = value; params.sparams.top_k = value;
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--top-p"}, "N", {"--top-p"}, "N",
format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.top_p = std::stof(value); params.sparams.top_p = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--min-p"}, "N", {"--min-p"}, "N",
format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.min_p = std::stof(value); params.sparams.min_p = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--tfs"}, "N", {"--tfs"}, "N",
format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.tfs_z = std::stof(value); params.sparams.tfs_z = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg(
{"--xtc-probability"}, "N",
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
[](common_params & params, const std::string & value) {
params.sparams.xtc_probability = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--xtc-threshold"}, "N",
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
[](common_params & params, const std::string & value) {
params.sparams.xtc_threshold = std::stof(value);
}
).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--typical"}, "N", {"--typical"}, "N",
format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.typ_p = std::stof(value); params.sparams.typ_p = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--repeat-last-n"}, "N", {"--repeat-last-n"}, "N",
format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
[](common_params & params, int value) { [](common_params & params, int value) {
params.sparams.penalty_last_n = value; params.sparams.penalty_last_n = value;
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
@ -990,42 +978,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--repeat-penalty"}, "N", {"--repeat-penalty"}, "N",
format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.penalty_repeat = std::stof(value); params.sparams.penalty_repeat = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--presence-penalty"}, "N", {"--presence-penalty"}, "N",
format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.penalty_present = std::stof(value); params.sparams.penalty_present = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--frequency-penalty"}, "N", {"--frequency-penalty"}, "N",
format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.penalty_freq = std::stof(value); params.sparams.penalty_freq = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--dynatemp-range"}, "N", {"--dynatemp-range"}, "N",
format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.dynatemp_range = std::stof(value); params.sparams.dynatemp_range = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--dynatemp-exp"}, "N", {"--dynatemp-exp"}, "N",
format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.dynatemp_exponent = std::stof(value); params.sparams.dynatemp_exponent = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--mirostat"}, "N", {"--mirostat"}, "N",
format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
[](common_params & params, int value) { [](common_params & params, int value) {
params.sparams.mirostat = value; params.sparams.mirostat = value;
@ -1033,14 +1021,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--mirostat-lr"}, "N", {"--mirostat-lr"}, "N",
format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.mirostat_eta = std::stof(value); params.sparams.mirostat_eta = std::stof(value);
} }
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--mirostat-ent"}, "N", {"--mirostat-ent"}, "N",
format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.mirostat_tau = std::stof(value); params.sparams.mirostat_tau = std::stof(value);
} }
@ -1069,7 +1057,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_sparam()); ).set_sparam());
add_opt(common_arg( add_opt(common_arg(
{"--grammar"}, "GRAMMAR", {"--grammar"}, "GRAMMAR",
format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.sparams.grammar = value; params.sparams.grammar = value;
} }
@ -1080,7 +1068,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
std::ifstream file(value); std::ifstream file(value);
if (!file) { if (!file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
} }
std::copy( std::copy(
std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(file),
@ -1150,53 +1138,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
add_opt(common_arg( add_opt(common_arg(
{"--yarn-orig-ctx"}, "N", {"--yarn-orig-ctx"}, "N",
format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
[](common_params & params, int value) { [](common_params & params, int value) {
params.yarn_orig_ctx = value; params.yarn_orig_ctx = value;
} }
).set_env("LLAMA_ARG_YARN_ORIG_CTX")); ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
add_opt(common_arg( add_opt(common_arg(
{"--yarn-ext-factor"}, "N", {"--yarn-ext-factor"}, "N",
format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.yarn_ext_factor = std::stof(value); params.yarn_ext_factor = std::stof(value);
} }
).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
add_opt(common_arg( add_opt(common_arg(
{"--yarn-attn-factor"}, "N", {"--yarn-attn-factor"}, "N",
format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.yarn_attn_factor = std::stof(value); params.yarn_attn_factor = std::stof(value);
} }
).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
add_opt(common_arg( add_opt(common_arg(
{"--yarn-beta-slow"}, "N", {"--yarn-beta-slow"}, "N",
format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.yarn_beta_slow = std::stof(value); params.yarn_beta_slow = std::stof(value);
} }
).set_env("LLAMA_ARG_YARN_BETA_SLOW")); ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
add_opt(common_arg( add_opt(common_arg(
{"--yarn-beta-fast"}, "N", {"--yarn-beta-fast"}, "N",
format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.yarn_beta_fast = std::stof(value); params.yarn_beta_fast = std::stof(value);
} }
).set_env("LLAMA_ARG_YARN_BETA_FAST")); ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
add_opt(common_arg( add_opt(common_arg(
{"-gan", "--grp-attn-n"}, "N", {"-gan", "--grp-attn-n"}, "N",
format("group-attention factor (default: %d)", params.grp_attn_n), string_format("group-attention factor (default: %d)", params.grp_attn_n),
[](common_params & params, int value) { [](common_params & params, int value) {
params.grp_attn_n = value; params.grp_attn_n = value;
} }
).set_env("LLAMA_ARG_GRP_ATTN_N")); ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg( add_opt(common_arg(
{"-gaw", "--grp-attn-w"}, "N", {"-gaw", "--grp-attn-w"}, "N",
format("group-attention width (default: %.1f)", (double)params.grp_attn_w), string_format("group-attention width (default: %d)", params.grp_attn_w),
[](common_params & params, int value) { [](common_params & params, int value) {
params.grp_attn_w = value; params.grp_attn_w = value;
} }
).set_env("LLAMA_ARG_GRP_ATTN_W")); ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg( add_opt(common_arg(
{"-dkvc", "--dump-kv-cache"}, {"-dkvc", "--dump-kv-cache"},
"verbose print of the KV cache", "verbose print of the KV cache",
@ -1213,7 +1201,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"-ctk", "--cache-type-k"}, "TYPE", {"-ctk", "--cache-type-k"}, "TYPE",
format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
// TODO: get the type right here // TODO: get the type right here
params.cache_type_k = value; params.cache_type_k = value;
@ -1221,7 +1209,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CACHE_TYPE_K")); ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
add_opt(common_arg( add_opt(common_arg(
{"-ctv", "--cache-type-v"}, "TYPE", {"-ctv", "--cache-type-v"}, "TYPE",
format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
// TODO: get the type right here // TODO: get the type right here
params.cache_type_v = value; params.cache_type_v = value;
@ -1229,7 +1217,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_CACHE_TYPE_V")); ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
add_opt(common_arg( add_opt(common_arg(
{"--perplexity", "--all-logits"}, {"--perplexity", "--all-logits"},
format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.logits_all = true; params.logits_all = true;
} }
@ -1243,7 +1231,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
{"--hellaswag-tasks"}, "N", {"--hellaswag-tasks"}, "N",
format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
[](common_params & params, int value) { [](common_params & params, int value) {
params.hellaswag_tasks = value; params.hellaswag_tasks = value;
} }
@ -1257,7 +1245,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
{"--winogrande-tasks"}, "N", {"--winogrande-tasks"}, "N",
format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
[](common_params & params, int value) { [](common_params & params, int value) {
params.winogrande_tasks = value; params.winogrande_tasks = value;
} }
@ -1271,7 +1259,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
{"--multiple-choice-tasks"}, "N", {"--multiple-choice-tasks"}, "N",
format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
[](common_params & params, int value) { [](common_params & params, int value) {
params.multiple_choice_tasks = value; params.multiple_choice_tasks = value;
} }
@ -1292,42 +1280,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
{"--ppl-stride"}, "N", {"--ppl-stride"}, "N",
format("stride for perplexity calculation (default: %d)", params.ppl_stride), string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
[](common_params & params, int value) { [](common_params & params, int value) {
params.ppl_stride = value; params.ppl_stride = value;
} }
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
{"--ppl-output-type"}, "<0|1>", {"--ppl-output-type"}, "<0|1>",
format("output type for perplexity calculation (default: %d)", params.ppl_output_type), string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
[](common_params & params, int value) { [](common_params & params, int value) {
params.ppl_output_type = value; params.ppl_output_type = value;
} }
).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
{"-dt", "--defrag-thold"}, "N", {"-dt", "--defrag-thold"}, "N",
format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.defrag_thold = std::stof(value); params.defrag_thold = std::stof(value);
} }
).set_env("LLAMA_ARG_DEFRAG_THOLD")); ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
add_opt(common_arg( add_opt(common_arg(
{"-np", "--parallel"}, "N", {"-np", "--parallel"}, "N",
format("number of parallel sequences to decode (default: %d)", params.n_parallel), string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_parallel = value; params.n_parallel = value;
} }
).set_env("LLAMA_ARG_N_PARALLEL")); ).set_env("LLAMA_ARG_N_PARALLEL"));
add_opt(common_arg( add_opt(common_arg(
{"-ns", "--sequences"}, "N", {"-ns", "--sequences"}, "N",
format("number of sequences to decode (default: %d)", params.n_sequences), string_format("number of sequences to decode (default: %d)", params.n_sequences),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_sequences = value; params.n_sequences = value;
} }
).set_examples({LLAMA_EXAMPLE_PARALLEL})); ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg( add_opt(common_arg(
{"-cb", "--cont-batching"}, {"-cb", "--cont-batching"},
format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.cont_batching = true; params.cont_batching = true;
} }
@ -1451,7 +1439,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::vector<std::string> split_arg{ it, {} }; std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) { if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument( throw std::invalid_argument(
format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
); );
} }
for (size_t i = 0; i < llama_max_devices(); ++i) { for (size_t i = 0; i < llama_max_devices(); ++i) {
@ -1468,7 +1456,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_TENSOR_SPLIT")); ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
add_opt(common_arg( add_opt(common_arg(
{"-mg", "--main-gpu"}, "INDEX", {"-mg", "--main-gpu"}, "INDEX",
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
[](common_params & params, int value) { [](common_params & params, int value) {
params.main_gpu = value; params.main_gpu = value;
if (!llama_supports_gpu_offload()) { if (!llama_supports_gpu_offload()) {
@ -1478,7 +1466,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_MAIN_GPU")); ).set_env("LLAMA_ARG_MAIN_GPU"));
add_opt(common_arg( add_opt(common_arg(
{"--check-tensors"}, {"--check-tensors"},
format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.check_tensors = true; params.check_tensors = true;
} }
@ -1489,7 +1477,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
} }
} }
)); ));
@ -1543,7 +1531,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-m", "--model"}, "FNAME", {"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA ex == LLAMA_EXAMPLE_EXPORT_LORA
? std::string("model path from which to load base model") ? std::string("model path from which to load base model")
: format( : string_format(
"model path (default: `models/$filename` with filename from `--hf-file` " "model path (default: `models/$filename` with filename from `--hf-file` "
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
), ),
@ -1592,42 +1580,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
std::ifstream file(value, std::ios::binary); std::ifstream file(value, std::ios::binary);
if (!file) { if (!file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
} }
params.context_files.push_back(value); params.context_files.push_back(value);
} }
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg( add_opt(common_arg(
{"--chunk-size"}, "N", {"--chunk-size"}, "N",
format("minimum length of embedded text chunks (default: %d)", params.chunk_size), string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
[](common_params & params, int value) { [](common_params & params, int value) {
params.chunk_size = value; params.chunk_size = value;
} }
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg( add_opt(common_arg(
{"--chunk-separator"}, "STRING", {"--chunk-separator"}, "STRING",
format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.chunk_separator = value; params.chunk_separator = value;
} }
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg( add_opt(common_arg(
{"--junk"}, "N", {"--junk"}, "N",
format("number of times to repeat the junk text (default: %d)", params.n_junk), string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_junk = value; params.n_junk = value;
} }
).set_examples({LLAMA_EXAMPLE_PASSKEY})); ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg( add_opt(common_arg(
{"--pos"}, "N", {"--pos"}, "N",
format("position of the passkey in the junk text (default: %d)", params.i_pos), string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
[](common_params & params, int value) { [](common_params & params, int value) {
params.i_pos = value; params.i_pos = value;
} }
).set_examples({LLAMA_EXAMPLE_PASSKEY})); ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg( add_opt(common_arg(
{"-o", "--output", "--output-file"}, "FNAME", {"-o", "--output", "--output-file"}, "FNAME",
format("output file (default: '%s')", string_format("output file (default: '%s')",
ex == LLAMA_EXAMPLE_EXPORT_LORA ex == LLAMA_EXAMPLE_EXPORT_LORA
? params.lora_outfile.c_str() ? params.lora_outfile.c_str()
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
@ -1641,42 +1629,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
add_opt(common_arg( add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N", {"-ofreq", "--output-frequency"}, "N",
format("output the imatrix every N iterations (default: %d)", params.n_out_freq), string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_out_freq = value; params.n_out_freq = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"--save-frequency"}, "N", {"--save-frequency"}, "N",
format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_save_freq = value; params.n_save_freq = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"--process-output"}, {"--process-output"},
format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.process_output = true; params.process_output = true;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"--no-ppl"}, {"--no-ppl"},
format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.compute_ppl = false; params.compute_ppl = false;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"--chunk", "--from-chunk"}, "N", {"--chunk", "--from-chunk"}, "N",
format("start processing the input from chunk N (default: %d)", params.i_chunk), string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
[](common_params & params, int value) { [](common_params & params, int value) {
params.i_chunk = value; params.i_chunk = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"-pps"}, {"-pps"},
format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
[](common_params & params) { [](common_params & params) {
params.is_pp_shared = true; params.is_pp_shared = true;
} }
@ -1707,7 +1695,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_BENCH})); ).set_examples({LLAMA_EXAMPLE_BENCH}));
add_opt(common_arg( add_opt(common_arg(
{"--embd-normalize"}, "N", {"--embd-normalize"}, "N",
format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
[](common_params & params, int value) { [](common_params & params, int value) {
params.embd_normalize = value; params.embd_normalize = value;
} }
@ -1728,35 +1716,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_EMBEDDING})); ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg( add_opt(common_arg(
{"--host"}, "HOST", {"--host"}, "HOST",
format("ip address to listen (default: %s)", params.hostname.c_str()), string_format("ip address to listen (default: %s)", params.hostname.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.hostname = value; params.hostname = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
add_opt(common_arg( add_opt(common_arg(
{"--port"}, "PORT", {"--port"}, "PORT",
format("port to listen (default: %d)", params.port), string_format("port to listen (default: %d)", params.port),
[](common_params & params, int value) { [](common_params & params, int value) {
params.port = value; params.port = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
add_opt(common_arg( add_opt(common_arg(
{"--path"}, "PATH", {"--path"}, "PATH",
format("path to serve static files from (default: %s)", params.public_path.c_str()), string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.public_path = value; params.public_path = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
add_opt(common_arg( add_opt(common_arg(
{"--embedding", "--embeddings"}, {"--embedding", "--embeddings"},
format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.embedding = true; params.embedding = true;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg( add_opt(common_arg(
{"--reranking", "--rerank"}, {"--reranking", "--rerank"},
format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.reranking = true; params.reranking = true;
} }
@ -1774,7 +1762,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
std::ifstream key_file(value); std::ifstream key_file(value);
if (!key_file) { if (!key_file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
} }
std::string key; std::string key;
while (std::getline(key_file, key)) { while (std::getline(key_file, key)) {
@ -1801,7 +1789,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg( add_opt(common_arg(
{"-to", "--timeout"}, "N", {"-to", "--timeout"}, "N",
format("server read/write timeout in seconds (default: %d)", params.timeout_read), string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
[](common_params & params, int value) { [](common_params & params, int value) {
params.timeout_read = value; params.timeout_read = value;
params.timeout_write = value; params.timeout_write = value;
@ -1809,45 +1797,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
add_opt(common_arg( add_opt(common_arg(
{"--threads-http"}, "N", {"--threads-http"}, "N",
format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_threads_http = value; params.n_threads_http = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
add_opt(common_arg( add_opt(common_arg(
{"-spf", "--system-prompt-file"}, "FNAME", {"--cache-reuse"}, "N",
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
[](common_params & params, const std::string & value) { [](common_params & params, int value) {
std::ifstream file(value); params.n_cache_reuse = value;
if (!file) {
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
}
std::string system_prompt;
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(system_prompt)
);
params.system_prompt = system_prompt;
} }
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
add_opt(common_arg( add_opt(common_arg(
{"--metrics"}, {"--metrics"},
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.endpoint_metrics = true; params.endpoint_metrics = true;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
add_opt(common_arg( add_opt(common_arg(
{"--slots"}, {"--slots"},
format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.endpoint_slots = true; params.endpoint_slots = true;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg( add_opt(common_arg(
{"--props"}, {"--props"},
format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.endpoint_props = true; params.endpoint_props = true;
} }
@ -1877,7 +1855,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) { if (!common_chat_verify_template(value)) {
throw std::runtime_error(format( throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s\n" "error: the supplied chat template is not supported: %s\n"
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n", "note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
value.c_str() value.c_str()
@ -1888,14 +1866,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg( add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY", {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.slot_prompt_similarity = std::stof(value); params.slot_prompt_similarity = std::stof(value);
} }
).set_examples({LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"--lora-init-without-apply"}, {"--lora-init-without-apply"},
format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params) {
params.lora_init_without_apply = true; params.lora_init_without_apply = true;
} }
@ -1920,28 +1898,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--positive-file"}, "FNAME", {"--positive-file"}, "FNAME",
format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.cvector_positive_file = value; params.cvector_positive_file = value;
} }
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
add_opt(common_arg( add_opt(common_arg(
{"--negative-file"}, "FNAME", {"--negative-file"}, "FNAME",
format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.cvector_negative_file = value; params.cvector_negative_file = value;
} }
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
add_opt(common_arg( add_opt(common_arg(
{"--pca-batch"}, "N", {"--pca-batch"}, "N",
format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_pca_batch = value; params.n_pca_batch = value;
} }
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
add_opt(common_arg( add_opt(common_arg(
{"--pca-iter"}, "N", {"--pca-iter"}, "N",
format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
[](common_params & params, int value) { [](common_params & params, int value) {
params.n_pca_iterations = value; params.n_pca_iterations = value;
} }

View file

@ -12,6 +12,7 @@
#include <algorithm> #include <algorithm>
#include <cinttypes> #include <cinttypes>
#include <climits>
#include <cmath> #include <cmath>
#include <codecvt> #include <codecvt>
#include <cstdarg> #include <cstdarg>
@ -23,10 +24,10 @@
#include <regex> #include <regex>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <thread>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include <thread>
#if defined(__APPLE__) && defined(__MACH__) #if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h> #include <sys/types.h>
@ -400,6 +401,21 @@ std::string common_params_get_system_info(const common_params & params) {
// String utils // String utils
// //
std::string string_format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
std::vector<std::string> string_split(std::string input, char separator) { std::vector<std::string> string_split(std::string input, char separator) {
std::vector<std::string> parts; std::vector<std::string> parts;
size_t separator_pos = input.find(separator); size_t separator_pos = input.find(separator);
@ -2088,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");

View file

@ -90,6 +90,8 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_TFS_Z = 4, COMMON_SAMPLER_TYPE_TFS_Z = 4,
COMMON_SAMPLER_TYPE_TYPICAL_P = 5, COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
COMMON_SAMPLER_TYPE_TEMPERATURE = 6, COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
COMMON_SAMPLER_TYPE_XTC = 7,
COMMON_SAMPLER_TYPE_INFILL = 8,
}; };
// dimensionality reduction methods, used by cvector-generator // dimensionality reduction methods, used by cvector-generator
@ -108,6 +110,8 @@ struct common_sampler_params {
int32_t top_k = 40; // <= 0 to use vocab size int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled float min_p = 0.05f; // 0.0 = disabled
float xtc_probability = 0.00f; // 0.0 = disabled
float xtc_threshold = 0.10f; // > 0.5 disables XTC
float tfs_z = 1.00f; // 1.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@ -124,13 +128,15 @@ struct common_sampler_params {
bool ignore_eos = false; bool ignore_eos = false;
bool no_perf = false; // disable performance metrics bool no_perf = false; // disable performance metrics
std::vector<enum common_sampler_type> samplers = { std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TFS_Z, COMMON_SAMPLER_TYPE_TFS_Z,
COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TYPICAL_P,
COMMON_SAMPLER_TYPE_TOP_P, COMMON_SAMPLER_TYPE_TOP_P,
COMMON_SAMPLER_TYPE_MIN_P, COMMON_SAMPLER_TYPE_MIN_P,
COMMON_SAMPLER_TYPE_TEMPERATURE COMMON_SAMPLER_TYPE_XTC,
COMMON_SAMPLER_TYPE_TEMPERATURE,
}; };
std::string grammar; // optional BNF-like grammar to constrain sampling std::string grammar; // optional BNF-like grammar to constrain sampling
@ -277,12 +283,12 @@ struct common_params {
int32_t port = 8080; // server listens on this network port int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT std::string chat_template = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true; bool enable_chat_template = true;
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
@ -352,15 +358,28 @@ void common_init();
std::string common_params_get_system_info(const common_params & params); std::string common_params_get_system_info(const common_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio); bool set_process_priority(enum ggml_sched_priority prio);
// //
// String utils // String utils
// //
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);
std::vector<std::string> string_split(std::string input, char separator); std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str); std::string string_strip(const std::string & str);

View file

@ -611,7 +611,7 @@ private:
} }
return join_seq(); return join_seq();
}; };
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
} }
/* /*

View file

@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
snprintf(result, sizeof(result), snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present, penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
top_k, tfs_z, top_p, min_p, typ_p, temp, top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
mirostat, mirostat_eta, mirostat_tau); mirostat, mirostat_eta, mirostat_tau);
return std::string(result); return std::string(result);
@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_MIN_P: case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break; break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TFS_Z: case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break; break;
@ -193,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_TEMPERATURE: case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
default: default:
GGML_ASSERT(false && "unknown sampler type"); GGML_ASSERT(false && "unknown sampler type");
} }
@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
default : return '?'; default : return '?';
} }
} }
@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
default : return ""; default : return "";
} }
} }
@ -396,6 +406,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P }, { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
}; };
// since samplers names are written multiple ways // since samplers names are written multiple ways
@ -441,7 +453,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE } { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
}; };
std::vector<common_sampler_type> samplers; std::vector<common_sampler_type> samplers;

View file

@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance:
### MUSA ### MUSA
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
- Using `make`: - Using `make`:
```bash ```bash
make GGML_MUSA=1 make GGML_MUSA=1
@ -209,6 +211,12 @@ The following compilation options are also available to tweak performance:
cmake --build build --config Release cmake --build build --config Release
``` ```
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
### hipBLAS ### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs. This provides BLAS acceleration on HIP-supported AMD GPUs.

View file

@ -205,11 +205,11 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
GGML_ASSERT(llama_token_prefix(model) >= 0); GGML_ASSERT(llama_token_fim_pre(model) >= 0);
GGML_ASSERT(llama_token_suffix(model) >= 0); GGML_ASSERT(llama_token_fim_suf(model) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@ -218,7 +218,7 @@ int main(int argc, char ** argv) {
} }
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_middle(model); const llama_token middle_token = llama_token_fim_mid(model);
if (middle_token >= 0) { if (middle_token >= 0) {
embd_inp.push_back(middle_token); embd_inp.push_back(middle_token);
} }
@ -508,8 +508,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;

View file

@ -540,7 +540,7 @@ class SchemaConverter:
return self._add_rule( return self._add_rule(
name, name,
to_rule(transform()) if self._raw_pattern \ to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space") else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
def _resolve_ref(self, ref): def _resolve_ref(self, ref):

View file

@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
### XTC Sampling
- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).
Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`
### Logit Bias ### Logit Bias
- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.

View file

@ -569,30 +569,30 @@ int main(int argc, char ** argv) {
if (!params.ctx_shift){ if (!params.ctx_shift){
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break; break;
} else {
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG_DBG("clear session path\n");
path_session.clear();
} }
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past);
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
LOG_DBG("clear session path\n");
path_session.clear();
} }
} else { } else {
// context extension via Self-Extend // context extension via Self-Extend

View file

@ -60,8 +60,6 @@ The project is under active development, and we are [looking for feedback and co
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
@ -149,7 +147,7 @@ The project is under active development, and we are [looking for feedback and co
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
@ -320,7 +318,6 @@ node index.js
- The prompt is a string or an array with the first element given as a string - The prompt is a string or an array with the first element given as a string
- The model's `tokenizer.ggml.add_bos_token` metadata is `true` - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
- The system prompt is empty
`temperature`: Adjust the randomness of the generated text. Default: `0.8` `temperature`: Adjust the randomness of the generated text. Default: `0.8`
@ -378,6 +375,8 @@ node index.js
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
`t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`
@ -525,8 +524,31 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
- `input_prefix`: Set the prefix of the code to infill. - `input_prefix`: Set the prefix of the code to infill.
- `input_suffix`: Set the suffix of the code to infill. - `input_suffix`: Set the suffix of the code to infill.
- `input_extra`: Additional context inserted before the FIM prefix.
- `prompt`: Added after the `FIM_MID` token
It also accepts all the options of `/completion` except `stream` and `prompt`. `input_extra` is array of `{"filename": string, "text": string}` objects.
The endpoint also accepts all the options of `/completion`.
If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used:
```txt
<FIM_REP>myproject
<FIM_SEP>{chunk 0 filename}
{chunk 0 text}
<FIM_SEP>{chunk 1 filename}
{chunk 1 text}
...
<FIM_SEP>filename
<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
```
If the tokens are missing, then the extra context is simply prefixed at the start:
```txt
[input_extra]<FIM_PRE>[input_prefix]<FIM_SUF>[input_suffix]<FIM_MID>[prompt]
```
### **GET** `/props`: Get server global properties. ### **GET** `/props`: Get server global properties.
@ -536,14 +558,12 @@ This endpoint is public (no API key check). By default, it is read-only. To make
```json ```json
{ {
"system_prompt": "",
"default_generation_settings": { ... }, "default_generation_settings": { ... },
"total_slots": 1, "total_slots": 1,
"chat_template": "" "chat_template": ""
} }
``` ```
- `system_prompt` - the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template - `chat_template` - the model's original Jinja2 prompt template
@ -554,7 +574,7 @@ To use this endpoint with POST method, you need to start server with `--props`
*Options:* *Options:*
- `system_prompt`: Change the system prompt (initial prompt of all slots). Please note that this does not take into account the chat template. It will append the prompt at the beginning of formatted prompt. - None yet
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API

View file

@ -43,6 +43,8 @@
top_k: 0, // <= 0 to use vocab size top_k: 0, // <= 0 to use vocab size
top_p: 1.0, // 1.0 = disabled top_p: 1.0, // 1.0 = disabled
min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
xtc_probability: 0.0, // 0 = disabled;
xtc_threshold: 0.1, // > 0.5 disables XTC;
tfs_z: 1.0, // 1.0 = disabled tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled
presence_penalty: 0.0, // 0.0 = disabled presence_penalty: 0.0, // 0.0 = disabled
@ -836,6 +838,8 @@ return html`
${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
</fieldset> </fieldset>
@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => {
const snapSettings = { const snapSettings = {
temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },

View file

@ -307,6 +307,8 @@
top_k: 40, // <= 0 to use vocab size top_k: 40, // <= 0 to use vocab size
top_p: 0.95, // 1.0 = disabled top_p: 0.95, // 1.0 = disabled
min_p: 0.05, // 0 = disabled min_p: 0.05, // 0 = disabled
xtc_probability: 0.0, // 0 = disabled;
xtc_threshold: 0.1, // > 0.5 disables XTC;
tfs_z: 1.0, // 1.0 = disabled tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled
presence_penalty: 0.0, // 0.0 = disabled presence_penalty: 0.0, // 0.0 = disabled
@ -1013,6 +1015,8 @@
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
</fieldset> </fieldset>
<hr /> <hr />
<fieldset class="three"> <fieldset class="three">

File diff suppressed because one or more lines are too long

View file

@ -529,7 +529,7 @@ export class SchemaConverter {
return joinSeq(); return joinSeq();
}; };
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
} }
_notStrings(strings) { _notStrings(strings) {

View file

@ -128,14 +128,14 @@ struct slot_params {
bool stream = true; bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<std::string> antiprompt; std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
}; };
struct server_slot { struct server_slot {
@ -165,8 +165,13 @@ struct server_slot {
json prompt; // can be either a string, array of strings or array of token ids json prompt; // can be either a string, array of strings or array of token ids
json input_prefix;
json input_suffix;
json input_extra;
// when a task is submitted, we first tokenize the prompt and store it here // when a task is submitted, we first tokenize the prompt and store it here
std::vector<llama_token> prompt_tokens; std::vector<llama_token> prompt_tokens;
std::vector<llama_token> extra_tokens;
std::string generated_text; std::string generated_text;
std::vector<llama_token> cache_tokens; std::vector<llama_token> cache_tokens;
@ -175,6 +180,7 @@ struct server_slot {
server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
bool has_next_token = true; bool has_next_token = true;
bool has_new_line = false;
bool truncated = false; bool truncated = false;
bool stopped_eos = false; bool stopped_eos = false;
bool stopped_word = false; bool stopped_word = false;
@ -193,21 +199,15 @@ struct server_slot {
llama_token sampled; llama_token sampled;
int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
int32_t ga_w = 512; // group-attention width
int32_t n_past_se = 0; // self-extend
// stats // stats
size_t n_sent_text = 0; // number of sent text character size_t n_sent_text = 0; // number of sent text character
size_t n_sent_token_probs = 0; size_t n_sent_token_probs = 0;
int64_t t_start_process_prompt; int64_t t_start_process_prompt;
int64_t t_start_generation; int64_t t_start_generation;
double t_prompt_processing; // ms double t_prompt_processing; // ms
double t_token_generation; // ms double t_token_generation; // ms
std::function<void(int)> callback_on_release; std::function<void(int)> callback_on_release;
@ -216,6 +216,7 @@ struct server_slot {
n_prompt_tokens = 0; n_prompt_tokens = 0;
generated_text = ""; generated_text = "";
has_new_line = false;
truncated = false; truncated = false;
stopped_eos = false; stopped_eos = false;
stopped_word = false; stopped_word = false;
@ -225,8 +226,6 @@ struct server_slot {
n_sent_text = 0; n_sent_text = 0;
n_sent_token_probs = 0; n_sent_token_probs = 0;
cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
ga_i = 0;
n_past_se = 0;
generated_token_probs.clear(); generated_token_probs.clear();
} }
@ -623,12 +622,6 @@ struct server_context {
int32_t n_ctx; // total context for all clients / slots int32_t n_ctx; // total context for all clients / slots
// system prompt
bool system_need_update = false;
std::string system_prompt;
std::vector<llama_token> system_tokens;
// slots / clients // slots / clients
std::vector<server_slot> slots; std::vector<server_slot> slots;
json default_generation_settings_for_props; json default_generation_settings_for_props;
@ -665,7 +658,7 @@ struct server_context {
bool load_model(const common_params & params_) { bool load_model(const common_params & params_) {
params = params_; params = params_;
// dedicate one sequence to the system prompt // reserve one extra sequence (seq_id == 0) for extra features
params.n_parallel += 1; params.n_parallel += 1;
common_init_result llama_init = common_init_from_params(params); common_init_result llama_init = common_init_from_params(params);
@ -711,22 +704,6 @@ struct server_context {
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
if (ga_n != 1) {
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
}
slot.ga_i = 0;
slot.ga_n = ga_n;
slot.ga_w = ga_w;
slot.sparams = params.sparams; slot.sparams = params.sparams;
slot.callback_on_release = [this](int) { slot.callback_on_release = [this](int) {
@ -753,12 +730,7 @@ struct server_context {
metrics.init(); metrics.init();
} }
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const { std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
const bool TMP_FORCE_SPECIAL = true;
// If `add_bos` is true, we only add BOS, when json_prompt is a string, // If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string. // or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens; std::vector<llama_token> prompt_tokens;
@ -771,10 +743,10 @@ struct server_context {
std::vector<llama_token> p; std::vector<llama_token> p;
if (first) { if (first) {
p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); p = common_tokenize(ctx, s, add_special, parse_special);
first = false; first = false;
} else { } else {
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); p = common_tokenize(ctx, s, false, parse_special);
} }
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@ -788,7 +760,7 @@ struct server_context {
} }
} else { } else {
auto s = json_prompt.template get<std::string>(); auto s = json_prompt.template get<std::string>();
prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
} }
return prompt_tokens; return prompt_tokens;
@ -830,7 +802,7 @@ struct server_context {
int slot_prompt_len = slot_prompt.size(); int slot_prompt_len = slot_prompt.size();
// length of the Longest Common Prefix between the current slot's prompt and the input prompt // length of the Longest Common Prefix between the current slot's prompt and the input prompt
int lcp_len = common_part(slot_prompt, prompt); int lcp_len = longest_common_prefix(slot_prompt, prompt);
// fraction of the common substring length compared to the current slot's prompt length // fraction of the common substring length compared to the current slot's prompt length
similarity = static_cast<float>(lcp_len) / slot_prompt_len; similarity = static_cast<float>(lcp_len) / slot_prompt_len;
@ -891,6 +863,8 @@ struct server_context {
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
@ -909,6 +883,8 @@ struct server_context {
slot.sparams.seed = json_value(data, "seed", default_sparams.seed); slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
// process "json_schema" and "grammar" // process "json_schema" and "grammar"
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
@ -917,19 +893,14 @@ struct server_context {
} }
if (data.contains("json_schema") && !data.contains("grammar")) { if (data.contains("json_schema") && !data.contains("grammar")) {
try { try {
auto schema = json_value(data, "json_schema", json::object()); auto schema = json_value(data, "json_schema", json::object());
slot.sparams.grammar = json_schema_to_grammar(schema); slot.sparams.grammar = json_schema_to_grammar(schema);
} catch (const std::exception & e) { } catch (const std::exception & e) {
send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} }
} else { } else {
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
}
if (slot.params.cache_prompt && slot.ga_n != 1) {
slot.params.cache_prompt = false;
SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
} }
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@ -939,11 +910,29 @@ struct server_context {
} }
// infill // infill
slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); slot.input_prefix = json_value(data, "input_prefix", json());
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); slot.input_suffix = json_value(data, "input_suffix", json());
slot.input_extra = json_value(data, "input_extra", json());
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
for (const auto & chunk : slot.input_extra) {
// { "text": string, "filename": string }
if (!chunk.contains("text") || !chunk["text"].is_string()) {
send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
return false;
}
// filename is optional
if (chunk.contains("filename") && !chunk["filename"].is_string()) {
send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
return false;
}
SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
}
// get prompt // get prompt
if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) { {
const auto & prompt = data.find("prompt"); const auto & prompt = data.find("prompt");
if (prompt == data.end()) { if (prompt == data.end()) {
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
@ -1066,51 +1055,6 @@ struct server_context {
clean_kv_cache = false; clean_kv_cache = false;
} }
void system_prompt_update() {
SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
kv_cache_clear();
system_tokens.clear();
if (!system_prompt.empty()) {
system_tokens = common_tokenize(ctx, system_prompt, true);
const int32_t n_batch = llama_n_batch(ctx);
const int32_t n_tokens_prompt = system_tokens.size();
for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
common_batch_clear(batch);
for (int32_t j = 0; j < n_tokens; ++j) {
common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {
SRV_ERR("%s", "llama_decode() failed\n");
return;
}
}
// assign the system KV cache to all parallel sequences
for (int32_t i = 1; i <= params.n_parallel; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}
system_need_update = false;
}
bool system_prompt_set(const std::string & sys_prompt) {
SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
system_prompt = sys_prompt;
// update system_tokens and KV cache as soon as all slots are idle
system_need_update = true;
return true;
}
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
@ -1146,22 +1090,21 @@ struct server_context {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos); const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false; bool send_text = true;
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
if (stop_pos != std::string::npos) { if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase( slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos, slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end()); slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size()); pos = std::min(slot.n_sent_text, slot.generated_text.size());
} else { } else if (slot.has_next_token) {
is_stop_full = false;
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
send_text = stop_pos == std::string::npos;
} }
// check if there is any token to predict // check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { if (send_text) {
// no send the stop word in the response // no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos); result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size(); slot.n_sent_text += result.text_to_send.size();
@ -1186,13 +1129,28 @@ struct server_context {
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
} }
// if we have already seen a new line, we stop after a certain time limit
if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
(ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
slot.stopped_limit = true;
slot.has_next_token = false;
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
}
// check if there is a new line in the generated text
if (result.text_to_send.find('\n') != std::string::npos) {
slot.has_new_line = true;
}
// if context shift is disabled, we stop when it reaches the context limit // if context shift is disabled, we stop when it reaches the context limit
if (slot.n_decoded >= slot.n_ctx) { if (slot.n_past >= slot.n_ctx) {
slot.truncated = true; slot.truncated = true;
slot.stopped_limit = true; slot.stopped_limit = true;
slot.has_next_token = false; slot.has_next_token = false;
SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx); SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
} }
if (llama_token_is_eog(model, result.tok)) { if (llama_token_is_eog(model, result.tok)) {
@ -1204,18 +1162,18 @@ struct server_context {
const auto n_ctx_train = llama_n_ctx_train(model); const auto n_ctx_train = llama_n_ctx_train(model);
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
slot.truncated = true; slot.truncated = true;
slot.stopped_limit = true; slot.stopped_limit = true;
slot.has_next_token = false; // stop prediction slot.has_next_token = false; // stop prediction
SLT_WRN(slot, SLT_WRN(slot,
"n_predict (%d) is not set and self-context extend is disabled. " "n_predict (%d) is set for infinite generation. "
"Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
slot.params.n_predict, n_ctx_train); slot.params.n_predict, n_ctx_train);
} }
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
return slot.has_next_token; // continue return slot.has_next_token; // continue
} }
@ -1239,6 +1197,8 @@ struct server_context {
{"top_k", slot.sparams.top_k}, {"top_k", slot.sparams.top_k},
{"top_p", slot.sparams.top_p}, {"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p}, {"min_p", slot.sparams.min_p},
{"xtc_probability", slot.sparams.xtc_probability},
{"xtc_threshold", slot.sparams.xtc_threshold},
{"tfs_z", slot.sparams.tfs_z}, {"tfs_z", slot.sparams.tfs_z},
{"typical_p", slot.sparams.typ_p}, {"typical_p", slot.sparams.typ_p},
{"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_last_n", slot.sparams.penalty_last_n},
@ -1334,6 +1294,7 @@ struct server_context {
{"tokens_evaluated", slot.n_prompt_tokens}, {"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)}, {"generation_settings", get_formated_generation(slot)},
{"prompt", slot.prompt}, {"prompt", slot.prompt},
{"has_new_line", slot.has_new_line},
{"truncated", slot.truncated}, {"truncated", slot.truncated},
{"stopped_eos", slot.stopped_eos}, {"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word}, {"stopped_word", slot.stopped_word},
@ -1483,9 +1444,8 @@ struct server_context {
if (prompt.is_string() || json_is_array_of_numbers(prompt)) { if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
data["index"] = 0; data["index"] = 0;
create_task(data, false, nullptr); create_task(data, false, nullptr);
} } else if (prompt.is_array()) {
// otherwise, it's a multiple-prompt task, we break it into smaller tasks // otherwise, it's a multiple-prompt task, we break it into smaller tasks
else if (prompt.is_array()) {
std::vector<json> prompts = prompt; std::vector<json> prompts = prompt;
if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
// prompts[0] is the question // prompts[0] is the question
@ -1510,9 +1470,8 @@ struct server_context {
} }
} }
} }
} } else {
// invalid case // invalid case
else {
throw std::runtime_error(error_msg); throw std::runtime_error(error_msg);
} }
@ -1662,6 +1621,7 @@ struct server_context {
slot_data["prompt"] = slot.prompt; slot_data["prompt"] = slot.prompt;
slot_data["next_token"] = { slot_data["next_token"] = {
{"has_next_token", slot.has_next_token}, {"has_next_token", slot.has_next_token},
{"has_new_line", slot.has_new_line},
{"n_remain", slot.n_remaining}, {"n_remain", slot.n_remaining},
{"n_decoded", slot.n_decoded}, {"n_decoded", slot.n_decoded},
{"stopped_eos", slot.stopped_eos}, {"stopped_eos", slot.stopped_eos},
@ -1785,6 +1745,9 @@ struct server_context {
} }
slot->cache_tokens.resize(token_count); slot->cache_tokens.resize(token_count);
// TODO: maybe detokenize the slot->cache_tokens instead?
slot->prompt = string_format("[restored %d tokens from file]", (int) token_count);
const int64_t t_end = ggml_time_us(); const int64_t t_end = ggml_time_us();
const double t_restore_ms = (t_end - t_start) / 1000.0; const double t_restore_ms = (t_end - t_start) / 1000.0;
@ -1859,12 +1822,8 @@ struct server_context {
} }
if (all_idle) { if (all_idle) {
if (system_need_update) {
system_prompt_update();
}
SRV_INF("%s", "all slots are idle\n"); SRV_INF("%s", "all slots are idle\n");
if (system_prompt.empty() && clean_kv_cache) { if (clean_kv_cache) {
kv_cache_clear(); kv_cache_clear();
} }
@ -1885,38 +1844,36 @@ struct server_context {
// apply context-shift if needed // apply context-shift if needed
// TODO: simplify and improve // TODO: simplify and improve
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
if (slot.ga_n == 1) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { if (!params.ctx_shift) {
if (!params.ctx_shift) { // this check is redundant (for good)
// this check is redundant (for good) // we should never get here, because generation should already stopped in process_token()
// we should never get here, because generation should already stopped in process_token() slot.release();
slot.release(); send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); continue;
continue;
}
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
}
slot.n_past -= n_discard;
slot.truncated = true;
} }
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = slot.n_past - n_keep;
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
}
slot.n_past -= n_discard;
slot.truncated = true;
} }
} }
@ -1931,11 +1888,7 @@ struct server_context {
slot.i_batch = batch.n_tokens; slot.i_batch = batch.n_tokens;
const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
// TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow
common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
slot.n_past += 1; slot.n_past += 1;
@ -1943,8 +1896,8 @@ struct server_context {
slot.cache_tokens.push_back(slot.sampled); slot.cache_tokens.push_back(slot.sampled);
} }
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
} }
// process in chunks of params.n_batch // process in chunks of params.n_batch
@ -1971,63 +1924,126 @@ struct server_context {
slot.t_start_process_prompt = ggml_time_us(); slot.t_start_process_prompt = ggml_time_us();
slot.t_start_generation = 0; slot.t_start_generation = 0;
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) { switch (slot.cmpl_type) {
const bool add_bos = llama_add_bos_token(model); case SERVER_TASK_CMPL_TYPE_NORMAL:
bool suff_rm_leading_spc = true; case SERVER_TASK_CMPL_TYPE_EMBEDDING:
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { {
params.input_suffix.erase(0, 1); prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
suff_rm_leading_spc = false; } break;
} case SERVER_TASK_CMPL_TYPE_RERANK:
{
// require slot.prompt to be array of 2 strings
if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
slot.release();
send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
continue;
}
auto prefix_tokens = tokenize(slot.params.input_prefix, false); // prompt: [BOS]query[EOS][SEP]doc[EOS]
auto suffix_tokens = tokenize(slot.params.input_suffix, false); prompt_tokens.clear();
prompt_tokens.push_back(llama_token_bos(model));
{
const auto part = tokenize(slot.prompt[0], false, false);
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
}
prompt_tokens.push_back(llama_token_eos(model));
prompt_tokens.push_back(llama_token_sep(model));
{
const auto part = tokenize(slot.prompt[1], false, false);
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
}
prompt_tokens.push_back(llama_token_eos(model));
} break;
case SERVER_TASK_CMPL_TYPE_INFILL:
{
// TODO: optimize this block by reducing memory allocations and movement
const int space_token = 29871; // TODO: this should not be hardcoded // use FIM repo-level pattern:
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { // ref: https://arxiv.org/pdf/2409.12186
suffix_tokens.erase(suffix_tokens.begin()); //
} // [FIM_REP]myproject
// [FIM_SEP]filename0
// extra chunk 0
// [FIM_SEP]filename1
// extra chunk 1
// ...
// [FIM_SEP]filename
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
//
auto tokens_prefix = tokenize(slot.input_prefix, false, false);
auto tokens_suffix = tokenize(slot.input_suffix, false, false);
auto tokens_prompt = tokenize(slot.prompt, false, false);
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); slot.extra_tokens.clear();
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
static const auto k_fim_repo = tokenize("myproject\n", false, false);
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; slot.extra_tokens.push_back(llama_token_fim_rep(model));
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
if (add_bos) { }
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_middle(model); for (const auto & chunk : slot.input_extra) {
if (middle_token >= 0) { // { "text": string, "filename": string }
embd_inp.push_back(middle_token); const std::string text = chunk.value("text", "");
} const std::string filename = chunk.value("filename", "tmp");
prompt_tokens = embd_inp; if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
} else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { const auto k_fim_file = tokenize(filename + "\n", false, false);
// require slot.prompt to be array of 2 strings
if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
slot.release();
send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST);
continue;
}
// prompt: [BOS]query[EOS][SEP]doc[EOS] slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
prompt_tokens.clear(); slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
prompt_tokens.push_back(llama_token_bos(model)); } else {
{ // chunk separator in binary form to avoid confusing the AI
const auto part = tokenize(slot.prompt[0], false); static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
}
prompt_tokens.push_back(llama_token_eos(model)); slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
prompt_tokens.push_back(llama_token_sep(model)); }
{
const auto part = tokenize(slot.prompt[1], false); const auto chunk_tokens = tokenize(text, false, false);
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
} }
prompt_tokens.push_back(llama_token_eos(model));
} else { if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt // TODO: current filename
static const auto k_fim_file = tokenize("filename\n", false, false);
slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
}
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
const int n_suffix_take = std::min<int>(tokens_suffix.size(), (n_batch/4));
const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
// fill the rest of the context with extra chunks
const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
tokens_suffix.resize(n_suffix_take);
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
if (llama_add_bos_token(model)) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
// put the extra context before the FIM prefix
embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
embd_inp.push_back(llama_token_fim_mid(model));
prompt_tokens = std::move(embd_inp);
} break;
} }
slot.n_past = 0; slot.n_past = 0;
@ -2035,6 +2051,19 @@ struct server_context {
SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
// print prompt tokens (for debugging)
if (1) {
// first 16 tokens (avoid flooding logs)
for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
}
} else {
// all
for (int i = 0; i < (int) prompt_tokens.size(); i++) {
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
}
}
// empty prompt passed -> release the slot and send empty response // empty prompt passed -> release the slot and send empty response
if (prompt_tokens.empty()) { if (prompt_tokens.empty()) {
SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
@ -2055,7 +2084,9 @@ struct server_context {
} else { } else {
if (!params.ctx_shift) { if (!params.ctx_shift) {
// if context shift is disabled, we make sure prompt size is smaller than KV size // if context shift is disabled, we make sure prompt size is smaller than KV size
if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) { // TODO: there should be a separate parameter that control prompt truncation
// context shift should be applied only during the generation phase
if (slot.n_prompt_tokens >= slot.n_ctx) {
slot.release(); slot.release();
send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST); send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
continue; continue;
@ -2066,8 +2097,8 @@ struct server_context {
} }
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
// if input prompt is too big, truncate it (if group attention self-extend is disabled) // if input prompt is too big, truncate it
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { if (slot.n_prompt_tokens >= slot.n_ctx) {
const int n_left = slot.n_ctx - slot.params.n_keep; const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2; const int n_block_size = n_left / 2;
@ -2094,19 +2125,61 @@ struct server_context {
common_sampler_reset(slot.smpl); common_sampler_reset(slot.smpl);
if (!slot.params.cache_prompt) { if (slot.params.cache_prompt) {
slot.n_past_se = 0;
slot.ga_i = 0;
} else {
GGML_ASSERT(slot.ga_n == 1);
// reuse any previously computed tokens that are common with the new prompt // reuse any previously computed tokens that are common with the new prompt
slot.n_past = common_part(slot.cache_tokens, prompt_tokens); slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
// push the prompt into the sampling context (do not apply grammar) // push the prompt into the sampling context (do not apply grammar)
for (int i = 0; i < slot.n_past; ++i) { for (int i = 0; i < slot.n_past; ++i) {
common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
} }
// reuse chunks from the cached prompt by shifting their KV cache in the new position
if (params.n_cache_reuse > 0) {
size_t head_c = slot.n_past; // cache
size_t head_p = slot.n_past; // current prompt
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
while (head_c < slot.cache_tokens.size() &&
head_p < prompt_tokens.size()) {
size_t n_match = 0;
while (head_c + n_match < slot.cache_tokens.size() &&
head_p + n_match < prompt_tokens.size() &&
slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
n_match++;
}
if (n_match >= (size_t) params.n_cache_reuse) {
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
//for (size_t i = head_p; i < head_p + n_match; i++) {
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
//}
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
for (size_t i = 0; i < n_match; i++) {
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
slot.n_past++;
}
head_c += n_match;
head_p += n_match;
} else {
head_c += 1;
}
}
SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
}
} }
} }
@ -2115,9 +2188,6 @@ struct server_context {
SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
slot.n_past--; slot.n_past--;
if (slot.ga_i > 0) {
slot.n_past_se--;
}
} }
slot.n_prompt_tokens_processed = 0; slot.n_prompt_tokens_processed = 0;
@ -2143,55 +2213,31 @@ struct server_context {
} }
// keep only the common part // keep only the common part
int p0 = (int) system_tokens.size() + slot.n_past; if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
// could not partially delete (likely using a non-Transformer model) // could not partially delete (likely using a non-Transformer model)
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
p0 = (int) system_tokens.size(); // there is no common part left
if (p0 != 0) {
// copy over the system prompt when there is one
llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
}
// there is no common part left (except for the system prompt)
slot.n_past = 0; slot.n_past = 0;
slot.n_past_se = 0;
slot.ga_i = 0;
// TODO: is the system prompt ever in the sampling context?
common_sampler_reset(slot.smpl); common_sampler_reset(slot.smpl);
} }
SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
// remove the non-common part from the cache // remove the non-common part from the cache
slot.cache_tokens.resize(slot.n_past); slot.cache_tokens.resize(slot.n_past);
SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
int32_t ga_i = slot.ga_i;
int32_t ga_n = slot.ga_n;
int32_t ga_w = slot.ga_w;
// add prompt tokens for processing in the current batch // add prompt tokens for processing in the current batch
// TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) { common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
if (slot.ga_n != 1) {
while (slot_npast >= ga_i + ga_w) {
const int bd = (ga_w/ga_n)*(ga_n - 1);
slot_npast -= bd;
ga_i += ga_w/ga_n;
}
}
common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
if (slot.params.cache_prompt) { if (slot.params.cache_prompt) {
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
} }
slot.n_prompt_tokens_processed++; slot.n_prompt_tokens_processed++;
slot_npast++; slot.n_past++;
} }
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
@ -2232,34 +2278,6 @@ struct server_context {
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
for (auto & slot : slots) {
if (slot.ga_n != 1) {
// context extension via Self-Extend
// TODO: simplify and/or abstract this
while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd;
slot.ga_i += slot.ga_w / slot.ga_n;
SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
slot.n_past_se += n_tokens;
}
}
llama_batch batch_view = { llama_batch batch_view = {
n_tokens, n_tokens,
batch.token + i, batch.token + i,
@ -2414,10 +2432,6 @@ int main(int argc, char ** argv) {
// struct that contains llama context and inference // struct that contains llama context and inference
server_context ctx_server; server_context ctx_server;
if (!params.system_prompt.empty()) {
ctx_server.system_prompt_set(params.system_prompt);
}
if (params.model_alias == "unknown") { if (params.model_alias == "unknown") {
params.model_alias = params.model; params.model_alias = params.model;
} }
@ -2845,7 +2859,6 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
json data = { json data = {
{ "system_prompt", ctx_server.system_prompt },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel }, { "total_slots", ctx_server.params.n_parallel },
{ "chat_template", llama_get_chat_template(ctx_server.model) }, { "chat_template", llama_get_chat_template(ctx_server.model) },
@ -2861,10 +2874,8 @@ int main(int argc, char ** argv) {
} }
json data = json::parse(req.body); json data = json::parse(req.body);
if (data.contains("system_prompt")) {
std::string system_prompt = data.at("system_prompt"); // update any props here
ctx_server.system_prompt_set(system_prompt);
}
res_ok(res, {{ "success", true }}); res_ok(res, {{ "success", true }});
}; };
@ -2924,7 +2935,23 @@ int main(int argc, char ** argv) {
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
}; };
const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
std::string err;
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "prefix token is missing. ";
}
if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "suffix token is missing. ";
}
if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
err += "middle token is missing. ";
}
if (!err.empty()) {
res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
return;
}
json data = json::parse(req.body); json data = json::parse(req.body);
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
}; };
@ -3010,7 +3037,8 @@ int main(int argc, char ** argv) {
if (body.count("content") != 0) { if (body.count("content") != 0) {
const bool add_special = json_value(body, "add_special", false); const bool add_special = json_value(body, "add_special", false);
const bool with_pieces = json_value(body, "with_pieces", false); const bool with_pieces = json_value(body, "with_pieces", false);
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
if (with_pieces) { if (with_pieces) {
for (const auto& token : tokens) { for (const auto& token : tokens) {
@ -3361,6 +3389,7 @@ int main(int argc, char ** argv) {
ctx_server.queue_tasks.on_new_task(std::bind( ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1)); &server_context::process_single_task, &ctx_server, std::placeholders::_1));
ctx_server.queue_tasks.on_update_slots(std::bind( ctx_server.queue_tasks.on_update_slots(std::bind(
&server_context::update_slots, &ctx_server)); &server_context::update_slots, &ctx_server));

View file

@ -13,6 +13,10 @@ Feature: llama.cpp server
And 32 as batch size And 32 as batch size
And 2 slots And 2 slots
# the prompt is 301 tokens
# the slot context is 256/2 = 128 tokens
# the prompt is truncated to keep the last 109 tokens
# 64 tokens are generated thanks to shifting the context when it gets full
Scenario: Inference with context shift Scenario: Inference with context shift
And 64 server max tokens to predict And 64 server max tokens to predict
Then the server is starting Then the server is starting

View file

@ -195,14 +195,14 @@ static std::string gen_chatcmplid() {
// other common utils // other common utils
// //
static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) { static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
size_t i; size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
return i; return i;
} }
static size_t common_part(const std::string & a, const std::string & b) { static size_t longest_common_prefix(const std::string & a, const std::string & b) {
size_t i; size_t i;
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse(
// Handle "logprobs" field // Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
if (body.contains("logprobs")) { if (json_value(body, "logprobs", false)) {
llama_params["n_probs"] = json_value(body, "top_logprobs", 20); llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
} else if (body.contains("top_logprobs")) { } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
throw std::runtime_error("top_logprobs requires logprobs to be set to true"); throw std::runtime_error("top_logprobs requires logprobs to be set to true");
} }

6
flake.lock generated
View file

@ -20,11 +20,11 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1728018373, "lastModified": 1728492678,
"narHash": "sha256-NOiTvBbRLIOe5F6RbHaAh6++BNjsb149fGZd1T4+KBg=", "narHash": "sha256-9UTxR8eukdg+XZeHgxW5hQA9fIKHsKCdOIUycTryeVw=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "bc947f541ae55e999ffdb4013441347d83b00feb", "rev": "5633bcff0c6162b9e4b5f1264264611e950c8ec7",
"type": "github" "type": "github"
}, },
"original": { "original": {

View file

@ -348,7 +348,6 @@ struct tensor_alloc {
}; };
struct leaf_alloc { struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf; struct tensor_alloc leaf;
}; };
@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
for (int i = 0; i < graph->n_leafs; i++) { for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i]; struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) { if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;

View file

@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
ggml_backend_cann_buffer_types[i] = { ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface, /* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ nullptr,
/* .context = */ /* .context = */
new ggml_backend_cann_buffer_type_context{ new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)}, i, "CANN" + std::to_string(i)},
@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
ACL_CHECK(aclrtCreateEvent(&event)); ACL_CHECK(aclrtCreateEvent(&event));
return new ggml_backend_event{ return new ggml_backend_event{
/* .backend = */ backend, /* .device = */ nullptr,
/* .context = */ event, /* .context = */ event,
}; };
} }
@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
* *
* @param event Pointer to the event structure to be recorded. * @param event Pointer to the event structure to be recorded.
*/ */
static void ggml_backend_cann_event_record(ggml_backend_event_t event) { static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx = ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)event->backend->context; (ggml_backend_cann_context*)backend->context;
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
} }
@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ggml_backend_event_t event) { ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx = ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context; (ggml_backend_cann_context*)backend->context;
if (ggml_backend_is_cann(backend)) {
if (ggml_backend_is_cann(event->backend)) {
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context)); (aclrtEvent)event->context));
} else { } else {

View file

@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const half * x = (const half *) vx; const half * x = (const half *) vx;
// load 2 halfs into register in a single instruction
const half2 x_reg = *((half2 *) &(x[ib + iqs]));
// automatic half -> float type cast if dfloat == float // automatic half -> float type cast if dfloat == float
v.x = x[ib + iqs + 0]; v.x = __low2float(x_reg);
v.y = x[ib + iqs + 1]; v.y = __high2float(x_reg);
} }
static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
// matrix multiplication // matrix multiplication
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
#ifdef GGML_CUDA_F16 #ifdef GGML_CUDA_F16
tmp += __hmul2(v, { if ( y_offset == 1 ) {
y[iybs + iqs + j/qr + 0], // load 2 dfloats into register in a single instruction
y[iybs + iqs + j/qr + y_offset] const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
}); tmp += __hmul2(v, y_reg);
}
else {
tmp += __hmul2(v, {
y[iybs + iqs + j/qr + 0],
y[iybs + iqs + j/qr + y_offset]
});
}
#else #else
tmp += v.x * y[iybs + iqs + j/qr + 0]; if ( y_offset == 1 ) {
tmp += v.y * y[iybs + iqs + j/qr + y_offset]; // load 2 dfloats into register in a single instruction
const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
tmp += v.x * y_reg.x;
tmp += v.y * y_reg.y;
}
else {
tmp += v.x * y[iybs + iqs + j/qr + 0];
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
}
#endif // GGML_CUDA_F16 #endif // GGML_CUDA_F16
} }
} }

View file

@ -152,6 +152,8 @@ class Keys:
MERGES = "tokenizer.ggml.merges" MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id" BOS_ID = "tokenizer.ggml.bos_token_id"
EOS_ID = "tokenizer.ggml.eos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
EOM_ID = "tokenizer.ggml.eom_token_id"
UNK_ID = "tokenizer.ggml.unknown_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id" PAD_ID = "tokenizer.ggml.padding_token_id"
@ -168,11 +170,16 @@ class Keys:
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
CHAT_TEMPLATES = "tokenizer.chat_templates" CHAT_TEMPLATES = "tokenizer.chat_templates"
# FIM/Infill special tokens constants # FIM/Infill special tokens constants
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
# deprecated:
PREFIX_ID = "tokenizer.ggml.prefix_token_id" PREFIX_ID = "tokenizer.ggml.prefix_token_id"
SUFFIX_ID = "tokenizer.ggml.suffix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
MIDDLE_ID = "tokenizer.ggml.middle_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id"
EOM_ID = "tokenizer.ggml.eom_token_id"
class Adapter: class Adapter:
TYPE = "adapter.type" TYPE = "adapter.type"
@ -1579,6 +1586,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
@ -1586,8 +1595,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
# deprecated
KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID

View file

@ -843,15 +843,6 @@ class GGUFWriter:
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_prefix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
def add_suffix_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
def add_middle_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
def add_eot_token_id(self, id: int) -> None: def add_eot_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.EOT_ID, id) self.add_uint32(Keys.Tokenizer.EOT_ID, id)

View file

@ -897,6 +897,7 @@ extern "C" {
// Special tokens // Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@ -905,11 +906,17 @@ extern "C" {
LLAMA_API bool llama_add_bos_token(const struct llama_model * model); LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model); LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
// Codellama infill tokens // infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
// //
// Tokenization // Tokenization
@ -946,6 +953,12 @@ extern "C" {
int32_t lstrip, int32_t lstrip,
bool special); bool special);
// check if token0 is contained as a prefix in token1
LLAMA_API bool llama_token_is_prefix(
const struct llama_model * model,
llama_token token0,
llama_token token1);
/// @details Convert the provided tokens into text (inverse of llama_tokenize()). /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
/// @param text The char pointer must be large enough to hold the resulting text. /// @param text The char pointer must be large enough to hold the resulting text.
/// @return Returns the number of chars/bytes on success, no more than text_len_max. /// @return Returns the number of chars/bytes on success, no more than text_len_max.
@ -1094,6 +1107,9 @@ extern "C" {
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -1138,6 +1154,28 @@ extern "C" {
int32_t n_logit_bias, int32_t n_logit_bias,
const llama_logit_bias * logit_bias); const llama_logit_bias * logit_bias);
// this sampler is meant to be used for fill-in-the-middle infilling
// it's supposed to be used after top_k + top_p sampling
//
// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
// 2. combine probs of tokens that have the same prefix
//
// example:
//
// - before:
// "hel": 0.5
// "hell": 0.2
// "hello": 0.1
// "dummy": 0.1
//
// - after:
// "hel": 0.8
// "dummy": 0.1
//
// 3. discard non-EOG tokens with low prob
// 4. if no tokens are left -> pick EOT
//
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);

View file

@ -1 +1 @@
564f42082f858f9674b2a2e06e9e779d9ed2c754 2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b

View file

@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
}; };
} }
// xtc
struct llama_sampler_xtc {
const float probability;
const float threshold;
const size_t min_keep;
const uint32_t seed;
uint32_t seed_cur;
std::mt19937 rng;
};
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
return "xtc";
}
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
if (ctx->probability <= 0.0f
|| ctx->threshold > 0.5f
|| cur_p->size < 2) {
return;
}
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
float chance = distribution(ctx->rng);
if (chance > ctx->probability) return;
// in case it's not sorted/recalculated yet
llama_sampler_softmax_impl(cur_p);
int pos_last = 0;
for (size_t i = 0; i < cur_p->size; ++i) {
if (cur_p->data[i].p >= ctx->threshold) {
pos_last = i;
} else break;
}
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
cur_p->data += pos_last;
cur_p->size -= pos_last;
}
}
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
// copy the state
{
auto * result_ctx = (llama_sampler_xtc *) result->ctx;
result_ctx->rng = ctx->rng;
}
return result;
}
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
delete (llama_sampler_xtc *) smpl->ctx;
}
static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
ctx->seed_cur = get_rng_seed(ctx->seed);
ctx->rng.seed(ctx->seed_cur);
}
static struct llama_sampler_i llama_sampler_xtc_i = {
/* .name = */ llama_sampler_xtc_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sample_xtc_apply,
/* .reset = */ llama_sampler_xtc_reset,
/* .clone = */ llama_sampler_xtc_clone,
/* .free = */ llama_sampler_xtc_free,
};
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
auto seed_cur = get_rng_seed(seed);
return new llama_sampler {
/* .iface = */ &llama_sampler_xtc_i,
/* .ctx = */ new llama_sampler_xtc {
/* .probability = */ p,
/* .threshold = */ t,
/* .min_keep = */ min_keep,
/* .seed = */ seed,
/* .seed_cur = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),
},
};
}
// mirostat // mirostat
struct llama_sampler_mirostat { struct llama_sampler_mirostat {
@ -1644,6 +1739,207 @@ struct llama_sampler * llama_sampler_init_logit_bias(
}; };
} }
// infill
//#define GGML_DEBUG_SAMPLER_INFILL
struct llama_sampler_infill {
const struct llama_vocab * vocab;
};
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
return "infill";
}
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
#else
#define LOG_DBG_CUR(...)
#endif
for (size_t i = 0; i < cur_p->size; ++i) {
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
float p_txt_sum = 0.0f;
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
}
}
const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
if (3*p_eog_sum*cur_p->size > p_txt_sum) {
LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
// keep just the EOG tokens
const auto size_org = cur_p->size;
cur_p->size = 0;
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
}
return;
}
size_t n_combined = 0; GGML_UNUSED(n_combined);
// combine tokens with common prefix
for (size_t i = 0; i < cur_p->size; ++i) {
for (size_t j = 0; j < cur_p->size; ++j) {
if (cur_p->data[i].logit == -INFINITY) {
break;
}
if (i == j || cur_p->data[j].logit == -INFINITY) {
continue;
}
if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) {
if (cur_p->data[i].p > cur_p->data[j].p) {
cur_p->data[i].p += cur_p->data[j].p;
cur_p->data[j].logit = -INFINITY;
cur_p->data[j].p = 0.0f;
} else {
cur_p->data[j].p += cur_p->data[i].p;
cur_p->data[i].logit = -INFINITY;
cur_p->data[i].p = 0.0f;
}
n_combined++;
}
}
}
size_t n_non_eog = 0;
size_t size_org = cur_p->size;
float p_sum = 0.0f;
float thold = 0.2f;
cur_p->size = 0;
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
if (!is_eog) {
++n_non_eog;
}
p_sum += cur_p->data[i].p;
// keep this token
cur_p->data[cur_p->size++] = cur_p->data[i];
}
LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
cur_p->data[0].logit = 1.0f;
return;
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
size_org = cur_p->size;
p_sum = 0.0f;
thold = 1.0/(n_non_eog + 1);
cur_p->size = 0;
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
#undef LOG_DBG_CUR
}
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
return llama_sampler_init_infill_impl(*ctx->vocab);
}
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
delete (llama_sampler_infill *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_infill_i = {
/* .name = */ llama_sampler_infill_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_infill_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_infill_clone,
/* .free = */ llama_sampler_infill_free,
};
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab) {
return new llama_sampler {
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ &vocab,
},
};
}
// utils // utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {

View file

@ -4,8 +4,6 @@
#include "llama-grammar.h" #include "llama-grammar.h"
#include <unordered_map>
struct llama_vocab; struct llama_vocab;
struct llama_grammar; struct llama_grammar;
@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab, const struct llama_vocab & vocab,
const char * grammar_str, const char * grammar_str,
const char * grammar_root); const char * grammar_root);
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab);

View file

@ -1663,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
return vocab.special_eos_id; return vocab.special_eos_id;
} }
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
}
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) { llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
return vocab.special_cls_id; return vocab.special_cls_id;
} }
@ -1688,23 +1696,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
} }
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) { llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
return vocab.special_prefix_id; return vocab.special_fim_pre_id;
} }
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) { llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
return vocab.special_middle_id; return vocab.special_fim_mid_id;
} }
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) { llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
return vocab.special_suffix_id; return vocab.special_fim_suf_id;
} }
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id; return vocab.special_fim_pre_id;
} }
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id; return vocab.special_fim_suf_id;
}
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
}
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pad_id;
}
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_rep_id;
}
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_sep_id;
} }
int32_t llama_tokenize_impl( int32_t llama_tokenize_impl(
@ -1834,6 +1858,23 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
return 0; return 0;
} }
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1) {
char text_buf_0[128];
char text_buf_1[128];
const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false);
const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false);
if (len0 <= 0 || len1 <= 0) {
return false;
}
return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0;
}
int32_t llama_detokenize_impl( int32_t llama_detokenize_impl(
const struct llama_vocab & vocab, const struct llama_vocab & vocab,
const llama_token * tokens, const llama_token * tokens,

View file

@ -37,20 +37,26 @@ struct llama_vocab {
std::map<std::pair<std::string, std::string>, int> bpe_ranks; std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens // default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1; id special_bos_id = 1;
id special_eos_id = 2; id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0; id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL; id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; id special_cls_id = LLAMA_TOKEN_NULL;
id special_mask_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13; id linefeed_id = 13;
id special_prefix_id = LLAMA_TOKEN_NULL;
id special_suffix_id = LLAMA_TOKEN_NULL; // fim tokens
id special_middle_id = LLAMA_TOKEN_NULL; id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_eot_id = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL; id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// set of all tokens that cause "end of generation" // set of all tokens that cause "end of generation"
std::set<id> special_eog_ids; std::set<id> special_eog_ids;
@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
llama_token llama_token_bos_impl(const struct llama_vocab & vocab); llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab); llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab); llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
llama_token llama_token_eom_impl (const struct llama_vocab & vocab); llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
int32_t llama_tokenize_impl( int32_t llama_tokenize_impl(
const struct llama_vocab & vocab, const struct llama_vocab & vocab,
@ -136,6 +149,12 @@ int32_t llama_token_to_piece_impl(
int32_t lstrip, int32_t lstrip,
bool special); bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl( int32_t llama_detokenize_impl(
const struct llama_vocab & vocab, const struct llama_vocab & vocab,
const llama_token * tokens, const llama_token * tokens,

View file

@ -345,6 +345,8 @@ enum llm_kv {
LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID, LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID, LLM_KV_TOKENIZER_EOS_ID,
LLM_KV_TOKENIZER_EOT_ID,
LLM_KV_TOKENIZER_EOM_ID,
LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID, LLM_KV_TOKENIZER_PAD_ID,
@ -357,14 +359,20 @@ enum llm_kv {
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_FIM_MID_ID,
LLM_KV_TOKENIZER_EOT_ID, LLM_KV_TOKENIZER_FIM_PAD_ID,
LLM_KV_TOKENIZER_EOM_ID, LLM_KV_TOKENIZER_FIM_REP_ID,
LLM_KV_TOKENIZER_FIM_SEP_ID,
LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA, LLM_KV_ADAPTER_LORA_ALPHA,
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID,
LLM_KV_TOKENIZER_MIDDLE_ID,
}; };
static const std::map<llm_kv, const char *> LLM_KV_NAMES = { static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@ -422,57 +430,65 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_SPLIT_NO, "split.no" }, { LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" }, { LLM_KV_SPLIT_COUNT, "split.count" },
{ LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
{ LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
{ LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
{ LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
{ LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
{ LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
{ LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
}; };
struct LLM_KV { struct LLM_KV {
@ -6151,14 +6167,14 @@ static void llm_load_vocab(
vocab.type = LLAMA_VOCAB_TYPE_NONE; vocab.type = LLAMA_VOCAB_TYPE_NONE;
// default special tokens // default special tokens
vocab.special_bos_id = -1; vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = -1; vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = -1; vocab.special_unk_id = LLAMA_TOKEN_NULL;
vocab.special_sep_id = -1; vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = -1; vocab.special_pad_id = LLAMA_TOKEN_NULL;
vocab.special_cls_id = -1; vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = -1; vocab.special_mask_id = LLAMA_TOKEN_NULL;
vocab.linefeed_id = -1; vocab.linefeed_id = LLAMA_TOKEN_NULL;
// read vocab size from metadata // read vocab size from metadata
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
@ -6175,16 +6191,16 @@ static void llm_load_vocab(
vocab.special_bos_id = 1; vocab.special_bos_id = 1;
vocab.special_eos_id = 2; vocab.special_eos_id = 2;
vocab.special_unk_id = 0; vocab.special_unk_id = 0;
vocab.special_sep_id = -1; vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = -1; vocab.special_pad_id = LLAMA_TOKEN_NULL;
vocab.special_cls_id = -1; vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = -1; vocab.special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "bert") { } else if (tokenizer_model == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM; vocab.type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens // default special tokens
vocab.special_bos_id = -1; vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = -1; vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = 100; vocab.special_unk_id = 100;
vocab.special_sep_id = 102; vocab.special_sep_id = 102;
vocab.special_pad_id = 0; vocab.special_pad_id = 0;
@ -6220,22 +6236,22 @@ static void llm_load_vocab(
// default special tokens // default special tokens
vocab.special_bos_id = 11; vocab.special_bos_id = 11;
vocab.special_eos_id = 11; vocab.special_eos_id = 11;
vocab.special_unk_id = -1; vocab.special_unk_id = LLAMA_TOKEN_NULL;
vocab.special_sep_id = -1; vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = -1; vocab.special_pad_id = LLAMA_TOKEN_NULL;
vocab.special_cls_id = -1; vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = -1; vocab.special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "t5") { } else if (tokenizer_model == "t5") {
vocab.type = LLAMA_VOCAB_TYPE_UGM; vocab.type = LLAMA_VOCAB_TYPE_UGM;
// default special tokens // default special tokens
vocab.special_bos_id = -1; vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = 1; vocab.special_eos_id = 1;
vocab.special_unk_id = 2; vocab.special_unk_id = 2;
vocab.special_sep_id = -1; vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = 0; vocab.special_pad_id = 0;
vocab.special_cls_id = -1; vocab.special_cls_id = LLAMA_TOKEN_NULL;
vocab.special_mask_id = -1; vocab.special_mask_id = LLAMA_TOKEN_NULL;
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) { if (precompiled_charsmap_keyidx != -1) {
@ -6258,11 +6274,11 @@ static void llm_load_vocab(
vocab.type = LLAMA_VOCAB_TYPE_RWKV; vocab.type = LLAMA_VOCAB_TYPE_RWKV;
// default special tokens // default special tokens
vocab.special_bos_id = -1; vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = -1; vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = -1; vocab.special_unk_id = LLAMA_TOKEN_NULL;
vocab.special_sep_id = -1; vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = -1; vocab.special_pad_id = LLAMA_TOKEN_NULL;
} else { } else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
} }
@ -6346,7 +6362,7 @@ static void llm_load_vocab(
} else if ( } else if (
tokenizer_pre == "chatglm-bpe") { tokenizer_pre == "chatglm-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
vocab.special_bos_id = -1; vocab.special_bos_id = LLAMA_TOKEN_NULL;
} else if ( } else if (
tokenizer_pre == "viking") { tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
@ -6472,44 +6488,6 @@ static void llm_load_vocab(
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
// For Fill-In-the-Middle (FIM)/infill models which where converted
// prior to support of FIM special tokens in GGUF, the following
// will allow those models to continue to work. The general names
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
// new versions of these models have been published.
std::string gen_name;
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
[](unsigned char c){ return std::tolower(c); });
if (gen_name.find("code") != std::string::npos) {
if (model.arch == LLM_ARCH_LLAMA
&& 32010 < vocab.id_to_token.size()
&& vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
&& vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
&& vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
&& vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
vocab.special_prefix_id = 32007;
vocab.special_suffix_id = 32008;
vocab.special_middle_id = 32009;
vocab.special_eot_id = 32010;
} else if (model.arch == LLM_ARCH_GEMMA
&& 107 < vocab.id_to_token.size()
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
&& vocab.id_to_token[68].text == "<|fim_middle|>"
&& vocab.id_to_token[107].text == "<end_of_turn>") {
vocab.special_prefix_id = 67;
vocab.special_suffix_id = 69;
vocab.special_middle_id = 68;
// TODO: this is not EOT, it is "file separator" token, needs fix
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
//vocab.special_eot_id = 70;
vocab.special_eot_id = 107;
}
}
try { try {
vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n'); vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
} catch (const std::exception & e) { } catch (const std::exception & e) {
@ -6537,18 +6515,26 @@ static void llm_load_vocab(
// special tokens // special tokens
{ {
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = { const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id }, { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id }, { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id }, { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
{ LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id }, { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
{ LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
// deprecated
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
}; };
for (const auto & it : special_token_types) { for (const auto & it : special_token_types) {
@ -6579,46 +6565,140 @@ static void llm_load_vocab(
} }
} }
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc. // auto-detect special tokens by text
// // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID // for now, we apply this workaround to find the tokens based on their text
// for now, we apply this workaround to find the EOT token based on its text
if (vocab.special_eot_id == -1) { for (const auto & t : vocab.token_to_id) {
for (const auto & t : vocab.token_to_id) { // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
if (false if (false
// TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
// need to fix convert script
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|| t.first == "<|eot_id|>" || t.first == "<|eot_id|>"
|| t.first == "<|im_end|>" || t.first == "<|im_end|>"
|| t.first == "<|end|>" || t.first == "<|end|>"
|| t.first == "<end_of_turn>" || t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>" || t.first == "<|endoftext|>"
|| t.first == "<EOT>" || t.first == "<EOT>"
|| t.first == "<end▁of▁sentence>" // DeepSeek
) { ) {
vocab.special_eot_id = t.second; vocab.special_eot_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.first.c_str()); __func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
break;
} }
} }
}
// find EOM token: "<|eom_id|>" // find EOM token: "<|eom_id|>"
// if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID if (false
// for now, we apply this workaround to find the EOM token based on its text || t.first == "<|eom_id|>"
if (vocab.special_eom_id == -1) { ) {
const auto & t = vocab.token_to_id.find("<|eom_id|>"); vocab.special_eom_id = t.second;
if (t != vocab.token_to_id.end()) { if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
vocab.special_eom_id = t->second; LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { __func__, t.second, t.first.c_str());
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
__func__, t->first.c_str()); }
vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL; }
}
// find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_prefix|>" // Qwen
|| t.first == "<fim-prefix>"
|| t.first == "<fim▁begin>" // DeepSeek
|| t.first == "<PRE>"
) {
vocab.special_fim_pre_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_suffix|>" // Qwen
|| t.first == "<fim-suffix>"
|| t.first == "<fim▁hole>" // DeepSeek
|| t.first == "<SUF>"
) {
vocab.special_fim_suf_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_middle|>" // Qwen
|| t.first == "<fim-middle>"
|| t.first == "<fim▁end>" // DeepSeek
|| t.first == "<MID>"
) {
vocab.special_fim_mid_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_pad|>" // Qwen
|| t.first == "<fim-pad>"
|| t.first == "<PAD>"
) {
vocab.special_fim_pad_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|fim_repo|>" // Qwen
|| t.first == "<|repo_name|>"
|| t.first == "<fim-repo>"
|| t.first == "<REPO>"
) {
vocab.special_fim_rep_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
}
// find FIM_SEP token: "<|file_sep|>"
if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
if (false
|| t.first == "<|file_sep|>" // Qwen
) {
vocab.special_fim_sep_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} }
} }
} }
@ -6627,6 +6707,19 @@ static void llm_load_vocab(
// this is currently determined based on the token text, which is obviously not ideal // this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606 // ref: https://github.com/ggerganov/llama.cpp/issues/9606
vocab.special_eog_ids.clear(); vocab.special_eog_ids.clear();
if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
}
if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
}
if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
}
for (const auto & t : vocab.token_to_id) { for (const auto & t : vocab.token_to_id) {
if (false if (false
|| t.first == "<|eot_id|>" || t.first == "<|eot_id|>"
@ -6639,24 +6732,31 @@ static void llm_load_vocab(
) { ) {
vocab.special_eog_ids.insert(t.second); vocab.special_eog_ids.insert(t.second);
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
__func__, t.first.c_str()); __func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
} }
} else {
// token is control, but not marked as EOG -> print a warning
if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
} }
} }
if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) { // sanity checks
if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eos_id); vocab.special_eog_ids.insert(vocab.special_eos_id);
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
} }
if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) { if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eot_id); vocab.special_eog_ids.insert(vocab.special_eot_id);
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
} }
if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) { if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eom_id); vocab.special_eog_ids.insert(vocab.special_eom_id);
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__); LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
} }
@ -6850,20 +6950,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
// special tokens // special tokens
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); } if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); } if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); } if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); } if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); } if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
for (const auto & id : vocab.special_eog_ids) { for (const auto & id : vocab.special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() ); LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@ -15978,9 +16082,11 @@ struct llm_build_context {
cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_norm", -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
return gf; return gf;
@ -19417,7 +19523,7 @@ struct llama_context * llama_new_context_with_model(
} }
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
} }
@ -21271,6 +21377,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
return llama_token_eos_impl(model->vocab); return llama_token_eos_impl(model->vocab);
} }
llama_token llama_token_eot(const struct llama_model * model) {
return llama_token_eot_impl(model->vocab);
}
llama_token llama_token_cls(const struct llama_model * model) { llama_token llama_token_cls(const struct llama_model * model) {
return llama_token_cls_impl(model->vocab); return llama_token_cls_impl(model->vocab);
} }
@ -21307,8 +21417,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
return llama_token_suffix_impl(model->vocab); return llama_token_suffix_impl(model->vocab);
} }
llama_token llama_token_eot(const struct llama_model * model) { llama_token llama_token_fim_pre(const struct llama_model * model) {
return llama_token_eot_impl(model->vocab); return llama_token_fim_pre_impl(model->vocab);
}
llama_token llama_token_fim_suf(const struct llama_model * model) {
return llama_token_fim_suf_impl(model->vocab);
}
llama_token llama_token_fim_mid(const struct llama_model * model) {
return llama_token_fim_mid_impl(model->vocab);
}
llama_token llama_token_fim_pad(const struct llama_model * model) {
return llama_token_fim_pad_impl(model->vocab);
}
llama_token llama_token_fim_rep(const struct llama_model * model) {
return llama_token_fim_rep_impl(model->vocab);
}
llama_token llama_token_fim_sep(const struct llama_model * model) {
return llama_token_fim_sep_impl(model->vocab);
} }
// //
@ -21336,6 +21466,13 @@ int32_t llama_token_to_piece(
return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special); return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
} }
bool llama_token_is_prefix(
const struct llama_model * model,
llama_token token0,
llama_token token1) {
return llama_token_is_prefix_impl(model->vocab, token0, token1);
}
int32_t llama_detokenize( int32_t llama_detokenize(
const struct llama_model * model, const struct llama_model * model,
const llama_token * tokens, const llama_token * tokens,
@ -21666,6 +21803,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root); return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
} }
struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
return llama_sampler_init_infill_impl(model->vocab);
}
// //
// model split // model split
// //

View file

@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^abc?d*efg+(hij)?kl$" "pattern": "^abc?d*efg+(hij)?kl$"
})""", })""",
R"""( R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
space ::= | " " | "\n" [ \t]{0,20} space ::= | " " | "\n" [ \t]{0,20}
)""" )"""
}); });
@ -709,7 +709,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$" "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
})""", })""",
R"""( R"""(
root ::= "\"" "[]{}()|+*?" "\"" space root ::= "\"" ("[]{}()|+*?") "\"" space
space ::= | " " | "\n" [ \t]{0,20} space ::= | " " | "\n" [ \t]{0,20}
)""" )"""
}); });
@ -722,7 +722,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^\"$" "pattern": "^\"$"
})""", })""",
R"""( R"""(
root ::= "\"" "\"" "\"" space root ::= "\"" ("\"") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
test({
SUCCESS,
"regexp with top-level alternation",
R"""({
"type": "string",
"pattern": "^A|B|C|D$"
})""",
R"""(
root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
space ::= | " " | "\n" [ \t]{0,20} space ::= | " " | "\n" [ \t]{0,20}
)""" )"""
}); });
@ -736,7 +749,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""", })""",
R"""( R"""(
dot ::= [^\x0A\x0D] dot ::= [^\x0A\x0D]
root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
root-1 ::= [0-9] root-1 ::= [0-9]
space ::= | " " | "\n" [ \t]{0,20} space ::= | " " | "\n" [ \t]{0,20}
)""" )"""

View file

@ -111,6 +111,28 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float
} }
} }
static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
DUMP(&cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
}
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size(); const size_t n_vocab = probs.size();
@ -263,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
} }
const int64_t t_end = ggml_time_us(); const int64_t t_end = ggml_time_us();
llama_sampler_free(cnstr); llama_sampler_free(cnstr);
printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
} }
#define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter)) #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
@ -279,12 +301,13 @@ static void test_perf() {
data.emplace_back(llama_token_data{i, logit, 0.0f}); data.emplace_back(llama_token_data{i, logit, 0.0f});
} }
BENCH(llama_sampler_init_top_k (40), data, 32); BENCH(llama_sampler_init_top_k (40), data, 32);
BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32);
BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32);
BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
BENCH(llama_sampler_init_softmax (), data, 32); BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32);
BENCH(llama_sampler_init_softmax (), data, 32);
} }
int main(void) { int main(void) {
@ -309,6 +332,14 @@ int main(void) {
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
printf("XTC should:\n");
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f);
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f);
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f);
printf("XTC should not:\n");
test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);