diff --git a/common/arg.cpp b/common/arg.cpp index b27567f3b..6312c9785 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1174,7 +1174,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + string_format("KV cache data type for K : f16, f32, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 (default: %s)", params.cache_type_k.c_str()), [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; @@ -1182,7 +1182,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + string_format("KV cache data type for V : : f16, f32, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 (default: %s)", params.cache_type_v.c_str()), [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; diff --git a/examples/server/README.md b/examples/server/README.md index 5efd9b2cc..686be7baf 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16, f32, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1)
(env: LLAMA_ARG_CACHE_TYPE_K) | -| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16, f32, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) |