KV cache quantized to q8_0
parent 111163e246
commit c620f4d677
15 changed files with 1034 additions and 380 deletions
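For orientation, q8_0 stores values in blocks of 32: one scale plus 32 signed bytes, i.e. 34 bytes per 32 values, versus 64 bytes in f16 and 128 bytes in f32. The sketch below is a simplified illustration of that layout and of the scale computation, not the ggml implementation (ggml keeps the scale in half precision and uses SIMD paths); it only shows why a q8_0 KV cache takes roughly half the memory of an f16 one.

#include <math.h>
#include <stdint.h>

#define QK8_0 32  // values per block, matching the assert(QK8_0 == 32) in ggml.c

// Simplified block: the real block_q8_0 stores the scale as fp16 (2 bytes),
// giving 34 bytes per 32 values; a float scale is used here only to keep the sketch short.
typedef struct {
    float  d;           // block scale
    int8_t qs[QK8_0];   // quantized values
} block_q8_0_sketch;

// Quantize one block: d = max|x| / 127, qs[i] = round(x[i] / d).
static void quantize_block_sketch(const float * x, block_q8_0_sketch * y) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        const float ax = fabsf(x[i]);
        if (ax > amax) {
            amax = ax;
        }
    }
    y->d = amax / 127.0f;
    const float id = y->d != 0.0f ? 1.0f/y->d : 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        y->qs[i] = (int8_t) roundf(x[i]*id);
    }
}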
@@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--kv-type" || arg == "-kvt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -652,8 +674,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf(" --no-penalize-nl do not penalize newline token\n");
-    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf(" -kvt, --kv-type the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
     printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
     printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");

@@ -735,7 +756,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.low_vram = params.low_vram;
     lparams.mul_mat_q = params.mul_mat_q;
     lparams.seed = params.seed;
-    lparams.f16_kv = params.memory_f16;
+    lparams.kv_type = params.kv_type;
     lparams.use_mmap = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;

@@ -1201,6 +1222,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "kv_type: %s # default: false\n", ggml_type_name(params.kv_type));
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

     fprintf(stream, "logit_bias:\n");

@@ -1215,7 +1237,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
     fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);

@@ -94,9 +94,10 @@ struct gpt_params {
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

+    ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache
+
     bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
     bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode

@@ -127,7 +127,7 @@ struct cmd_params {
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> kv_type;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;

@@ -144,7 +144,7 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt */ {512},
    /* n_gen */ {128},
    /* n_batch */ {512},
-    /* f32_kv */ {false},
+    /* kv_type */ {GGML_TYPE_Q8_0},
    /* n_threads */ {get_num_physical_cores()},
    /* n_gpu_layers */ {99},
    /* main_gpu */ {0},

@@ -165,7 +165,16 @@ static void print_usage(int /* argc */, char ** argv) {
    printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+
+    std::string kv_type_default;
+    for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) {
+        if (i > 0) {
+            kv_type_default += ",";
+        }
+        kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]);
+    }
+    printf(" -kvt, --kv_type <q8_0|f16|f32> (default: %s)\n", kv_type_default.c_str());
+
    printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());

@@ -177,7 +186,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
-
 }

 static cmd_params parse_cmd_params(int argc, char ** argv) {

@@ -228,13 +236,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-kvt" || arg == "--kv-type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+
+            std::vector<ggml_type> kvt;
+            for (const std::string & type_name : p) {
+                if (type_name == "q8_0") {
+                    kvt.push_back(GGML_TYPE_Q8_0);
+                } else if (type_name == "f16") {
+                    kvt.push_back(GGML_TYPE_F16);
+                } else if (type_name == "f32") {
+                    kvt.push_back(GGML_TYPE_F32);
+                } else {
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]);
+                break;
+            }
+
+            params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;

@@ -332,7 +359,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.kv_type.empty()) { params.kv_type = cmd_params_defaults.kv_type; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }

@@ -348,7 +375,7 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-    bool f32_kv;
+    ggml_type kv_type;
    int n_threads;
    int n_gpu_layers;
    int main_gpu;

@@ -360,7 +387,7 @@ struct cmd_params_instance {
        llama_context_params lparams = llama_context_default_params();
        lparams.n_ctx = n_prompt + n_gen;
        lparams.n_batch = n_batch;
-        lparams.f16_kv = !f32_kv;
+        lparams.kv_type = kv_type;
        lparams.n_gpu_layers = n_gpu_layers;
        lparams.main_gpu = main_gpu;
        lparams.mul_mat_q = mul_mat_q;

@@ -376,7 +403,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p

    for (const auto & m : params.model)
    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & kvt : params.kv_type)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & mg : params.main_gpu)
    for (const auto & mmq : params.mul_mat_q)

@@ -388,7 +415,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_prompt = */ n_prompt,
            /* .n_gen = */ n_gen,
            /* .n_batch = */ nb,
-            /* .f32_kv = */ fk,
+            /* .kv_type = */ kvt,
            /* .n_threads = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu = */ mg,

@@ -439,7 +466,7 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-    bool f32_kv;
+    ggml_type kv_type;
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;

@@ -459,7 +486,7 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        kv_type = inst.kv_type;
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
        mul_mat_q = inst.mul_mat_q;

@@ -523,7 +550,7 @@ struct test {
        "cuda", "opencl", "metal", "gpu_blas", "blas",
        "cpu_info", "gpu_info",
        "model_filename", "model_type", "model_size", "model_n_params",
-        "n_batch", "n_threads", "f16_kv",
+        "n_batch", "n_threads", "kv_type",
        "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
        "n_prompt", "n_gen", "test_time",
        "avg_ns", "stddev_ns",

@@ -543,7 +570,7 @@ struct test {
            return INT;
        }
        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            field == "mul_mat_q" || field == "low_vram") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {

@@ -573,7 +600,7 @@ struct test {
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)),
            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),

@@ -757,8 +784,8 @@ struct markdown_printer : public printer {
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) {
+            fields.push_back("kv_type");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");

@@ -826,6 +853,9 @@ struct markdown_printer : public printer {
            } else if (field == "t/s") {
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
+            } else if (field == "kv_type") {
+                snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type));
+                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
                value = vmap.at(field);
            } else {

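With the llama-bench changes above, several cache types can be swept in one run. The invocation below is only an illustrative example (the model path is a placeholder); it relies on the -m flag llama-bench already has, the -kvt flag added here, and the comma-separated multi-value syntax described in its usage text:

./llama-bench -m models/7B/ggml-model-q4_0.gguf -kvt q8_0,f16,f32
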
@@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models.

 - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.

-### Memory Float 32
+### KV cache type

-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences.

 ### Batch Size

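As a usage illustration for the option documented above (model path and prompt are placeholders): the q8_0 cache is the default and needs no flag, while an f16 cache can be requested explicitly:

./main -m models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:"
./main -m models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -kvt f16
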
@@ -36,7 +36,7 @@
 static llama_context ** g_ctx;
 static llama_model ** g_model;
 static gpt_params * g_params;
-static std::vector<llama_token> * g_input_tokens;
+static std::vector<llama_token> * g_embd_inp;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;

@@ -44,7 +44,7 @@ static bool is_interacting = false;

 static void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
+    const std::vector<llama_token> & embd_inp, const std::string & output,
     const std::vector<llama_token> & output_tokens
 ) {
     if (params.logdir.empty()) {

@@ -71,7 +71,7 @@ static void write_logfile(
     fprintf(logfile, "binary: main\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, embd_inp, model_desc);

     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");

@@ -95,7 +95,7 @@ static void sigint_handler(int signo) {
         console::cleanup();
         printf("\n");
         llama_print_timings(*g_ctx);
-        write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+        write_logfile(*g_ctx, *g_params, *g_model, *g_embd_inp, g_output_ss->str(), *g_output_tokens);
         _exit(130);
     }
 }

@@ -238,7 +238,7 @@ int main(int argc, char ** argv) {
     const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
     LOG("add_bos: %d\n", add_bos);

-    std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_inp; g_embd_inp = &embd_inp;

     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         LOG("tokenize the prompt\n");

@@ -465,7 +465,6 @@ int main(int argc, char ** argv) {
     int n_session_consumed = 0;
     int n_past_guidance = 0;

-    std::vector<int> input_tokens; g_input_tokens = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss; g_output_ss = &output_ss;

@@ -661,9 +660,7 @@ int main(int argc, char ** argv) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
                 printf("%s", token_str.c_str());

-                if (embd.size() > 1) {
-                    input_tokens.push_back(id);
-                } else {
+                if (embd.size() == 1) {
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }

@@ -843,7 +840,7 @@ int main(int argc, char ** argv) {
     }

     llama_print_timings(ctx);
-    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+    write_logfile(ctx, params, model, embd_inp, output_ss.str(), output_tokens);

     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);

@@ -312,7 +312,7 @@ int main(int argc, char ** argv) {

     lparams.n_ctx = 256;
     lparams.seed = 1;
-    lparams.f16_kv = false;
+    lparams.kv_type = GGML_TYPE_F32;
     lparams.use_mlock = false;

     model = llama_load_model_from_file(params.model.c_str(), lparams);

@@ -26,7 +26,7 @@ int main(int argc, char ** argv) {

     lparams.n_ctx = params.n_ctx;
     lparams.seed = params.seed;
-    lparams.f16_kv = params.memory_f16;
+    lparams.kv_type = params.kv_type;
     lparams.use_mmap = params.use_mmap;
     lparams.use_mlock = params.use_mlock;

@@ -13,7 +13,7 @@ Command line options:
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
 - `--numa`: Attempt optimizations that help on some NUMA systems.

@@ -704,8 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf(" -kvt, --kv-type the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n");
     if (llama_mlock_supported())
     {
         printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");

@@ -838,9 +837,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.rope_freq_scale = std::stof(argv[i]);
        }
+        else if (arg == "--kv-type" || arg == "-kvt")
+        {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+
+            std::string type_name(argv[i]);
+            for (char & c : type_name) {
+                c = std::tolower(c);
+            }
+
+            if (type_name == "q8_0") {
+                params.kv_type = GGML_TYPE_Q8_0;
+            } else if (type_name == "f16") {
+                params.kv_type = GGML_TYPE_F16;
+            } else if (type_name == "f32") {
+                params.kv_type = GGML_TYPE_F32;
+            } else {
+                fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+        }
        else if (arg == "--memory-f32" || arg == "--memory_f32")
        {
-            params.memory_f16 = false;
+            params.kv_type = GGML_TYPE_F32;
        }
        else if (arg == "--threads" || arg == "-t")
        {

ggml-cuda.cu: 896 changed lines (file diff suppressed because it is too large)

ggml.c: 163 changed lines

@@ -1117,11 +1117,19 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r

 static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
     assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;

     block_q8_0 * restrict y = vy;

+    if (k % QK8_0 != 0) {
+        float x_end[QK8_0] = {0};
+        memcpy(x_end, x + nb*QK8_0, sizeof(float) * (k % QK8_0));
+
+        block_q8_0 * y_end = y + nb;
+
+        quantize_row_q8_0(x_end, y_end, QK8_0);
+    }
+
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
         float32x4_t srcv [8];

@@ -4384,8 +4392,13 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {

 static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

-    return (t0->ne[0] == t1->ne[0]) &&
-           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
+    const int64_t blck_size = ggml_blck_size(t0->type);
+
+    const int64_t nblcks00_padded = (t0->ne[0] + blck_size - 1) / blck_size;
+    const int64_t nblcks10_padded = (t1->ne[0] + blck_size - 1) / blck_size;
+
+    return (nblcks00_padded == nblcks10_padded) && // ensure same number of blocks after padding
+           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
            (t1->ne[3]%t0->ne[3] == 0);
 }

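To make the relaxed shape check above concrete, here is a small standalone illustration with assumed values (not taken from the commit): with a row length of 80 and a q8_0 block size of 32, a quantized row is stored padded to 96 values, so the old ne[0] comparison would fail while the padded block counts still match.

#include <stdbool.h>
#include <stdio.h>

// ceil(ne0 / blck_size), the quantity the new ggml_can_mul_mat compares
static long n_blocks_padded(long ne0, long blck_size) {
    return (ne0 + blck_size - 1) / blck_size;
}

int main(void) {
    const long blck      = 32;  // q8_0 block size
    const long ne0_f32   = 80;  // hypothetical activation row length
    const long ne0_q8_0  = 96;  // same row stored in q8_0, padded to a multiple of 32

    const bool old_check = ne0_f32 == ne0_q8_0;                                               // 0 (would reject)
    const bool new_check = n_blocks_padded(ne0_f32, blck) == n_blocks_padded(ne0_q8_0, blck); // 1 (3 == 3)

    printf("old check: %d, new check: %d\n", old_check, new_check);
    return 0;
}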
@@ -6333,8 +6346,15 @@ static struct ggml_tensor * ggml_cpy_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
+        const bool inplace,
+        const bool pad) {
+    if (pad) {
+        const int64_t blck_size = ggml_blck_size(b->type);
+        const int64_t ne00_padded = ((a->ne[0] + blck_size - 1) / blck_size) * blck_size;
+        GGML_ASSERT(ne00_padded*ggml_nrows(a) == ggml_nelements(b));
+    } else {
+        GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
+    }

     bool is_node = false;

@@ -6350,6 +6370,8 @@ static struct ggml_tensor * ggml_cpy_impl(
         ggml_format_name(result, "%s (copy)", a->name);
     }

+    ggml_set_op_params_i32(result, 0, pad ? 1 : 0);
+
     result->op = GGML_OP_CPY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;

@@ -6362,14 +6384,21 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
+    return ggml_cpy_impl(ctx, a, b, false, false);
 }

 struct ggml_tensor * ggml_cpy_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
+    return ggml_cpy_impl(ctx, a, b, true, false);
+}
+
+struct ggml_tensor * ggml_cpy_pad(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_cpy_impl(ctx, a, b, false, true);
 }

 // ggml_cont

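A hedged usage sketch for the ggml_cpy_pad added above (not taken from the commit): the destination must already be allocated with its row length rounded up to the destination block size, which is exactly what the assertion in ggml_cpy_impl checks. Sizes here are illustrative, and the call only builds the graph node; it still has to be evaluated as part of a ggml graph.

#include "ggml.h"

// copy an 80-wide f32 matrix into a q8_0 tensor whose rows are padded to 96 values (3 blocks of 32)
struct ggml_tensor * copy_padded_example(struct ggml_context * ctx, struct ggml_tensor * src_f32_80x4) {
    struct ggml_tensor * dst = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, 96, 4);
    return ggml_cpy_pad(ctx, src_f32_80x4, dst);  // returns a view of dst, per the ggml.h comment
}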
@@ -6544,7 +6573,8 @@ static struct ggml_tensor * ggml_view_impl(
         struct ggml_tensor * a,
         int n_dims,
         const int64_t * ne,
-        size_t offset) {
+        size_t offset,
+        size_t i_blck) {

     bool is_node = false;

@@ -6555,7 +6585,8 @@ static struct ggml_tensor * ggml_view_impl(
     struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
     ggml_format_name(result, "%s (view)", a->name);

-    ggml_set_op_params(result, &offset, sizeof(offset));
+    size_t params[2] = {offset, i_blck};
+    ggml_set_op_params(result, &params, sizeof(params));

     result->op = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

@@ -6572,7 +6603,7 @@ struct ggml_tensor * ggml_view_1d(
         int64_t ne0,
         size_t offset) {

-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset, 0);

     return result;
 }

@@ -6589,7 +6620,7 @@ struct ggml_tensor * ggml_view_2d(

     const int64_t ne[2] = { ne0, ne1 };

-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset, 0);

     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;

@@ -6612,7 +6643,7 @@ struct ggml_tensor * ggml_view_3d(

     const int64_t ne[3] = { ne0, ne1, ne2 };

-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset, 0);

     result->nb[1] = nb1;
     result->nb[2] = nb2;

@@ -6637,7 +6668,7 @@ struct ggml_tensor * ggml_view_4d(

     const int64_t ne[4] = { ne0, ne1, ne2, ne3 };

-    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset, 0);

     result->nb[1] = nb1;
     result->nb[2] = nb2;

@@ -6646,6 +6677,42 @@ struct ggml_tensor * ggml_view_4d(
     return result;
 }

+// ggml_view_blck_1d
+
+struct ggml_tensor * ggml_view_blck_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        size_t offset,
+        size_t i_blck) {
+
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset, i_blck);
+
+    return result;
+}
+
+// ggml_view_blck_2d
+
+struct ggml_tensor * ggml_view_blck_2d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int64_t ne0,
+        int64_t ne1,
+        size_t nb1,
+        size_t offset,
+        size_t i_blck) {
+
+    const int64_t ne[2] = { ne0, ne1 };
+
+    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset, i_blck);
+
+    result->nb[1] = nb1;
+    result->nb[2] = result->nb[1]*ne1;
+    result->nb[3] = result->nb[2];
+
+    return result;
+}
+
 // ggml_permute

 struct ggml_tensor * ggml_permute(

@@ -8216,6 +8283,8 @@ static void ggml_compute_forward_dup_f16(

     GGML_TENSOR_UNARY_OP_LOCALS;

+    GGML_ASSERT(dst->op_params[0] == 0);
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads

@@ -8479,14 +8548,21 @@ static void ggml_compute_forward_dup_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

     GGML_TENSOR_UNARY_OP_LOCALS;

+    const bool pad = dst->op_params[0] & 1;
+    const int blck_size = ggml_blck_size(dst->type);
+    const int ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size;
+    if (pad) {
+        GGML_ASSERT(ggml_nelements(dst) == ne00_padded*ggml_nrows(src0));
+    } else {
+        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+    }
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads

@@ -8544,15 +8620,20 @@ static void ggml_compute_forward_dup_f32(
                 ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;

                 size_t id = 0;
-                size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+                const size_t rs = nb0 * ne00_padded / blck_size;
                 char * dst_ptr = (char *) dst->data;
+                float src0_padded[ne00_padded];

                 for (int i03 = 0; i03 < ne03; i03++) {
                     for (int i02 = 0; i02 < ne02; i02++) {
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            if (ne00 != ne00_padded) {
+                                memcpy(src0_padded, src0_ptr, ne00*sizeof(float));
+                                memset(src0_padded + ne00, 0, (ne00_padded - ne00) * sizeof(float));
+                            }
+                            quantize_row_q(ne00 == ne00_padded ? src0_ptr : src0_padded, dst_ptr + id, ne00_padded);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);

@@ -8719,6 +8800,48 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         }
+    } else if (type_traits[dst->type].from_float) {
+        GGML_ASSERT(!pad);
+        GGML_ASSERT(ne00 == ne0);
+        GGML_ASSERT(ne01 == ne1);
+        GGML_ASSERT(ne02 == ne2);
+        GGML_ASSERT(ne03 == ne3);
+
+        size_t blck_index_0 = 0;
+        if (dst->src[1]->op == GGML_OP_VIEW) {
+            const size_t * op_params = (const size_t *) dst->src[1]->op_params;
+            blck_index_0 = op_params[1];
+        }
+
+        ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
+
+        for (int i03 = 0; i03 < ne03; i03++) {
+            for (int i02 = 0; i02 < ne02; i02++) {
+                for (int i01 = ir0; i01 < ir1; i01++) {
+                    const char * src0_row_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                    char * dst_row_ptr = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
+                    size_t blck_index = blck_index_0;
+
+                    for (int i00 = 0; i00 < ne00; ++i00) {
+                        char * dst_ptr = dst_row_ptr
+                            + ggml_element_size(dst) * ((i00 + blck_index_0) / ggml_blck_size(dst->type));
+                        float * dst_tmp_ptr = (float *) (dst_ptr + ggml_element_size(dst));
+
+                        if (blck_index == 0) {
+                            memset(dst_tmp_ptr, 0, ggml_blck_size(dst->type)*sizeof(float));
+                        }
+
+                        dst_tmp_ptr[blck_index] = *((const float *) (src0_row_ptr + i00*nb00));
+
+                        blck_index = (blck_index + 1) % ggml_blck_size(dst->type);
+
+                        if (blck_index == 0 || i00 == (ne00 - 1)) {
+                            quantize_row_q(dst_tmp_ptr, dst_ptr, ggml_blck_size(dst->type));
+                        }
+                    }
+                }
+            }
+        }
     } else {
         GGML_ASSERT(false); // TODO: implement
     }

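The block-wise copy path added above is what lets a single new token be appended to an already-quantized cache row. Below is a conceptual sketch of the layout it appears to rely on; the constants and helper names are assumptions for illustration, the in-block start index is presumably n_past modulo the block size, and the float staging area sits immediately behind the current block (mirroring dst_tmp_ptr above), which is why each quantized V row reserves extra space.

#include <stddef.h>
#include <stdint.h>

#define QK8_0            32
#define Q8_0_BLOCK_BYTES 34   // 2-byte scale + 32 int8 values

// byte address of the q8_0 block that holds token position `pos` within one V-cache row
static uint8_t * v_block_ptr(uint8_t * v_row, int pos) {
    return v_row + (size_t)(pos / QK8_0) * Q8_0_BLOCK_BYTES;
}

// in-block index of that token (the i_blck handed to the block view / dup path)
static int v_block_index(int pos) {
    return pos % QK8_0;
}

// float staging area for the partially filled block: kept right after the block itself,
// so the unquantized values survive between evals and the block can be rebuilt without
// re-reading lossy int8 data
static float * v_scratch_ptr(uint8_t * v_row, int pos) {
    return (float *)(v_block_ptr(v_row, pos) + Q8_0_BLOCK_BYTES);
}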
@@ -11333,7 +11456,8 @@ static void ggml_compute_forward_mul_mat(
     if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
-            const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+            const size_t row_size = ggml_type_size(vec_dot_type)*((ne10 + ggml_blck_size(vec_dot_type) - 1)
+                                                                  / ggml_blck_size(vec_dot_type));

             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {

@@ -11353,7 +11477,8 @@ static void ggml_compute_forward_mul_mat(
     }

     const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
+    const size_t row_size = ggml_type_size(vec_dot_type)*((ne10 + ggml_blck_size(vec_dot_type) - 1)
+                                                          / ggml_blck_size(vec_dot_type));

     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows

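Worked numbers for the row_size change above (the row length is assumed for illustration): for a row of ne10 = 80 floats quantized to a q8_0 vec_dot_type (34-byte blocks of 32 values), the old expression truncates, while the new one rounds the block count up first, which matches what the padded rows actually occupy.

#include <stdio.h>

int main(void) {
    const size_t ne10      = 80;  // hypothetical row length, not a multiple of 32
    const size_t type_size = 34;  // bytes per q8_0 block
    const size_t blck_size = 32;  // values per q8_0 block

    const size_t row_size_old = ne10*type_size/blck_size;                     // 85 bytes (truncated)
    const size_t row_size_new = type_size*((ne10 + blck_size - 1)/blck_size); // 3 blocks -> 102 bytes

    printf("old: %zu, new: %zu\n", row_size_old, row_size_new);
    return 0;
}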
ggml.h: 22 changed lines

@@ -1062,6 +1062,12 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);

+    // a -> b, pad row size of a to a multiple of block size of b, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_pad(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // make contiguous
     GGML_API struct ggml_tensor * ggml_cont(
             struct ggml_context * ctx,

@@ -1146,6 +1152,22 @@ extern "C" {
             size_t nb3,
             size_t offset);

+    GGML_API struct ggml_tensor * ggml_view_blck_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            size_t offset,
+            size_t i_blck);
+
+    GGML_API struct ggml_tensor * ggml_view_blck_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            size_t nb1, // row stride in bytes
+            size_t offset,
+            size_t i_blck);
+
     GGML_API struct ggml_tensor * ggml_permute(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

llama.cpp: 166 changed lines

@@ -963,12 +963,33 @@ struct llama_hparams {
         return n_embd/n_gqa();
     }

-    size_t kv_size() const {
-        size_t result = 2ull;
-        result *= (size_t) n_embd_gqa();
+    size_t kv_size(ggml_type type) const {
+        return kv_size_k(type) + kv_size_v(type);
+    }
+
+    size_t kv_size_k(ggml_type type) const {
+        const int64_t blck_size = ggml_blck_size(type);
+        const int64_t n_embd_head_padded = ((n_embd_head() + blck_size - 1) / blck_size) * blck_size;
+
+        size_t result = 1ull;
+        result *= (size_t) n_embd_head_padded;
+        result *= (size_t) n_head_kv;
         result *= (size_t) n_ctx;
         result *= (size_t) n_layer;
-        result *= sizeof(ggml_fp16_t);
+        result *= ggml_type_size(type);
+        result /= blck_size;
+        return result;
+    }
+
+    size_t kv_size_v(ggml_type type) const {
+        const size_t row_padding = type == GGML_TYPE_Q8_0 ? 128 : 0;
+
+        size_t result = 1ull;
+        result *= (size_t) n_embd_gqa();
+        result *= (size_t) n_ctx + row_padding;
+        result *= (size_t) n_layer;
+        result *= ggml_type_size(type);
+        result /= ggml_blck_size(type);
         return result;
     }
 };

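To get a feel for kv_size_k/kv_size_v above, here is a small standalone calculation with assumed 7B-style parameters (n_embd_head = 128, n_head_kv = 32, n_layer = 32, n_ctx = 4096 — illustrative values, not taken from the commit): q8_0 spends 34 bytes per 32 values, so the K cache drops from 1024 MiB in f16 to 544 MiB, and the V cache behaves the same apart from the 128 elements of per-row scratch it reserves.

#include <stdio.h>

int main(void) {
    // hypothetical model/runtime configuration
    const size_t n_embd_head = 128, n_head_kv = 32, n_layer = 32, n_ctx = 4096;

    const size_t elements = n_embd_head * n_head_kv * n_ctx * n_layer; // per K (or V) cache

    const size_t k_f16  = elements * 2;        // 2 bytes per value
    const size_t k_q8_0 = elements / 32 * 34;  // 34 bytes per block of 32 (n_embd_head is already a multiple of 32)

    printf("K cache f16 : %zu MiB\n", k_f16  / (1024*1024));  // 1024 MiB
    printf("K cache q8_0: %zu MiB\n", k_q8_0 / (1024*1024));  // 544 MiB
    return 0;
}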
@ -1165,6 +1186,9 @@ struct llama_context {
|
||||||
// key + value cache for the self attention
|
// key + value cache for the self attention
|
||||||
struct llama_kv_cache kv_self;
|
struct llama_kv_cache kv_self;
|
||||||
|
|
||||||
|
std::vector<llama_token> token_history;
|
||||||
|
int64_t previous_v_blck;
|
||||||
|
|
||||||
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
||||||
std::vector<float> logits;
|
std::vector<float> logits;
|
||||||
bool logits_all = false;
|
bool logits_all = false;
|
||||||
|
@ -1200,13 +1224,25 @@ static bool llama_kv_cache_init(
|
||||||
ggml_type wtype,
|
ggml_type wtype,
|
||||||
int n_ctx,
|
int n_ctx,
|
||||||
int n_gpu_layers) {
|
int n_gpu_layers) {
|
||||||
const int n_embd = hparams.n_embd_gqa();
|
const int blck_size = ggml_blck_size(wtype);
|
||||||
const int n_layer = hparams.n_layer;
|
const int n_embd_head = hparams.n_embd_head();
|
||||||
|
const int n_embd_head_padded = ((n_embd_head + blck_size - 1) / blck_size) * blck_size;
|
||||||
|
const int n_head_kv = hparams.n_head_kv;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
|
||||||
const int64_t n_mem = n_layer*n_ctx;
|
if (n_ctx % ggml_blck_size(wtype) != 0) {
|
||||||
const int64_t n_elements = n_embd*n_mem;
|
LLAMA_LOG_ERROR("error: for KV type %s n_ctx must be a multiple of %d but received n_ctx=%d\n",
|
||||||
|
ggml_type_name(wtype), ggml_blck_size(wtype), n_ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
// if the KV cache is quantized we need a little extra space for each row to store the
|
||||||
|
// unquantized values between evals (this avoids precision loss when rebuilding the block)
|
||||||
|
const int64_t n_mem = n_layer*n_ctx;
|
||||||
|
const int64_t n_elements_k = n_embd_head_padded * n_head_kv * n_mem;
|
||||||
|
const int64_t n_elements_v = n_embd_head * n_head_kv * (n_mem + (wtype == GGML_TYPE_Q8_0 ? 128*n_layer : 0));
|
||||||
|
|
||||||
|
cache.buf.resize((n_elements_k + n_elements_v)*ggml_type_size(wtype)/ggml_blck_size(wtype) + 2u*MB);
|
||||||
cache.n = 0;
|
cache.n = 0;
|
||||||
|
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params;
|
||||||
|
@ -1221,8 +1257,8 @@ static bool llama_kv_cache_init(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements_k);
|
||||||
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements_v);
|
||||||
ggml_set_name(cache.k, "cache_k");
|
ggml_set_name(cache.k, "cache_k");
|
||||||
ggml_set_name(cache.v, "cache_v");
|
ggml_set_name(cache.v, "cache_v");
|
||||||
|
|
||||||
|
@ -2305,15 +2341,13 @@ static void llm_load_tensors(
|
||||||
|
|
||||||
// print memory requirements
|
// print memory requirements
|
||||||
{
|
{
|
||||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
|
||||||
|
|
||||||
// this is the total memory required to run the inference
|
// this is the total memory required to run the inference
|
||||||
size_t mem_required =
|
size_t mem_required =
|
||||||
ctx_size +
|
ctx_size +
|
||||||
mmapped_size - vram_weights; // weights in VRAM not in memory
|
mmapped_size - vram_weights; // weights in VRAM not in memory
|
||||||
|
|
||||||
// this is the memory required by one llama_state
|
// this is the memory required by one llama_state
|
||||||
const size_t mem_required_state = scale*hparams.kv_size();
|
const size_t mem_required_state = hparams.kv_size(memory_type);
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||||
|
@@ -2337,7 +2371,7 @@ static void llm_load_tensors(
                 LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
                 LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
+                vram_kv_cache += hparams.kv_size_v(memory_type);
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
@@ -2345,7 +2379,7 @@ static void llm_load_tensors(
                 LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
                 LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += hparams.kv_size() / 2;
+                vram_kv_cache += hparams.kv_size_k(memory_type);
             }
         }
 #elif defined(GGML_USE_CLBLAST)
@@ -2454,13 +2488,17 @@ static struct ggml_cgraph * llm_build_llama(
 
     GGML_ASSERT(!!kv_self.ctx);
 
-    const int64_t n_embd      = hparams.n_embd;
-    const int64_t n_layer     = hparams.n_layer;
-    const int64_t n_ctx       = hparams.n_ctx;
-    const int64_t n_head      = hparams.n_head;
-    const int64_t n_head_kv   = hparams.n_head_kv;
-    const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+    const int64_t blck_size_k = ggml_blck_size(kv_self.k->type);
+    const int64_t blck_size_v = ggml_blck_size(kv_self.v->type);
+
+    const int64_t n_embd             = hparams.n_embd;
+    const int64_t n_layer            = hparams.n_layer;
+    const int64_t n_ctx              = hparams.n_ctx;
+    const int64_t n_head             = hparams.n_head;
+    const int64_t n_head_kv          = hparams.n_head_kv;
+    const int64_t n_embd_head        = hparams.n_embd_head();
+    const int64_t n_embd_head_padded = ((n_embd_head + blck_size_k - 1) / blck_size_k) * blck_size_k;
+    const int64_t n_embd_gqa         = hparams.n_embd_gqa();
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -2597,19 +2635,23 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(
+                    ctx0, kv_self.k, N*n_embd_head_padded*n_head_kv,
+                    (ggml_element_size(kv_self.k)*n_embd_head_padded*n_head_kv)*(il*n_ctx + n_past)/blck_size_k);
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
-                        (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                const int64_t v_row_size = kv_self.v->type == GGML_TYPE_Q8_0 ? n_ctx + 128 : n_ctx;
+                struct ggml_tensor * v = ggml_view_blck_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   v_row_size)*ggml_element_size(kv_self.v)/blck_size_v,
+                        (il*v_row_size)*ggml_element_size(kv_self.v)*n_embd_gqa/blck_size_v + ggml_element_size(kv_self.v)*(n_past/blck_size_v),
+                        n_past % blck_size_v);
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
                 // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy_pad(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
             struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
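The view above is where the quantized layout shows up: each V row holds v_row_size slots (n_ctx plus 128 spare slots when the type is q8_0, matching the allocation earlier), and a write at position n_past is addressed as a block-aligned byte offset plus an intra-block index, which is what the extra argument to ggml_view_blck_2d (introduced elsewhere in this commit) appears to carry. A small sketch of that arithmetic, assuming the usual 32-value/34-byte q8_0 block:

// Sketch of the quantized V-row offset math used above (assumed q8_0 layout: 32 values / 34 bytes).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t blck_size_v = 32;          // values per q8_0 block
    const int64_t elem_size   = 34;          // bytes per block, i.e. the ggml "element" size for q8_0
    const int64_t n_ctx       = 4096;
    const int64_t v_row_size  = n_ctx + 128; // row length incl. spare slots for the unquantized tail

    const int64_t n_past = 100;              // example: cache already holds 100 positions

    const int64_t row_bytes   = v_row_size*elem_size/blck_size_v;  // stride between V rows
    const int64_t blck_offset = elem_size*(n_past/blck_size_v);    // start of the block containing n_past
    const int64_t intra_blck  = n_past % blck_size_v;              // position inside that block

    printf("row stride %lld B, block offset %lld B, intra-block index %lld\n",
        (long long) row_bytes, (long long) blck_offset, (long long) intra_blck);
    return 0;
}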
@@ -2618,10 +2660,10 @@ static struct ggml_cgraph * llm_build_llama(
 
             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_past + N, n_head_kv,
-                        ggml_element_size(kv_self.k)*n_embd_gqa,
-                        ggml_element_size(kv_self.k)*n_embd_head,
-                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+                        n_embd_head_padded, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_head_padded*n_head_kv/blck_size_k,
+                        ggml_element_size(kv_self.k)*n_embd_head_padded/blck_size_k,
+                        ggml_element_size(kv_self.k)*n_embd_head_padded*n_head_kv*n_ctx*il/blck_size_k);
             offload_func_kq(K);
             ggml_set_name(K, "K");
 
@@ -2646,13 +2688,16 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
+
             // split cached V into n_head heads
+            const int64_t v_ne0_padded = ((n_past + N + blck_size_v - 1) / blck_size_v) * blck_size_v; // ne0 padded to multiple of blck_size_v
+            const int64_t v_row_size   = kv_self.v->type == GGML_TYPE_Q8_0 ? n_ctx + 128 : n_ctx;      // maximum ne0 + space for temporarily storing unquantized values
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
-                        n_past + N, n_embd_head, n_head_kv,
-                        ggml_element_size(kv_self.v)*n_ctx,
-                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
-                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+                        v_ne0_padded, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*v_row_size/blck_size_v,
+                        ggml_element_size(kv_self.v)*v_row_size*n_embd_head/blck_size_v,
+                        ggml_element_size(kv_self.v)*v_row_size*n_embd_gqa*il/blck_size_v);
             offload_func_v(V);
             ggml_set_name(V, "V");
 
@@ -3744,9 +3789,29 @@ static bool llama_eval_internal(
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
+    std::vector<llama_token> tokens_v_redo;
+    const int64_t v_blck_size    = ggml_blck_size(kv_self.v->type);
+    const int64_t current_v_blck = n_past / v_blck_size;
+
+    // if the v component of the KV cache is q8_0 the unquantized temporary values may have already been overwritten
+    // in that case we need to roll back to the beginning of a q8_0 block
+    const int64_t n_v_redo = lctx.previous_v_blck > current_v_blck ? n_past % v_blck_size : 0;
+    if (n_v_redo > 0) {
+        tokens_v_redo.insert(tokens_v_redo.end(),
+            lctx.token_history.begin() + n_past - n_v_redo,
+            lctx.token_history.begin() + n_past);
+        for (int64_t i = 0; i < n_tokens; ++i) {
+            tokens_v_redo.push_back(tokens[i]);
+        }
+
+        n_tokens = tokens_v_redo.size();
+        n_past  -= n_v_redo;
+    }
+    const llama_token * tokens_eff = n_v_redo > 0 ? tokens_v_redo.data() : tokens;
+
     ggml_allocr_reset(lctx.alloc);
 
-    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens_eff, embd, n_tokens, n_past);
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
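The replay logic above exists because a q8_0 block is quantized once it fills up: if an earlier eval already advanced past the block that n_past falls into, the unquantized tail of that block is no longer available, so the last n_past % block-size tokens are re-evaluated from token_history. A toy illustration of the rule, with the block size assumed to be 32:

// Toy illustration of the q8_0 rollback rule (not from the patch); a block size of 32 is assumed.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t v_blck_size     = 32;
    const int64_t n_past          = 45;  // resuming generation mid-block
    const int64_t previous_v_blck = 2;   // an earlier eval already filled the cache past position 64

    const int64_t current_v_blck = n_past / v_blck_size;  // block 1
    const int64_t n_v_redo = previous_v_blck > current_v_blck ? n_past % v_blck_size : 0;

    printf("replay %lld tokens, restart eval at position %lld\n",
        (long long) n_v_redo, (long long) (n_past - n_v_redo));  // replay 13, restart at 32
    return 0;
}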
@@ -3773,7 +3838,7 @@ static bool llama_eval_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
 
@@ -3827,11 +3892,11 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + n_vocab*n_v_redo, sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + n_vocab*(n_v_redo+N-1), sizeof(float)*n_vocab);
         }
     }
 
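Since the replayed tokens also produce logits, the copies above skip the first n_vocab*n_v_redo values so the caller still receives logits only for the tokens it passed in. A minimal sketch of the last-token case, using hypothetical names:

// Sketch of the last-token logits copy above: skip the n_v_redo replayed rows.
#include <cstring>
#include <vector>

static void copy_last_logits(std::vector<float> & out, const float * res,
                             int n_vocab, int N, int n_v_redo) {
    out.resize(n_vocab);
    std::memcpy(out.data(), res + (size_t) n_vocab*(n_v_redo + N - 1), sizeof(float)*n_vocab);
}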
@@ -3843,6 +3908,12 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
+    // update token history and how far the v component of the KV cache was filled (for q8_0 rollback)
+    for (int64_t i = 0; i < n_tokens; ++i) {
+        lctx.token_history[n_past + i] = tokens_eff[i];
+    }
+    lctx.previous_v_blck = (n_past + n_tokens) / v_blck_size;
+
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -6192,9 +6263,9 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_freq_scale             =*/ 1.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_type                     =*/ GGML_TYPE_Q8_0,
         /*.low_vram                    =*/ false,
         /*.mul_mat_q                   =*/ true,
-        /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
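On the caller side, kv_type replaces the old f16_kv flag and defaults to q8_0 as shown above. A hypothetical usage sketch against the patched API (the snippet itself is not part of the commit; the surrounding calls are the standard llama.cpp API of this period):

// Hypothetical caller: create a context with an f16 KV cache instead of the new q8_0 default.
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    llama_backend_init(false);

    llama_context_params cparams = llama_context_default_params();
    // after this commit cparams.kv_type starts out as GGML_TYPE_Q8_0
    cparams.kv_type = GGML_TYPE_F16; // opt back into the previous behaviour

    llama_model * model = llama_load_model_from_file(argv[1], cparams);
    if (model == NULL) {
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run llama_eval() as usual; only the KV cache storage type changes ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}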
@@ -6269,8 +6340,6 @@ struct llama_model * llama_load_model_from_file(
 
     llama_model * model = new llama_model;
 
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -6289,7 +6358,7 @@ struct llama_model * llama_load_model_from_file(
 
     if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
-                params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
+                params.low_vram, params.kv_type, params.use_mmap, params.use_mlock, params.vocab_only,
                 params.progress_callback, params.progress_callback_user_data)) {
         LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
@@ -6320,11 +6389,9 @@ struct llama_context * llama_new_context_with_model(
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
     // reserve memory for context buffers
     if (!params.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, params.kv_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
@@ -6337,6 +6404,9 @@ struct llama_context * llama_new_context_with_model(
 
         const auto & hparams = ctx->model.hparams;
 
+        ctx->token_history.resize(hparams.n_ctx);
+        ctx->previous_v_blck = 0;
+
         // resized during inference
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
llama.h (3 changed lines)
@@ -140,10 +140,11 @@ extern "C" {
         // context pointer passed to the progress callback
         void * progress_callback_user_data;
 
+        enum ggml_type kv_type; // the type to use for the KV cache
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
@@ -11,9 +11,9 @@ CLI_ARGS_MAIN_PERPLEXITY = [
     "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
-    "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
-    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
-    "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
+    "interactive", "interactive-first", "keep", "kv_type", "logdir", "logit-bias", "lora",
+    "lora-base", "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
+    "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
     "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
     "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
     "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",