Merge branch 'master' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # Makefile # README.md # build.zig
This commit is contained in:
commit
a6c3dbc351
34 changed files with 4181 additions and 2456 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -45,6 +45,7 @@ models-mnt
|
||||||
/server
|
/server
|
||||||
/simple
|
/simple
|
||||||
/batched
|
/batched
|
||||||
|
/batched-bench
|
||||||
/export-lora
|
/export-lora
|
||||||
/finetune
|
/finetune
|
||||||
/speculative
|
/speculative
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
// swift-tools-version:5.3
|
// swift-tools-version:5.5
|
||||||
|
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
#if arch(arm) || arch(arm64)
|
#if arch(arm) || arch(arm64)
|
||||||
let platforms: [SupportedPlatform]? = [
|
let platforms: [SupportedPlatform]? = [
|
||||||
.macOS(.v11),
|
.macOS(.v12),
|
||||||
.iOS(.v14),
|
.iOS(.v14),
|
||||||
.watchOS(.v4),
|
.watchOS(.v4),
|
||||||
.tvOS(.v14)
|
.tvOS(.v14)
|
||||||
|
@ -41,12 +41,13 @@ let package = Package(
|
||||||
"ggml.c",
|
"ggml.c",
|
||||||
"llama.cpp",
|
"llama.cpp",
|
||||||
"ggml-alloc.c",
|
"ggml-alloc.c",
|
||||||
|
"ggml-backend.c",
|
||||||
"k_quants.c",
|
"k_quants.c",
|
||||||
] + additionalSources,
|
] + additionalSources,
|
||||||
resources: resources,
|
resources: resources,
|
||||||
publicHeadersPath: "spm-headers",
|
publicHeadersPath: "spm-headers",
|
||||||
cSettings: [
|
cSettings: [
|
||||||
.unsafeFlags(["-Wno-shorten-64-to-32"]),
|
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||||
.define("GGML_USE_K_QUANTS"),
|
.define("GGML_USE_K_QUANTS"),
|
||||||
.define("GGML_USE_ACCELERATE")
|
.define("GGML_USE_ACCELERATE")
|
||||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||||
|
|
|
@ -5,6 +5,8 @@ set(TARGET common)
|
||||||
add_library(${TARGET} OBJECT
|
add_library(${TARGET} OBJECT
|
||||||
common.h
|
common.h
|
||||||
common.cpp
|
common.cpp
|
||||||
|
sampling.h
|
||||||
|
sampling.cpp
|
||||||
console.h
|
console.h
|
||||||
console.cpp
|
console.cpp
|
||||||
grammar-parser.h
|
grammar-parser.h
|
||||||
|
|
|
@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
std::string arg;
|
std::string arg;
|
||||||
gpt_params default_params;
|
gpt_params default_params;
|
||||||
const std::string arg_prefix = "--";
|
const std::string arg_prefix = "--";
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
arg = argv[i];
|
arg = argv[i];
|
||||||
|
@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.top_k = std::stoi(argv[i]);
|
sparams.top_k = std::stoi(argv[i]);
|
||||||
} else if (arg == "-c" || arg == "--ctx-size") {
|
} else if (arg == "-c" || arg == "--ctx-size") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.top_p = std::stof(argv[i]);
|
sparams.top_p = std::stof(argv[i]);
|
||||||
} else if (arg == "--temp") {
|
} else if (arg == "--temp") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.temp = std::stof(argv[i]);
|
sparams.temp = std::stof(argv[i]);
|
||||||
} else if (arg == "--tfs") {
|
} else if (arg == "--tfs") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.tfs_z = std::stof(argv[i]);
|
sparams.tfs_z = std::stof(argv[i]);
|
||||||
} else if (arg == "--typical") {
|
} else if (arg == "--typical") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.typical_p = std::stof(argv[i]);
|
sparams.typical_p = std::stof(argv[i]);
|
||||||
} else if (arg == "--repeat-last-n") {
|
} else if (arg == "--repeat-last-n") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.repeat_last_n = std::stoi(argv[i]);
|
sparams.repeat_last_n = std::stoi(argv[i]);
|
||||||
} else if (arg == "--repeat-penalty") {
|
} else if (arg == "--repeat-penalty") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.repeat_penalty = std::stof(argv[i]);
|
sparams.repeat_penalty = std::stof(argv[i]);
|
||||||
} else if (arg == "--frequency-penalty") {
|
} else if (arg == "--frequency-penalty") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.frequency_penalty = std::stof(argv[i]);
|
sparams.frequency_penalty = std::stof(argv[i]);
|
||||||
} else if (arg == "--presence-penalty") {
|
} else if (arg == "--presence-penalty") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.presence_penalty = std::stof(argv[i]);
|
sparams.presence_penalty = std::stof(argv[i]);
|
||||||
} else if (arg == "--mirostat") {
|
} else if (arg == "--mirostat") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.mirostat = std::stoi(argv[i]);
|
sparams.mirostat = std::stoi(argv[i]);
|
||||||
} else if (arg == "--mirostat-lr") {
|
} else if (arg == "--mirostat-lr") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.mirostat_eta = std::stof(argv[i]);
|
sparams.mirostat_eta = std::stof(argv[i]);
|
||||||
} else if (arg == "--mirostat-ent") {
|
} else if (arg == "--mirostat-ent") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.mirostat_tau = std::stof(argv[i]);
|
sparams.mirostat_tau = std::stof(argv[i]);
|
||||||
} else if (arg == "--cfg-negative-prompt") {
|
} else if (arg == "--cfg-negative-prompt") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.cfg_negative_prompt = argv[i];
|
sparams.cfg_negative_prompt = argv[i];
|
||||||
} else if (arg == "--cfg-negative-prompt-file") {
|
} else if (arg == "--cfg-negative-prompt-file") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
|
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
||||||
if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
|
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
||||||
params.cfg_negative_prompt.pop_back();
|
sparams.cfg_negative_prompt.pop_back();
|
||||||
}
|
}
|
||||||
} else if (arg == "--cfg-scale") {
|
} else if (arg == "--cfg-scale") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.cfg_scale = std::stof(argv[i]);
|
sparams.cfg_scale = std::stof(argv[i]);
|
||||||
} else if (arg == "-b" || arg == "--batch-size") {
|
} else if (arg == "-b" || arg == "--batch-size") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -512,7 +513,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
} else if (arg == "--ignore-eos") {
|
} else if (arg == "--ignore-eos") {
|
||||||
params.ignore_eos = true;
|
params.ignore_eos = true;
|
||||||
} else if (arg == "--no-penalize-nl") {
|
} else if (arg == "--no-penalize-nl") {
|
||||||
params.penalize_nl = false;
|
sparams.penalize_nl = false;
|
||||||
} else if (arg == "-l" || arg == "--logit-bias") {
|
} else if (arg == "-l" || arg == "--logit-bias") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
std::string value_str;
|
std::string value_str;
|
||||||
try {
|
try {
|
||||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||||
params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||||
} else {
|
} else {
|
||||||
throw std::exception();
|
throw std::exception();
|
||||||
}
|
}
|
||||||
|
@ -627,6 +628,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
|
const llama_sampling_params & sparams = params.sampling_params;
|
||||||
|
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("options:\n");
|
printf("options:\n");
|
||||||
|
@ -659,19 +662,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
|
||||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
|
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
|
||||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
|
||||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
|
||||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
|
||||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
|
||||||
printf(" --mirostat N use Mirostat sampling.\n");
|
printf(" --mirostat N use Mirostat sampling.\n");
|
||||||
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
|
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
|
||||||
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
|
||||||
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
|
||||||
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
||||||
printf(" modifies the likelihood of token appearing in the completion,\n");
|
printf(" modifies the likelihood of token appearing in the completion,\n");
|
||||||
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||||
|
@ -682,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" negative prompt to use for guidance. (default: empty)\n");
|
printf(" negative prompt to use for guidance. (default: empty)\n");
|
||||||
printf(" --cfg-negative-prompt-file FNAME\n");
|
printf(" --cfg-negative-prompt-file FNAME\n");
|
||||||
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
||||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
|
||||||
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
||||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
||||||
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
||||||
|
@ -690,7 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||||
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
|
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||||
|
@ -840,7 +843,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.ignore_eos) {
|
if (params.ignore_eos) {
|
||||||
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
@ -932,127 +935,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// Sampling utils
|
|
||||||
//
|
|
||||||
|
|
||||||
llama_token llama_sample_token(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
struct llama_context * ctx_guidance,
|
|
||||||
struct llama_grammar * grammar,
|
|
||||||
const struct gpt_params & params,
|
|
||||||
const std::vector<llama_token> & last_tokens,
|
|
||||||
std::vector<llama_token_data> & candidates,
|
|
||||||
int idx) {
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
|
||||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
|
||||||
const float top_p = params.top_p;
|
|
||||||
const float tfs_z = params.tfs_z;
|
|
||||||
const float typical_p = params.typical_p;
|
|
||||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
|
||||||
const float repeat_penalty = params.repeat_penalty;
|
|
||||||
const float alpha_presence = params.presence_penalty;
|
|
||||||
const float alpha_frequency = params.frequency_penalty;
|
|
||||||
const int mirostat = params.mirostat;
|
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
|
||||||
const bool penalize_nl = params.penalize_nl;
|
|
||||||
|
|
||||||
llama_token id = 0;
|
|
||||||
|
|
||||||
float * logits = llama_get_logits_ith(ctx, idx);
|
|
||||||
|
|
||||||
// Apply params.logit_bias map
|
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
|
||||||
logits[it->first] += it->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
candidates.clear();
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
if (ctx_guidance) {
|
|
||||||
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
// apply penalties
|
|
||||||
if (!last_tokens.empty()) {
|
|
||||||
const float nl_logit = logits[llama_token_nl(ctx)];
|
|
||||||
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
|
||||||
|
|
||||||
llama_sample_repetition_penalty(ctx, &cur_p,
|
|
||||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
|
||||||
last_n_repeat, repeat_penalty);
|
|
||||||
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
|
||||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
|
||||||
last_n_repeat, alpha_frequency, alpha_presence);
|
|
||||||
|
|
||||||
if (!penalize_nl) {
|
|
||||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
|
||||||
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
|
||||||
cur_p.data[idx].logit = nl_logit;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_sample_grammar(ctx, &cur_p, grammar);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (temp <= 0) {
|
|
||||||
// Greedy sampling
|
|
||||||
id = llama_sample_token_greedy(ctx, &cur_p);
|
|
||||||
} else {
|
|
||||||
if (mirostat == 1) {
|
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
|
||||||
const int mirostat_m = 100;
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
|
||||||
} else if (mirostat == 2) {
|
|
||||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
|
||||||
} else {
|
|
||||||
// Temperature sampling
|
|
||||||
size_t min_keep = std::max(1, params.n_probs);
|
|
||||||
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
|
|
||||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
|
|
||||||
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
|
|
||||||
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
|
|
||||||
llama_sample_temp(ctx, &cur_p, temp);
|
|
||||||
|
|
||||||
{
|
|
||||||
const int n_top = 10;
|
|
||||||
LOG("top %d candidates:\n", n_top);
|
|
||||||
|
|
||||||
for (int i = 0; i < n_top; i++) {
|
|
||||||
const llama_token id = cur_p.data[i].id;
|
|
||||||
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
id = llama_sample_token(ctx, &cur_p);
|
|
||||||
|
|
||||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// printf("`%d`", candidates_p.size);
|
|
||||||
|
|
||||||
if (grammar != NULL) {
|
|
||||||
llama_grammar_accept_token(ctx, grammar, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// YAML utils
|
// YAML utils
|
||||||
//
|
//
|
||||||
|
@ -1204,6 +1086,8 @@ std::string get_sortable_timestamp() {
|
||||||
|
|
||||||
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||||
|
const llama_sampling_params & sparams = params.sampling_params;
|
||||||
|
|
||||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||||
|
@ -1250,21 +1134,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
|
|
||||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
||||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
||||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
|
||||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||||
|
|
||||||
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
|
||||||
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||||
|
|
||||||
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||||
|
@ -1277,7 +1161,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||||
|
|
||||||
fprintf(stream, "logit_bias:\n");
|
fprintf(stream, "logit_bias:\n");
|
||||||
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
||||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -1301,30 +1185,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
||||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
|
||||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
|
||||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
|
||||||
|
|
||||||
fprintf(stream, "reverse_prompt:\n");
|
fprintf(stream, "reverse_prompt:\n");
|
||||||
for (std::string ap : params.antiprompt) {
|
for (std::string ap : params.antiprompt) {
|
||||||
|
@ -1342,15 +1226,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||||
|
|
||||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||||
|
|
||||||
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||||
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
#define LOG_NO_FILE_LINE_FUNCTION
|
#define LOG_NO_FILE_LINE_FUNCTION
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
|
@ -49,31 +51,12 @@ struct gpt_params {
|
||||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
||||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||||
|
|
||||||
// sampling parameters
|
// // sampling parameters
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
struct llama_sampling_params sampling_params;
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
|
||||||
float temp = 0.80f; // 1.0 = disabled
|
|
||||||
float repeat_penalty = 1.10f; // 1.0 = disabled
|
|
||||||
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
||||||
float frequency_penalty = 0.00f; // 0.0 = disabled
|
|
||||||
float presence_penalty = 0.00f; // 0.0 = disabled
|
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
|
||||||
|
|
||||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
|
||||||
|
|
||||||
// Classifier-Free Guidance
|
|
||||||
// https://arxiv.org/abs/2306.17806
|
|
||||||
std::string cfg_negative_prompt; // string to help guidance
|
|
||||||
float cfg_scale = 1.f; // How strong is guidance
|
|
||||||
|
|
||||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||||
std::string model_draft = ""; // draft model for speculative decoding
|
std::string model_draft = ""; // draft model for speculative decoding
|
||||||
|
@ -115,7 +98,6 @@ struct gpt_params {
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool ignore_eos = false; // ignore generated EOS tokens
|
bool ignore_eos = false; // ignore generated EOS tokens
|
||||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
|
@ -180,36 +162,6 @@ std::string llama_detokenize_bpe(
|
||||||
llama_context * ctx,
|
llama_context * ctx,
|
||||||
const std::vector<llama_token> & tokens);
|
const std::vector<llama_token> & tokens);
|
||||||
|
|
||||||
//
|
|
||||||
// Sampling utils
|
|
||||||
//
|
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
|
||||||
//
|
|
||||||
// required:
|
|
||||||
// - ctx: context to use for sampling
|
|
||||||
// - params: sampling parameters
|
|
||||||
//
|
|
||||||
// optional:
|
|
||||||
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
|
||||||
// - grammar: grammar to use for sampling, ignore if NULL
|
|
||||||
// - last_tokens: needed for repetition penalty, ignore if empty
|
|
||||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
|
||||||
//
|
|
||||||
// returns:
|
|
||||||
// - token: sampled token
|
|
||||||
// - candidates: vector of candidate tokens
|
|
||||||
//
|
|
||||||
llama_token llama_sample_token(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
struct llama_context * ctx_guidance,
|
|
||||||
struct llama_grammar * grammar,
|
|
||||||
const struct gpt_params & params,
|
|
||||||
const std::vector<llama_token> & last_tokens,
|
|
||||||
std::vector<llama_token_data> & candidates,
|
|
||||||
int idx = 0);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// YAML utils
|
// YAML utils
|
||||||
//
|
//
|
||||||
|
|
166
common/sampling.cpp
Normal file
166
common/sampling.cpp
Normal file
|
@ -0,0 +1,166 @@
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
|
llama_sampling_context::~llama_sampling_context() {
|
||||||
|
for (auto & it : sequence_contexts) {
|
||||||
|
if (it.second.grammar != NULL) {
|
||||||
|
llama_grammar_free(it.second.grammar);
|
||||||
|
it.second.grammar = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sampling_context llama_sampling_context_init(
|
||||||
|
const struct gpt_params & params,
|
||||||
|
llama_grammar * grammar) {
|
||||||
|
llama_sampling_context result;
|
||||||
|
|
||||||
|
result.params = params.sampling_params;
|
||||||
|
result.grammar = grammar;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: Creates the context if it doesn't exist, so this always return something.
|
||||||
|
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq) {
|
||||||
|
const auto it = ctx_sampling.sequence_contexts.find(seq);
|
||||||
|
if (it != ctx_sampling.sequence_contexts.end()) {
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
llama_sampler_sequence_context new_ctx = {
|
||||||
|
2.0f * ctx_sampling.params.mirostat_tau,
|
||||||
|
ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
|
||||||
|
};
|
||||||
|
return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llama_sampling_context_reset(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq) {
|
||||||
|
const auto it = ctx_sampling.sequence_contexts.find(seq);
|
||||||
|
if (it == ctx_sampling.sequence_contexts.end()) return false;
|
||||||
|
if (it->second.grammar != NULL) {
|
||||||
|
llama_grammar_free(it->second.grammar);
|
||||||
|
it->second.grammar = NULL;
|
||||||
|
}
|
||||||
|
ctx_sampling.sequence_contexts.erase(it);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
struct llama_context * ctx_guidance,
|
||||||
|
struct llama_sampling_context & ctx_sampling,
|
||||||
|
const std::vector<llama_token> & last_tokens,
|
||||||
|
std::vector<llama_token_data> & candidates,
|
||||||
|
const int idx,
|
||||||
|
llama_seq_id seq) {
|
||||||
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
|
const llama_sampling_params & params = ctx_sampling.params;
|
||||||
|
const float temp = params.temp;
|
||||||
|
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||||
|
const float top_p = params.top_p;
|
||||||
|
const float tfs_z = params.tfs_z;
|
||||||
|
const float typical_p = params.typical_p;
|
||||||
|
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||||
|
const float repeat_penalty = params.repeat_penalty;
|
||||||
|
const float alpha_presence = params.presence_penalty;
|
||||||
|
const float alpha_frequency = params.frequency_penalty;
|
||||||
|
const int mirostat = params.mirostat;
|
||||||
|
const float mirostat_tau = params.mirostat_tau;
|
||||||
|
const float mirostat_eta = params.mirostat_eta;
|
||||||
|
const bool penalize_nl = params.penalize_nl;
|
||||||
|
|
||||||
|
llama_token id = 0;
|
||||||
|
|
||||||
|
float * logits = llama_get_logits_ith(ctx, idx);
|
||||||
|
|
||||||
|
// Apply params.logit_bias map
|
||||||
|
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||||
|
logits[it->first] += it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates.clear();
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||||
|
|
||||||
|
if (ctx_guidance) {
|
||||||
|
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
// apply penalties
|
||||||
|
if (!last_tokens.empty()) {
|
||||||
|
const float nl_logit = logits[llama_token_nl(ctx)];
|
||||||
|
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
||||||
|
|
||||||
|
llama_sample_repetition_penalty(ctx, &cur_p,
|
||||||
|
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||||
|
last_n_repeat, repeat_penalty);
|
||||||
|
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
||||||
|
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||||
|
last_n_repeat, alpha_frequency, alpha_presence);
|
||||||
|
|
||||||
|
if (!penalize_nl) {
|
||||||
|
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||||
|
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
||||||
|
cur_p.data[idx].logit = nl_logit;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
|
||||||
|
|
||||||
|
if (ctx_seq.grammar != NULL) {
|
||||||
|
llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (temp <= 0) {
|
||||||
|
// Greedy sampling
|
||||||
|
id = llama_sample_token_greedy(ctx, &cur_p);
|
||||||
|
} else {
|
||||||
|
if (mirostat == 1) {
|
||||||
|
const int mirostat_m = 100;
|
||||||
|
llama_sample_temp(ctx, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
|
||||||
|
} else if (mirostat == 2) {
|
||||||
|
llama_sample_temp(ctx, &cur_p, temp);
|
||||||
|
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
|
||||||
|
} else {
|
||||||
|
// Temperature sampling
|
||||||
|
size_t min_keep = std::max(1, params.n_probs);
|
||||||
|
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
|
||||||
|
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
|
||||||
|
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
|
||||||
|
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
|
||||||
|
llama_sample_temp(ctx, &cur_p, temp);
|
||||||
|
|
||||||
|
{
|
||||||
|
const int n_top = 10;
|
||||||
|
LOG("top %d candidates:\n", n_top);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_top; i++) {
|
||||||
|
const llama_token id = cur_p.data[i].id;
|
||||||
|
(void)id; // To avoid a warning that id is unused when logging is disabled.
|
||||||
|
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
id = llama_sample_token(ctx, &cur_p);
|
||||||
|
|
||||||
|
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx_seq.grammar != NULL) {
|
||||||
|
llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
return id;
|
||||||
|
}
|
108
common/sampling.h
Normal file
108
common/sampling.h
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
// sampling parameters
|
||||||
|
typedef struct llama_sampling_params {
|
||||||
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
|
float typical_p = 1.00f; // 1.0 = disabled
|
||||||
|
float temp = 0.80f; // 1.0 = disabled
|
||||||
|
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||||
|
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
float presence_penalty = 0.00f; // 0.0 = disabled
|
||||||
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
|
||||||
|
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||||
|
|
||||||
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
|
|
||||||
|
// Classifier-Free Guidance
|
||||||
|
// https://arxiv.org/abs/2306.17806
|
||||||
|
std::string cfg_negative_prompt; // string to help guidance
|
||||||
|
float cfg_scale = 1.f; // How strong is guidance
|
||||||
|
|
||||||
|
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||||
|
|
||||||
|
} llama_sampling_params;
|
||||||
|
|
||||||
|
// per-sequence sampler context
|
||||||
|
typedef struct llama_sampler_sequence_context {
|
||||||
|
float mirostat_mu; // mirostat sampler state
|
||||||
|
llama_grammar * grammar;
|
||||||
|
} llama_sampler_sequence_context;
|
||||||
|
|
||||||
|
// general sampler context
|
||||||
|
typedef struct llama_sampling_context {
|
||||||
|
~llama_sampling_context();
|
||||||
|
|
||||||
|
// parameters that will be used for sampling and when creating
|
||||||
|
// new llama_sampler_sequence_context instances
|
||||||
|
llama_sampling_params params;
|
||||||
|
|
||||||
|
// map of sequence ids to sampler contexts
|
||||||
|
std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
|
||||||
|
|
||||||
|
// when non-NULL, new instances of llama_sampler_sequence_context
|
||||||
|
// will get a copy of the grammar here
|
||||||
|
// note: only the pointer is stored here, it is not a copy of
|
||||||
|
// the grammar and shouldn't be freed
|
||||||
|
llama_grammar * grammar;
|
||||||
|
} llama_sampling_context;
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
// Create a new sampling context instance.
|
||||||
|
llama_sampling_context llama_sampling_context_init(
|
||||||
|
const struct gpt_params & params,
|
||||||
|
llama_grammar * grammar = NULL);
|
||||||
|
|
||||||
|
// Fetches the sampler context for the specified sequence id (defaults to 0).
|
||||||
|
// If the context for that sequence id doesn't already exist, it will be created with
|
||||||
|
// default values based on the parameters in the ctx_sampling argument.
|
||||||
|
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq = 0);
|
||||||
|
|
||||||
|
// Reset the sampler context for the supplied sequence id (defaults to 0).
|
||||||
|
// This is necessary to reuse a sequence id or free memory used by sequences
|
||||||
|
// that are no longer required.
|
||||||
|
bool llama_sampling_context_reset(
|
||||||
|
llama_sampling_context & ctx_sampling,
|
||||||
|
const llama_seq_id seq = 0);
|
||||||
|
|
||||||
|
// this is a common sampling function used across the examples for convenience
|
||||||
|
// it can serve as a starting point for implementing your own sampling function
|
||||||
|
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||||
|
// llama_sampling_context_reset when a sequence ends
|
||||||
|
//
|
||||||
|
// required:
|
||||||
|
// - ctx: context to use for sampling
|
||||||
|
// - ctx_sampling: sampling-specific context
|
||||||
|
//
|
||||||
|
// optional:
|
||||||
|
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
||||||
|
// - last_tokens: needed for repetition penalty, ignore if empty
|
||||||
|
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||||
|
// - seq: sequence id to associate sampler state with
|
||||||
|
//
|
||||||
|
// returns:
|
||||||
|
// - token: sampled token
|
||||||
|
// - candidates: vector of candidate tokens
|
||||||
|
//
|
||||||
|
llama_token llama_sampling_sample(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
struct llama_context * ctx_guidance,
|
||||||
|
struct llama_sampling_context & ctx_sampling,
|
||||||
|
const std::vector<llama_token> & last_tokens,
|
||||||
|
std::vector<llama_token_data> & candidates,
|
||||||
|
const int idx = 0,
|
||||||
|
llama_seq_id seq = 0);
|
238
convert-bloom-hf-to-gguf.py
Executable file
238
convert-bloom-hf-to-gguf.py
Executable file
|
@ -0,0 +1,238 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# HF bloom --> gguf conversion
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer # type: ignore[import]
|
||||||
|
|
||||||
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
|
def count_model_parts(dir_model: Path) -> int:
|
||||||
|
num_parts = 0
|
||||||
|
for filename in os.listdir(dir_model):
|
||||||
|
if filename.startswith("pytorch_model-"):
|
||||||
|
num_parts += 1
|
||||||
|
|
||||||
|
if num_parts > 0:
|
||||||
|
print("gguf: found " + str(num_parts) + " model parts")
|
||||||
|
return num_parts
|
||||||
|
|
||||||
|
|
||||||
|
# Supported Models:
|
||||||
|
# https://huggingface.co/bigscience/bloom-1b7
|
||||||
|
# https://huggingface.co/bigscience/bloom-3b
|
||||||
|
# https://huggingface.co/bigscience/bloom-7b1
|
||||||
|
# https://huggingface.co/Langboat/bloom-1b4-zh
|
||||||
|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
|
||||||
|
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||||
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
|
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||||
|
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
dir_model = args.model
|
||||||
|
ftype = args.ftype
|
||||||
|
if not dir_model.is_dir():
|
||||||
|
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# possible tensor data types
|
||||||
|
# ftype == 0 -> float32
|
||||||
|
# ftype == 1 -> float16
|
||||||
|
|
||||||
|
# map from ftype to string
|
||||||
|
ftype_str = ["f32", "f16"]
|
||||||
|
|
||||||
|
if args.outfile is not None:
|
||||||
|
fname_out = args.outfile
|
||||||
|
else:
|
||||||
|
# output in the same directory as the model by default
|
||||||
|
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||||
|
|
||||||
|
print("gguf: loading model "+dir_model.name)
|
||||||
|
|
||||||
|
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||||
|
hparams = json.load(f)
|
||||||
|
|
||||||
|
if hparams["architectures"][0] != "BloomForCausalLM":
|
||||||
|
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# get number of model parts
|
||||||
|
num_parts = count_model_parts(dir_model)
|
||||||
|
|
||||||
|
ARCH=gguf.MODEL_ARCH.BLOOM
|
||||||
|
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||||
|
|
||||||
|
print("gguf: get model metadata")
|
||||||
|
|
||||||
|
block_count = hparams["n_layer"]
|
||||||
|
|
||||||
|
gguf_writer.add_name("Bloom")
|
||||||
|
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
|
||||||
|
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
|
||||||
|
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
|
||||||
|
gguf_writer.add_embedding_length(n_embed)
|
||||||
|
gguf_writer.add_feed_forward_length(4 * n_embed)
|
||||||
|
gguf_writer.add_block_count(block_count)
|
||||||
|
gguf_writer.add_head_count(n_head)
|
||||||
|
gguf_writer.add_head_count_kv(n_head)
|
||||||
|
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||||
|
gguf_writer.add_file_type(ftype)
|
||||||
|
|
||||||
|
# TOKENIZATION
|
||||||
|
|
||||||
|
print("gguf: get tokenizer metadata")
|
||||||
|
|
||||||
|
tokens: list[bytearray] = []
|
||||||
|
scores: list[float] = []
|
||||||
|
toktypes: list[int] = []
|
||||||
|
|
||||||
|
# gpt2 tokenizer
|
||||||
|
gguf_writer.add_tokenizer_model("gpt2")
|
||||||
|
|
||||||
|
print("gguf: get gpt2 tokenizer vocab")
|
||||||
|
|
||||||
|
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||||
|
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||||
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
|
assert max(tokenizer.vocab.values()) < vocab_size
|
||||||
|
|
||||||
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||||
|
|
||||||
|
for i in range(vocab_size):
|
||||||
|
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||||
|
scores.append(0.0) # dummy
|
||||||
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
|
||||||
|
gguf_writer.add_token_list(tokens)
|
||||||
|
gguf_writer.add_token_scores(scores)
|
||||||
|
gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||||
|
special_vocab.add_to_gguf(gguf_writer)
|
||||||
|
|
||||||
|
# TENSORS
|
||||||
|
|
||||||
|
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
|
||||||
|
|
||||||
|
# params for qkv transform
|
||||||
|
n_head_kv = hparams.get("n_head_kv", n_head)
|
||||||
|
head_dim = n_embed // n_head
|
||||||
|
|
||||||
|
# tensor info
|
||||||
|
print("gguf: get tensor metadata")
|
||||||
|
|
||||||
|
if num_parts == 0:
|
||||||
|
part_names = iter(("pytorch_model.bin",))
|
||||||
|
else:
|
||||||
|
part_names = (
|
||||||
|
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||||
|
)
|
||||||
|
|
||||||
|
for part_name in part_names:
|
||||||
|
if args.vocab_only:
|
||||||
|
break
|
||||||
|
print("gguf: loading model part '" + part_name + "'")
|
||||||
|
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||||
|
|
||||||
|
has_lm_head = True
|
||||||
|
if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
|
||||||
|
has_lm_head = False
|
||||||
|
|
||||||
|
for original_name in model_part.keys():
|
||||||
|
data = model_part[original_name]
|
||||||
|
name = re.sub(r'transformer\.', '', original_name)
|
||||||
|
|
||||||
|
old_dtype = data.dtype
|
||||||
|
|
||||||
|
# convert any unsupported data types to float32
|
||||||
|
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||||
|
data = data.to(torch.float32)
|
||||||
|
|
||||||
|
data = data.squeeze().numpy()
|
||||||
|
|
||||||
|
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
|
||||||
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||||
|
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||||
|
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||||
|
data = np.concatenate(
|
||||||
|
(qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
|
||||||
|
axis=0
|
||||||
|
)
|
||||||
|
print("re-format attention.linear_qkv.weight")
|
||||||
|
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
|
||||||
|
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
|
||||||
|
data = np.concatenate(
|
||||||
|
(qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 2, :].reshape((n_embed,))),
|
||||||
|
axis=0
|
||||||
|
)
|
||||||
|
print("re-format attention.linear_qkv.bias")
|
||||||
|
|
||||||
|
# map tensor names
|
||||||
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
|
if new_name is None:
|
||||||
|
print("Can not map tensor '" + name + "'")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
n_dims = len(data.shape)
|
||||||
|
data_dtype = data.dtype
|
||||||
|
|
||||||
|
# if f32 desired, convert any float16 to float32
|
||||||
|
if ftype == 0 and data_dtype == np.float16:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||||
|
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||||
|
data = data.astype(np.float32)
|
||||||
|
|
||||||
|
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||||
|
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||||
|
data = data.astype(np.float16)
|
||||||
|
|
||||||
|
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||||
|
|
||||||
|
gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
if not has_lm_head and name == "word_embeddings.weight":
|
||||||
|
gguf_writer.add_tensor("output.weight", data)
|
||||||
|
print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
|
||||||
|
|
||||||
|
|
||||||
|
print("gguf: write header")
|
||||||
|
gguf_writer.write_header_to_file()
|
||||||
|
print("gguf: write metadata")
|
||||||
|
gguf_writer.write_kv_data_to_file()
|
||||||
|
if not args.vocab_only:
|
||||||
|
print("gguf: write tensors")
|
||||||
|
gguf_writer.write_tensors_to_file()
|
||||||
|
|
||||||
|
gguf_writer.close()
|
||||||
|
|
||||||
|
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||||
|
print("")
|
|
@ -25,6 +25,7 @@ else()
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(batched)
|
add_subdirectory(batched)
|
||||||
|
add_subdirectory(batched-bench)
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
add_subdirectory(parallel)
|
add_subdirectory(parallel)
|
||||||
add_subdirectory(embd-input)
|
add_subdirectory(embd-input)
|
||||||
|
|
5
examples/batched-bench/CMakeLists.txt
Normal file
5
examples/batched-bench/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET batched-bench)
|
||||||
|
add_executable(${TARGET} batched-bench.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
51
examples/batched-bench/README.md
Normal file
51
examples/batched-bench/README.md
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
# llama.cpp/example/batched-bench
|
||||||
|
|
||||||
|
Benchmark the batched decoding performance of `llama.cpp`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
There are 2 modes of operation:
|
||||||
|
|
||||||
|
- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
|
||||||
|
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
||||||
|
|
||||||
|
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||||
|
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
|
||||||
|
|
||||||
|
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||||
|
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
|
||||||
|
|
||||||
|
# custom set of batches
|
||||||
|
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
||||||
|
```
|
||||||
|
|
||||||
|
## Sample results
|
||||||
|
|
||||||
|
- `PP` - prompt tokens per batch
|
||||||
|
- `TG` - generated tokens per batch
|
||||||
|
- `B` - number of batches
|
||||||
|
- `N_KV` - required KV cache size
|
||||||
|
- `T_PP` - prompt processing time (i.e. time to first token)
|
||||||
|
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
|
||||||
|
- `T_TG` - time to generate all batches
|
||||||
|
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
|
||||||
|
- `T` - total time
|
||||||
|
- `S` - total speed (i.e. all tokens / total time)
|
||||||
|
|
||||||
|
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|
||||||
|
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|
||||||
|
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
|
||||||
|
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
|
||||||
|
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
|
||||||
|
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
|
||||||
|
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
|
||||||
|
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
|
||||||
|
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
|
||||||
|
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
|
||||||
|
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
|
||||||
|
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
||||||
|
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
||||||
|
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
251
examples/batched-bench/batched-bench.cpp
Normal file
251
examples/batched-bench/batched-bench.cpp
Normal file
|
@ -0,0 +1,251 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// mutates the input string
|
||||||
|
static std::vector<int> parse_list(char * p) {
|
||||||
|
std::vector<int> ret;
|
||||||
|
|
||||||
|
char * q = p;
|
||||||
|
|
||||||
|
while (*p) {
|
||||||
|
if (*p == ',') {
|
||||||
|
*p = '\0';
|
||||||
|
ret.push_back(std::atoi(q));
|
||||||
|
q = p + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.push_back(std::atoi(q));
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_params params;
|
||||||
|
|
||||||
|
if (argc == 1 || argv[1][0] == '-') {
|
||||||
|
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
|
||||||
|
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||||
|
printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||||
|
return 1 ;
|
||||||
|
}
|
||||||
|
|
||||||
|
int n_kv_max = 2048;
|
||||||
|
int is_pp_shared = 0;
|
||||||
|
int n_gpu_layers = 0;
|
||||||
|
int mmq = 0;
|
||||||
|
|
||||||
|
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
|
||||||
|
std::vector<int> n_tg = { 128, 256, };
|
||||||
|
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
|
||||||
|
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
|
||||||
|
|
||||||
|
if (argc >= 2) {
|
||||||
|
params.model = argv[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 3) {
|
||||||
|
n_kv_max = std::atoi(argv[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 4) {
|
||||||
|
is_pp_shared = std::atoi(argv[3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 5) {
|
||||||
|
n_gpu_layers = std::atoi(argv[4]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 6) {
|
||||||
|
mmq = std::atoi(argv[5]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 7) {
|
||||||
|
n_pp = parse_list(argv[6]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 8) {
|
||||||
|
n_tg = parse_list(argv[7]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argc >= 9) {
|
||||||
|
n_pl = parse_list(argv[8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// init LLM
|
||||||
|
|
||||||
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
|
// initialize the model
|
||||||
|
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
|
||||||
|
model_params.n_gpu_layers = n_gpu_layers;
|
||||||
|
|
||||||
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
|
||||||
|
ctx_params.seed = 1234;
|
||||||
|
ctx_params.n_ctx = n_kv_max;
|
||||||
|
ctx_params.n_batch = 512;
|
||||||
|
ctx_params.mul_mat_q = mmq;
|
||||||
|
|
||||||
|
ctx_params.n_threads = params.n_threads;
|
||||||
|
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
|
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_init(n_kv_max, 0);
|
||||||
|
|
||||||
|
// decode in batches of ctx_params.n_batch tokens
|
||||||
|
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
||||||
|
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||||
|
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||||
|
|
||||||
|
llama_batch batch_view = {
|
||||||
|
n_tokens,
|
||||||
|
batch.token + i,
|
||||||
|
nullptr,
|
||||||
|
batch.pos + i,
|
||||||
|
batch.seq_id + i,
|
||||||
|
batch.logits + i,
|
||||||
|
0, 0, 0, // unused
|
||||||
|
};
|
||||||
|
|
||||||
|
const int ret = llama_decode(ctx, batch_view);
|
||||||
|
if (ret != 0) {
|
||||||
|
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
// warm up
|
||||||
|
{
|
||||||
|
batch.n_tokens = 16;
|
||||||
|
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
batch.token[i] = 0;
|
||||||
|
batch.pos[i] = i;
|
||||||
|
batch.seq_id[i] = 0;
|
||||||
|
batch.logits[i] = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
|
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||||
|
|
||||||
|
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||||
|
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
||||||
|
for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
|
||||||
|
const int pp = n_pp[i_pp];
|
||||||
|
const int tg = n_tg[i_tg];
|
||||||
|
const int pl = n_pl[i_pl];
|
||||||
|
|
||||||
|
const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
|
||||||
|
|
||||||
|
if (n_ctx_req > n_kv_max) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
batch.n_tokens = is_pp_shared ? pp : pl*pp;
|
||||||
|
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
batch.token[i] = 0;
|
||||||
|
batch.pos[i] = i;
|
||||||
|
batch.seq_id[i] = 0;
|
||||||
|
batch.logits[i] = false;
|
||||||
|
}
|
||||||
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
|
||||||
|
const auto t_pp_start = ggml_time_us();
|
||||||
|
|
||||||
|
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||||
|
|
||||||
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_pp_shared) {
|
||||||
|
for (int32_t i = 1; i < pl; ++i) {
|
||||||
|
llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto t_pp_end = ggml_time_us();
|
||||||
|
|
||||||
|
const auto t_tg_start = ggml_time_us();
|
||||||
|
|
||||||
|
for (int i = 0; i < tg; ++i) {
|
||||||
|
batch.n_tokens = pl;
|
||||||
|
|
||||||
|
for (int j = 0; j < pl; ++j) {
|
||||||
|
batch.token[j] = 0;
|
||||||
|
batch.pos[j] = pp + i;
|
||||||
|
batch.seq_id[j] = j;
|
||||||
|
batch.logits[j] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto t_tg_end = ggml_time_us();
|
||||||
|
|
||||||
|
const int32_t n_kv = n_ctx_req;
|
||||||
|
|
||||||
|
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
|
||||||
|
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
|
||||||
|
const float t = t_pp + t_tg;
|
||||||
|
|
||||||
|
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
|
||||||
|
const float speed_tg = pl*tg / t_tg;
|
||||||
|
const float speed = n_kv / t;
|
||||||
|
|
||||||
|
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
fprintf(stderr, "\n\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
9
examples/batched.swift/.gitignore
vendored
Normal file
9
examples/batched.swift/.gitignore
vendored
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
.DS_Store
|
||||||
|
/.build
|
||||||
|
/Packages
|
||||||
|
xcuserdata/
|
||||||
|
DerivedData/
|
||||||
|
.swiftpm/configuration/registries.json
|
||||||
|
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
|
||||||
|
.netrc
|
||||||
|
batched_swift
|
6
examples/batched.swift/Makefile
Executable file
6
examples/batched.swift/Makefile
Executable file
|
@ -0,0 +1,6 @@
|
||||||
|
.PHONY: build
|
||||||
|
|
||||||
|
build:
|
||||||
|
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||||
|
rm -f ./batched_swift
|
||||||
|
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
|
22
examples/batched.swift/Package.swift
Normal file
22
examples/batched.swift/Package.swift
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
// swift-tools-version: 5.5
|
||||||
|
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||||
|
|
||||||
|
import PackageDescription
|
||||||
|
|
||||||
|
let package = Package(
|
||||||
|
name: "batched_swift",
|
||||||
|
platforms: [.macOS(.v12)],
|
||||||
|
dependencies: [
|
||||||
|
.package(name: "llama", path: "../../"),
|
||||||
|
],
|
||||||
|
targets: [
|
||||||
|
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||||
|
// Targets can depend on other targets in this package and products from dependencies.
|
||||||
|
.executableTarget(
|
||||||
|
name: "batched_swift",
|
||||||
|
dependencies: ["llama"],
|
||||||
|
path: "Sources",
|
||||||
|
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
||||||
|
),
|
||||||
|
]
|
||||||
|
)
|
4
examples/batched.swift/README.md
Normal file
4
examples/batched.swift/README.md
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
This is a swift clone of `examples/batched`.
|
||||||
|
|
||||||
|
$ `make`
|
||||||
|
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
|
255
examples/batched.swift/Sources/main.swift
Normal file
255
examples/batched.swift/Sources/main.swift
Normal file
|
@ -0,0 +1,255 @@
|
||||||
|
import Foundation
|
||||||
|
import llama
|
||||||
|
|
||||||
|
let arguments = CommandLine.arguments
|
||||||
|
|
||||||
|
// Check that we have at least one argument (the model path)
|
||||||
|
guard arguments.count > 1 else {
|
||||||
|
print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
let modelPath: String = arguments[1]
|
||||||
|
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
|
||||||
|
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
|
||||||
|
|
||||||
|
// total length of the sequences including the prompt
|
||||||
|
let n_len: Int = 32
|
||||||
|
|
||||||
|
// init LLM
|
||||||
|
llama_backend_init(false)
|
||||||
|
defer {
|
||||||
|
llama_backend_free()
|
||||||
|
}
|
||||||
|
|
||||||
|
let model_params = llama_model_default_params()
|
||||||
|
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
|
||||||
|
print("Failed to load model")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer {
|
||||||
|
llama_free_model(model)
|
||||||
|
}
|
||||||
|
|
||||||
|
var tokens = tokenize(text: prompt, add_bos: true)
|
||||||
|
|
||||||
|
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
|
||||||
|
|
||||||
|
var context_params = llama_context_default_params()
|
||||||
|
context_params.seed = 1234
|
||||||
|
context_params.n_ctx = n_kv_req
|
||||||
|
context_params.n_batch = UInt32(max(n_len, n_parallel))
|
||||||
|
context_params.n_threads = 8
|
||||||
|
context_params.n_threads_batch = 8
|
||||||
|
|
||||||
|
let context = llama_new_context_with_model(model, context_params)
|
||||||
|
guard context != nil else {
|
||||||
|
print("Failed to initialize context")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
defer {
|
||||||
|
llama_free(context)
|
||||||
|
}
|
||||||
|
|
||||||
|
let n_ctx = llama_n_ctx(context)
|
||||||
|
|
||||||
|
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
|
||||||
|
|
||||||
|
if n_kv_req > n_ctx {
|
||||||
|
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
var buffer: [CChar] = []
|
||||||
|
for id: llama_token in tokens {
|
||||||
|
print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
|
||||||
|
}
|
||||||
|
|
||||||
|
print("\n")
|
||||||
|
|
||||||
|
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
|
||||||
|
defer {
|
||||||
|
llama_batch_free(batch)
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluate the initial prompt
|
||||||
|
batch.n_tokens = Int32(tokens.count)
|
||||||
|
|
||||||
|
for (i, token) in tokens.enumerated() {
|
||||||
|
batch.token[i] = token
|
||||||
|
batch.pos[i] = Int32(i)
|
||||||
|
batch.seq_id[i] = 0
|
||||||
|
batch.logits[i] = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// llama_decode will output logits only for the last token of the prompt
|
||||||
|
batch.logits[Int(batch.n_tokens) - 1] = 1
|
||||||
|
|
||||||
|
if llama_decode(context, batch) != 0 {
|
||||||
|
print("llama_decode() failed")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 1 ..< n_parallel {
|
||||||
|
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
|
||||||
|
}
|
||||||
|
|
||||||
|
if n_parallel > 1 {
|
||||||
|
print("generating \(n_parallel) sequences ...\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
var streams: [String] = .init(repeating: "", count: n_parallel)
|
||||||
|
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
|
||||||
|
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
|
||||||
|
|
||||||
|
var n_cur = batch.n_tokens
|
||||||
|
var n_decode = 0
|
||||||
|
|
||||||
|
let t_main_start = ggml_time_us()
|
||||||
|
|
||||||
|
while n_cur <= n_len {
|
||||||
|
// prepare the next batch
|
||||||
|
batch.n_tokens = 0
|
||||||
|
|
||||||
|
// sample the next token for each parallel sequence / stream
|
||||||
|
for i in 0 ..< n_parallel {
|
||||||
|
if i_batch[i] < 0 {
|
||||||
|
// the stream has already finished
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
var n_vocab = llama_n_vocab(model)
|
||||||
|
var logits = llama_get_logits_ith(context, i_batch[i])
|
||||||
|
|
||||||
|
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
|
||||||
|
|
||||||
|
for token_id in 0 ..< n_vocab {
|
||||||
|
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
var candidates_p: llama_token_data_array = .init(
|
||||||
|
data: &candidates,
|
||||||
|
size: candidates.count,
|
||||||
|
sorted: false
|
||||||
|
)
|
||||||
|
|
||||||
|
let top_k: Int32 = 40
|
||||||
|
let top_p: Float = 0.9
|
||||||
|
let temp: Float = 0.4
|
||||||
|
|
||||||
|
llama_sample_top_k(context, &candidates_p, top_k, 1)
|
||||||
|
llama_sample_top_p(context, &candidates_p, top_p, 1)
|
||||||
|
llama_sample_temp(context, &candidates_p, temp)
|
||||||
|
|
||||||
|
let new_token_id = llama_sample_token(context, &candidates_p)
|
||||||
|
|
||||||
|
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||||
|
|
||||||
|
// is it an end of stream? -> mark the stream as finished
|
||||||
|
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||||
|
i_batch[i] = -1
|
||||||
|
// print("")
|
||||||
|
if n_parallel > 1 {
|
||||||
|
print("stream \(i) finished at n_cur = \(n_cur)")
|
||||||
|
}
|
||||||
|
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
|
||||||
|
|
||||||
|
// if there is only one stream, we print immediately to stdout
|
||||||
|
if n_parallel == 1 {
|
||||||
|
print(nextStringPiece, terminator: "")
|
||||||
|
}
|
||||||
|
streams[i] += nextStringPiece
|
||||||
|
|
||||||
|
// push this new token for next evaluation
|
||||||
|
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||||
|
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||||
|
batch.seq_id[Int(batch.n_tokens)] = Int32(i)
|
||||||
|
batch.logits[Int(batch.n_tokens)] = 1
|
||||||
|
|
||||||
|
i_batch[i] = batch.n_tokens
|
||||||
|
|
||||||
|
batch.n_tokens += 1
|
||||||
|
|
||||||
|
n_decode += 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// all streams are finished
|
||||||
|
if batch.n_tokens == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
n_cur += 1
|
||||||
|
|
||||||
|
// evaluate the current batch with the transformer model
|
||||||
|
if llama_decode(context, batch) != 0 {
|
||||||
|
print("llama_decode() failed")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if n_parallel > 1 {
|
||||||
|
print("\n")
|
||||||
|
for (i, stream) in streams.enumerated() {
|
||||||
|
print("sequence \(i):\n\n\(prompt)\(stream)\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let t_main_end = ggml_time_us()
|
||||||
|
|
||||||
|
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
|
||||||
|
|
||||||
|
llama_print_timings(context)
|
||||||
|
|
||||||
|
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||||
|
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||||
|
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||||
|
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
|
||||||
|
var swiftTokens: [llama_token] = []
|
||||||
|
for i in 0 ..< tokenCount {
|
||||||
|
swiftTokens.append(tokens[Int(i)])
|
||||||
|
}
|
||||||
|
tokens.deallocate()
|
||||||
|
return swiftTokens
|
||||||
|
}
|
||||||
|
|
||||||
|
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
|
||||||
|
var result = [CChar](repeating: 0, count: 8)
|
||||||
|
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
|
||||||
|
if nTokens < 0 {
|
||||||
|
if result.count >= -Int(nTokens) {
|
||||||
|
result.removeLast(-Int(nTokens))
|
||||||
|
} else {
|
||||||
|
result.removeAll()
|
||||||
|
}
|
||||||
|
let check = llama_token_to_piece(
|
||||||
|
model,
|
||||||
|
token,
|
||||||
|
&result,
|
||||||
|
Int32(result.count)
|
||||||
|
)
|
||||||
|
assert(check == nTokens)
|
||||||
|
} else {
|
||||||
|
result.removeLast(result.count - Int(nTokens))
|
||||||
|
}
|
||||||
|
if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
|
||||||
|
return utfString
|
||||||
|
} else {
|
||||||
|
buffer.append(contentsOf: result)
|
||||||
|
let data = Data(buffer.map { UInt8(bitPattern: $0) })
|
||||||
|
if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
|
||||||
|
buffer = []
|
||||||
|
}
|
||||||
|
guard let bufferString = String(data: data, encoding: .utf8) else {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
buffer = []
|
||||||
|
return bufferString
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
|
||||||
ctx_params.seed = 1234;
|
ctx_params.seed = 1234;
|
||||||
ctx_params.n_ctx = n_kv_req;
|
ctx_params.n_ctx = n_kv_req;
|
||||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
ctx_params.n_batch = std::max(n_len, n_parallel);
|
||||||
ctx_params.n_threads = params.n_threads;
|
ctx_params.n_threads = params.n_threads;
|
||||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
|
@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
|
||||||
llama_token sampling_id(struct MyModel* mymodel) {
|
llama_token sampling_id(struct MyModel* mymodel) {
|
||||||
llama_context* ctx = mymodel->ctx;
|
llama_context* ctx = mymodel->ctx;
|
||||||
gpt_params params = mymodel->params;
|
gpt_params params = mymodel->params;
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
// int n_ctx = llama_n_ctx(ctx);
|
// int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
// out of user input, sample next token
|
// out of user input, sample next token
|
||||||
const float temp = params.temp;
|
const float temp = sparams.temp;
|
||||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
|
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
|
||||||
const float top_p = params.top_p;
|
const float top_p = sparams.top_p;
|
||||||
const float tfs_z = params.tfs_z;
|
const float tfs_z = sparams.tfs_z;
|
||||||
const float typical_p = params.typical_p;
|
const float typical_p = sparams.typical_p;
|
||||||
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||||
// const float repeat_penalty = params.repeat_penalty;
|
// const float repeat_penalty = params.repeat_penalty;
|
||||||
// const float alpha_presence = params.presence_penalty;
|
// const float alpha_presence = params.presence_penalty;
|
||||||
// const float alpha_frequency = params.frequency_penalty;
|
// const float alpha_frequency = params.frequency_penalty;
|
||||||
const int mirostat = params.mirostat;
|
const int mirostat = sparams.mirostat;
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
const float mirostat_tau = sparams.mirostat_tau;
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
const float mirostat_eta = sparams.mirostat_eta;
|
||||||
// const bool penalize_nl = params.penalize_nl;
|
// const bool penalize_nl = params.penalize_nl;
|
||||||
|
|
||||||
llama_token id = 0;
|
llama_token id = 0;
|
||||||
|
@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
|
||||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
// Apply params.logit_bias map
|
// Apply params.logit_bias map
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||||
logits[it->first] += it->second;
|
logits[it->first] += it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -104,6 +104,7 @@ static void sigint_handler(int signo) {
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
@ -206,7 +207,7 @@ int main(int argc, char ** argv) {
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (params.cfg_scale > 1.f) {
|
if (sparams.cfg_scale > 1.f) {
|
||||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||||
}
|
}
|
||||||
|
@ -269,9 +270,9 @@ int main(int argc, char ** argv) {
|
||||||
int guidance_offset = 0;
|
int guidance_offset = 0;
|
||||||
int original_prompt_len = 0;
|
int original_prompt_len = 0;
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
@ -312,7 +313,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||||
|
@ -358,7 +359,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
|
@ -376,8 +377,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -434,6 +435,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
|
||||||
|
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
|
||||||
|
@ -552,7 +554,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
|
||||||
|
|
||||||
last_tokens.erase(last_tokens.begin());
|
last_tokens.erase(last_tokens.begin());
|
||||||
last_tokens.push_back(id);
|
last_tokens.push_back(id);
|
||||||
|
|
|
@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("main", "log"));
|
log_set_target(log_filename_generator("main", "log"));
|
||||||
|
@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (params.cfg_scale > 1.f) {
|
if (sparams.cfg_scale > 1.f) {
|
||||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||||
}
|
}
|
||||||
|
@ -257,9 +258,9 @@ int main(int argc, char ** argv) {
|
||||||
int guidance_offset = 0;
|
int guidance_offset = 0;
|
||||||
int original_prompt_len = 0;
|
int original_prompt_len = 0;
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
@ -296,6 +297,9 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// remove any "future" tokens that we might have inherited from the previous session
|
||||||
|
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGLN(
|
LOGLN(
|
||||||
|
@ -343,7 +347,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||||
|
@ -395,7 +399,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
|
@ -413,8 +417,8 @@ int main(int argc, char ** argv) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -469,6 +473,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(model);
|
const int n_vocab = llama_n_vocab(model);
|
||||||
|
|
||||||
|
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
|
||||||
|
@ -543,9 +548,6 @@ int main(int argc, char ** argv) {
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
embd.erase(embd.begin(), embd.begin() + i);
|
embd.erase(embd.begin(), embd.begin() + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove any "future" tokens that we might have inherited from the session from the KV cache
|
|
||||||
llama_kv_cache_tokens_rm(ctx, n_past, -1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate tokens in batches
|
// evaluate tokens in batches
|
||||||
|
@ -625,7 +627,7 @@ int main(int argc, char ** argv) {
|
||||||
LOG("saved session to %s\n", path_session.c_str());
|
LOG("saved session to %s\n", path_session.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
|
||||||
|
|
||||||
last_tokens.erase(last_tokens.begin());
|
last_tokens.erase(last_tokens.begin());
|
||||||
last_tokens.push_back(id);
|
last_tokens.push_back(id);
|
||||||
|
|
|
@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
|
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
|
||||||
|
|
||||||
// load the prompts from an external file if there are any
|
// load the prompts from an external file if there are any
|
||||||
if (params.prompt.empty()) {
|
if (params.prompt.empty()) {
|
||||||
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||||
|
@ -339,7 +341,7 @@ int main(int argc, char ** argv) {
|
||||||
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
||||||
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
||||||
|
|
||||||
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
|
const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
|
||||||
|
|
||||||
if (client.n_decoded == 1) {
|
if (client.n_decoded == 1) {
|
||||||
// start measuring generation time after the first token to make sure all concurrent clients
|
// start measuring generation time after the first token to make sure all concurrent clients
|
||||||
|
@ -384,7 +386,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
n_total_prompt += client.n_prompt;
|
n_total_prompt += client.n_prompt;
|
||||||
n_total_gen += client.n_decoded;
|
n_total_gen += client.n_decoded;
|
||||||
|
llama_sampling_context_reset(ctx_sampling, client.seq_id);
|
||||||
client.seq_id = -1;
|
client.seq_id = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,10 @@
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
llama_sampling_params & sparams = params.sampling_params;
|
||||||
params.seed = 42;
|
params.seed = 42;
|
||||||
params.n_threads = 4;
|
params.n_threads = 4;
|
||||||
params.repeat_last_n = 64;
|
sparams.repeat_last_n = 64;
|
||||||
params.prompt = "The quick brown fox";
|
params.prompt = "The quick brown fox";
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params)) {
|
||||||
|
@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
auto n_past = 0;
|
auto n_past = 0;
|
||||||
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
|
||||||
|
|
||||||
// init
|
// init
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -136,6 +136,11 @@
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fieldset label.slim {
|
||||||
|
margin: 0 0.5em;
|
||||||
|
display: inline;
|
||||||
|
}
|
||||||
|
|
||||||
header, footer {
|
header, footer {
|
||||||
text-align: center;
|
text-align: center;
|
||||||
}
|
}
|
||||||
|
@ -145,6 +150,14 @@
|
||||||
color: #888;
|
color: #888;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.mode-chat textarea[name=prompt] {
|
||||||
|
height: 4.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.mode-completion textarea[name=prompt] {
|
||||||
|
height: 10em;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@keyframes loading-bg-wipe {
|
@keyframes loading-bg-wipe {
|
||||||
0% {
|
0% {
|
||||||
|
@ -187,7 +200,7 @@
|
||||||
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
||||||
historyTemplate: "{{name}}: {{message}}",
|
historyTemplate: "{{name}}: {{message}}",
|
||||||
transcript: [],
|
transcript: [],
|
||||||
type: "chat",
|
type: "chat", // "chat" | "completion"
|
||||||
char: "Llama",
|
char: "Llama",
|
||||||
user: "User",
|
user: "User",
|
||||||
})
|
})
|
||||||
|
@ -365,13 +378,44 @@
|
||||||
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function runLlama(prompt, llamaParams, char) {
|
||||||
|
const currentMessages = [];
|
||||||
|
const history = session.value.transcript;
|
||||||
|
if (controller.value) {
|
||||||
|
throw new Error("already running");
|
||||||
|
}
|
||||||
|
controller.value = new AbortController();
|
||||||
|
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
|
||||||
|
const data = chunk.data;
|
||||||
|
|
||||||
|
if (data.stop) {
|
||||||
|
while (
|
||||||
|
currentMessages.length > 0 &&
|
||||||
|
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||||
|
) {
|
||||||
|
currentMessages.pop();
|
||||||
|
}
|
||||||
|
transcriptUpdate([...history, [char, currentMessages]])
|
||||||
|
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||||
|
} else {
|
||||||
|
currentMessages.push(data);
|
||||||
|
transcriptUpdate([...history, [char, currentMessages]])
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.timings) {
|
||||||
|
llamaStats.value = data.timings;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
controller.value = null;
|
||||||
|
}
|
||||||
|
|
||||||
// send message to server
|
// send message to server
|
||||||
const chat = async (msg) => {
|
const chat = async (msg) => {
|
||||||
if (controller.value) {
|
if (controller.value) {
|
||||||
console.log('already running...');
|
console.log('already running...');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
controller.value = new AbortController();
|
|
||||||
|
|
||||||
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
||||||
|
|
||||||
|
@ -391,55 +435,41 @@
|
||||||
).join("\n"),
|
).join("\n"),
|
||||||
});
|
});
|
||||||
|
|
||||||
const currentMessages = [];
|
await runLlama(prompt, {
|
||||||
const history = session.value.transcript
|
|
||||||
|
|
||||||
const llamaParams = {
|
|
||||||
...params.value,
|
...params.value,
|
||||||
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
||||||
|
}, "{{char}}");
|
||||||
|
}
|
||||||
|
|
||||||
|
const runCompletion = async () => {
|
||||||
|
if (controller.value) {
|
||||||
|
console.log('already running...');
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
const {prompt} = session.value;
|
||||||
|
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
||||||
|
await runLlama(prompt, {
|
||||||
|
...params.value,
|
||||||
|
stop: [],
|
||||||
|
}, "");
|
||||||
|
}
|
||||||
|
|
||||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
const stop = (e) => {
|
||||||
const data = chunk.data;
|
e.preventDefault();
|
||||||
|
if (controller.value) {
|
||||||
if (data.stop) {
|
controller.value.abort();
|
||||||
while (
|
controller.value = null;
|
||||||
currentMessages.length > 0 &&
|
|
||||||
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
|
||||||
) {
|
|
||||||
currentMessages.pop();
|
|
||||||
}
|
|
||||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
|
||||||
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
|
||||||
} else {
|
|
||||||
currentMessages.push(data);
|
|
||||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
|
||||||
}
|
|
||||||
|
|
||||||
if (data.timings) {
|
|
||||||
llamaStats.value = data.timings;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
controller.value = null;
|
const reset = (e) => {
|
||||||
|
stop(e);
|
||||||
|
transcriptUpdate([]);
|
||||||
}
|
}
|
||||||
|
|
||||||
function MessageInput() {
|
function MessageInput() {
|
||||||
const message = useSignal("")
|
const message = useSignal("")
|
||||||
|
|
||||||
const stop = (e) => {
|
|
||||||
e.preventDefault();
|
|
||||||
if (controller.value) {
|
|
||||||
controller.value.abort();
|
|
||||||
controller.value = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const reset = (e) => {
|
|
||||||
stop(e);
|
|
||||||
transcriptUpdate([]);
|
|
||||||
}
|
|
||||||
|
|
||||||
const submit = (e) => {
|
const submit = (e) => {
|
||||||
stop(e);
|
stop(e);
|
||||||
chat(message.value);
|
chat(message.value);
|
||||||
|
@ -474,6 +504,19 @@
|
||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function CompletionControls() {
|
||||||
|
const submit = (e) => {
|
||||||
|
stop(e);
|
||||||
|
runCompletion();
|
||||||
|
}
|
||||||
|
return html`
|
||||||
|
<div>
|
||||||
|
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
|
||||||
|
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||||
|
<button onclick=${reset}>Reset</button>
|
||||||
|
</div>`;
|
||||||
|
}
|
||||||
|
|
||||||
const ChatLog = (props) => {
|
const ChatLog = (props) => {
|
||||||
const messages = session.value.transcript;
|
const messages = session.value.transcript;
|
||||||
const container = useRef(null)
|
const container = useRef(null)
|
||||||
|
@ -497,7 +540,11 @@
|
||||||
data;
|
data;
|
||||||
message = html`<${Markdownish} text=${template(text)} />`
|
message = html`<${Markdownish} text=${template(text)} />`
|
||||||
}
|
}
|
||||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
if(user) {
|
||||||
|
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||||
|
} else {
|
||||||
|
return html`<p key=${index}>${message}</p>`
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
|
@ -574,18 +621,31 @@
|
||||||
userTemplateAutosave()
|
userTemplateAutosave()
|
||||||
}, [session.value, params.value])
|
}, [session.value, params.value])
|
||||||
|
|
||||||
return html`
|
const GrammarControl = () => (
|
||||||
<form>
|
html`
|
||||||
<fieldset>
|
<div>
|
||||||
<${UserTemplateResetButton}/>
|
<label for="template">Grammar</label>
|
||||||
</fieldset>
|
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||||
|
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||||
|
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||||
|
</div>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
<fieldset>
|
const PromptControlFieldSet = () => (
|
||||||
<div>
|
html`
|
||||||
<label for="prompt">Prompt</label>
|
<fieldset>
|
||||||
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
|
<div>
|
||||||
</div>
|
<label htmlFor="prompt">Prompt</label>
|
||||||
</fieldset>
|
<textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
|
||||||
|
</div>
|
||||||
|
</fieldset>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
|
const ChatConfigForm = () => (
|
||||||
|
html`
|
||||||
|
${PromptControlFieldSet()}
|
||||||
|
|
||||||
<fieldset class="two">
|
<fieldset class="two">
|
||||||
<div>
|
<div>
|
||||||
|
@ -609,15 +669,30 @@
|
||||||
<label for="template">Chat history template</label>
|
<label for="template">Chat history template</label>
|
||||||
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
||||||
</div>
|
</div>
|
||||||
|
${GrammarControl()}
|
||||||
|
</fieldset>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
|
const CompletionConfigForm = () => (
|
||||||
|
html`
|
||||||
|
${PromptControlFieldSet()}
|
||||||
|
<fieldset>${GrammarControl()}</fieldset>
|
||||||
|
`
|
||||||
|
);
|
||||||
|
|
||||||
|
return html`
|
||||||
|
<form>
|
||||||
|
<fieldset class="two">
|
||||||
|
<${UserTemplateResetButton}/>
|
||||||
<div>
|
<div>
|
||||||
<label for="template">Grammar</label>
|
<label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
|
||||||
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
<label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
|
||||||
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
|
||||||
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
|
||||||
</div>
|
</div>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
|
||||||
|
|
||||||
<fieldset class="two">
|
<fieldset class="two">
|
||||||
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
||||||
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
||||||
|
@ -851,7 +926,7 @@
|
||||||
function App(props) {
|
function App(props) {
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
<div>
|
<div class="mode-${session.value.type}">
|
||||||
<header>
|
<header>
|
||||||
<h1>llama.cpp</h1>
|
<h1>llama.cpp</h1>
|
||||||
</header>
|
</header>
|
||||||
|
@ -861,7 +936,7 @@
|
||||||
</main>
|
</main>
|
||||||
|
|
||||||
<section id="write">
|
<section id="write">
|
||||||
<${MessageInput} />
|
<${session.value.type === 'chat' ? MessageInput : CompletionControls} />
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<footer>
|
<footer>
|
||||||
|
|
|
@ -200,6 +200,7 @@ struct llama_server_context
|
||||||
llama_model *model = nullptr;
|
llama_model *model = nullptr;
|
||||||
llama_context *ctx = nullptr;
|
llama_context *ctx = nullptr;
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
llama_sampling_context ctx_sampling;
|
||||||
int n_ctx;
|
int n_ctx;
|
||||||
|
|
||||||
grammar_parser::parse_state parsed_grammar;
|
grammar_parser::parse_state parsed_grammar;
|
||||||
|
@ -254,6 +255,7 @@ struct llama_server_context
|
||||||
if (grammar != nullptr) {
|
if (grammar != nullptr) {
|
||||||
llama_grammar_free(grammar);
|
llama_grammar_free(grammar);
|
||||||
grammar = nullptr;
|
grammar = nullptr;
|
||||||
|
ctx_sampling = llama_sampling_context_init(params, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -329,8 +331,8 @@ struct llama_server_context
|
||||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||||
|
|
||||||
{
|
{
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
|
||||||
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -339,6 +341,7 @@ struct llama_server_context
|
||||||
grammar = llama_grammar_init(
|
grammar = llama_grammar_init(
|
||||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||||
}
|
}
|
||||||
|
ctx_sampling = llama_sampling_context_init(params, grammar);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -402,6 +405,7 @@ struct llama_server_context
|
||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
n_past = common_part(embd, prompt_tokens);
|
n_past = common_part(embd, prompt_tokens);
|
||||||
embd = prompt_tokens;
|
embd = prompt_tokens;
|
||||||
|
|
||||||
if (n_past == num_prompt_tokens)
|
if (n_past == num_prompt_tokens)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
|
@ -409,6 +413,9 @@ struct llama_server_context
|
||||||
n_past--;
|
n_past--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// since #3228 we now have to manually manage the KV cache
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", n_past},
|
{"n_past", n_past},
|
||||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||||
|
@ -458,9 +465,6 @@ struct llama_server_context
|
||||||
// compare the evaluated prompt with the new prompt
|
// compare the evaluated prompt with the new prompt
|
||||||
n_past = common_part(embd, prompt_tokens);
|
n_past = common_part(embd, prompt_tokens);
|
||||||
|
|
||||||
// since #3228 we now have to manually manage the KV cache
|
|
||||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
||||||
|
|
||||||
embd = prompt_tokens;
|
embd = prompt_tokens;
|
||||||
if (n_past == num_prompt_tokens)
|
if (n_past == num_prompt_tokens)
|
||||||
{
|
{
|
||||||
|
@ -468,6 +472,9 @@ struct llama_server_context
|
||||||
n_past--;
|
n_past--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// since #3228 we now have to manually manage the KV cache
|
||||||
|
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||||
|
|
||||||
LOG_VERBOSE("prompt ingested", {
|
LOG_VERBOSE("prompt ingested", {
|
||||||
{"n_past", n_past},
|
{"n_past", n_past},
|
||||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||||
|
@ -550,12 +557,12 @@ struct llama_server_context
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(llama_n_vocab(model));
|
candidates.reserve(llama_n_vocab(model));
|
||||||
|
|
||||||
result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
|
result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
|
||||||
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||||
|
|
||||||
const int32_t n_probs = params.n_probs;
|
const int32_t n_probs = params.sampling_params.n_probs;
|
||||||
if (params.temp <= 0 && n_probs > 0)
|
if (params.sampling_params.temp <= 0 && n_probs > 0)
|
||||||
{
|
{
|
||||||
// For llama_sample_token_greedy we need to sort candidates
|
// For llama_sample_token_greedy we need to sort candidates
|
||||||
llama_sample_softmax(ctx, &candidates_p);
|
llama_sample_softmax(ctx, &candidates_p);
|
||||||
|
@ -630,7 +637,7 @@ struct llama_server_context
|
||||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
||||||
generated_text += token_text;
|
generated_text += token_text;
|
||||||
|
|
||||||
if (params.n_probs > 0)
|
if (params.sampling_params.n_probs > 0)
|
||||||
{
|
{
|
||||||
generated_token_probs.push_back(token_with_probs);
|
generated_token_probs.push_back(token_with_probs);
|
||||||
}
|
}
|
||||||
|
@ -711,15 +718,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf("usage: %s [options]\n", argv0);
|
printf("usage: %s [options]\n", argv0);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("options:\n");
|
printf("options:\n");
|
||||||
printf(" -h, --help show this help message and exit\n");
|
printf(" -h, --help show this help message and exit\n");
|
||||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||||
|
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||||
if (llama_mlock_supported())
|
if (llama_mlock_supported())
|
||||||
{
|
{
|
||||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||||
|
@ -864,6 +872,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
}
|
}
|
||||||
params.n_threads = std::stoi(argv[i]);
|
params.n_threads = std::stoi(argv[i]);
|
||||||
}
|
}
|
||||||
|
else if (arg == "--threads-batch" || arg == "-tb")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.n_threads_batch = std::stoi(argv[i]);
|
||||||
|
}
|
||||||
else if (arg == "-b" || arg == "--batch-size")
|
else if (arg == "-b" || arg == "--batch-size")
|
||||||
{
|
{
|
||||||
if (++i >= argc)
|
if (++i >= argc)
|
||||||
|
@ -1018,34 +1035,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
|
|
||||||
static json format_generation_settings(llama_server_context &llama)
|
static json format_generation_settings(llama_server_context &llama)
|
||||||
{
|
{
|
||||||
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
|
const auto & sparams = llama.params.sampling_params;
|
||||||
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
|
const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
|
||||||
|
const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
|
||||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||||
|
|
||||||
return json{
|
return json{
|
||||||
{"n_ctx", llama.n_ctx},
|
{"n_ctx", llama.n_ctx},
|
||||||
{"model", llama.params.model_alias},
|
{"model", llama.params.model_alias},
|
||||||
{"seed", llama.params.seed},
|
{"seed", llama.params.seed},
|
||||||
{"temp", llama.params.temp},
|
{"temp", sparams.temp},
|
||||||
{"top_k", llama.params.top_k},
|
{"top_k", sparams.top_k},
|
||||||
{"top_p", llama.params.top_p},
|
{"top_p", sparams.top_p},
|
||||||
{"tfs_z", llama.params.tfs_z},
|
{"tfs_z", sparams.tfs_z},
|
||||||
{"typical_p", llama.params.typical_p},
|
{"typical_p", sparams.typical_p},
|
||||||
{"repeat_last_n", llama.params.repeat_last_n},
|
{"repeat_last_n", sparams.repeat_last_n},
|
||||||
{"repeat_penalty", llama.params.repeat_penalty},
|
{"repeat_penalty", sparams.repeat_penalty},
|
||||||
{"presence_penalty", llama.params.presence_penalty},
|
{"presence_penalty", sparams.presence_penalty},
|
||||||
{"frequency_penalty", llama.params.frequency_penalty},
|
{"frequency_penalty", sparams.frequency_penalty},
|
||||||
{"mirostat", llama.params.mirostat},
|
{"mirostat", sparams.mirostat},
|
||||||
{"mirostat_tau", llama.params.mirostat_tau},
|
{"mirostat_tau", sparams.mirostat_tau},
|
||||||
{"mirostat_eta", llama.params.mirostat_eta},
|
{"mirostat_eta", sparams.mirostat_eta},
|
||||||
{"penalize_nl", llama.params.penalize_nl},
|
{"penalize_nl", sparams.penalize_nl},
|
||||||
{"stop", llama.params.antiprompt},
|
{"stop", llama.params.antiprompt},
|
||||||
{"n_predict", llama.params.n_predict},
|
{"n_predict", llama.params.n_predict},
|
||||||
{"n_keep", llama.params.n_keep},
|
{"n_keep", llama.params.n_keep},
|
||||||
{"ignore_eos", ignore_eos},
|
{"ignore_eos", ignore_eos},
|
||||||
{"stream", llama.stream},
|
{"stream", llama.stream},
|
||||||
{"logit_bias", llama.params.logit_bias},
|
{"logit_bias", sparams.logit_bias},
|
||||||
{"n_probs", llama.params.n_probs},
|
{"n_probs", sparams.n_probs},
|
||||||
{"grammar", llama.params.grammar},
|
{"grammar", llama.params.grammar},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -1094,7 +1112,7 @@ static json format_final_response(llama_server_context &llama, const std::string
|
||||||
{"timings", format_timings(llama)},
|
{"timings", format_timings(llama)},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (llama.params.n_probs > 0)
|
if (llama.params.sampling_params.n_probs > 0)
|
||||||
{
|
{
|
||||||
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
||||||
}
|
}
|
||||||
|
@ -1110,7 +1128,7 @@ static json format_partial_response(
|
||||||
{"stop", false},
|
{"stop", false},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (llama.params.n_probs > 0)
|
if (llama.params.sampling_params.n_probs > 0)
|
||||||
{
|
{
|
||||||
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
||||||
}
|
}
|
||||||
|
@ -1142,26 +1160,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||||
static void parse_options_completion(const json &body, llama_server_context &llama)
|
static void parse_options_completion(const json &body, llama_server_context &llama)
|
||||||
{
|
{
|
||||||
gpt_params default_params;
|
gpt_params default_params;
|
||||||
|
const auto & default_sparams = default_params.sampling_params;
|
||||||
|
auto & sparams = llama.params.sampling_params;
|
||||||
|
|
||||||
llama.stream = json_value(body, "stream", false);
|
llama.stream = json_value(body, "stream", false);
|
||||||
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
||||||
llama.params.top_k = json_value(body, "top_k", default_params.top_k);
|
sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
|
||||||
llama.params.top_p = json_value(body, "top_p", default_params.top_p);
|
sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
|
||||||
llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
|
sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||||
llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
|
sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
|
||||||
llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
|
sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
|
||||||
llama.params.temp = json_value(body, "temperature", default_params.temp);
|
sparams.temp = json_value(body, "temperature", default_sparams.temp);
|
||||||
llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
|
sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
|
||||||
llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
|
sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
|
||||||
llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
|
sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
|
||||||
llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
|
sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
|
||||||
llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
|
sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
||||||
llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
|
sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
||||||
llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
|
sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
||||||
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
||||||
llama.params.seed = json_value(body, "seed", default_params.seed);
|
llama.params.seed = json_value(body, "seed", default_params.seed);
|
||||||
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
||||||
llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
|
sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
|
||||||
|
|
||||||
if (body.count("prompt") != 0)
|
if (body.count("prompt") != 0)
|
||||||
{
|
{
|
||||||
|
@ -1172,10 +1192,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
||||||
llama.prompt = "";
|
llama.prompt = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
llama.params.logit_bias.clear();
|
sparams.logit_bias.clear();
|
||||||
if (json_value(body, "ignore_eos", false))
|
if (json_value(body, "ignore_eos", false))
|
||||||
{
|
{
|
||||||
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto &logit_bias = body.find("logit_bias");
|
const auto &logit_bias = body.find("logit_bias");
|
||||||
|
@ -1191,11 +1211,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
||||||
{
|
{
|
||||||
if (el[1].is_number())
|
if (el[1].is_number())
|
||||||
{
|
{
|
||||||
llama.params.logit_bias[tok] = el[1].get<float>();
|
sparams.logit_bias[tok] = el[1].get<float>();
|
||||||
}
|
}
|
||||||
else if (el[1].is_boolean() && !el[1].get<bool>())
|
else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||||
{
|
{
|
||||||
llama.params.logit_bias[tok] = -INFINITY;
|
sparams.logit_bias[tok] = -INFINITY;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1215,6 +1235,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
|
||||||
|
|
||||||
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1423,7 +1445,7 @@ int main(int argc, char **argv)
|
||||||
}
|
}
|
||||||
|
|
||||||
auto probs = llama.generated_token_probs;
|
auto probs = llama.generated_token_probs;
|
||||||
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
|
||||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
||||||
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
||||||
}
|
}
|
||||||
|
@ -1475,7 +1497,7 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
std::vector<completion_token_output> probs_output = {};
|
std::vector<completion_token_output> probs_output = {};
|
||||||
|
|
||||||
if (llama.params.n_probs > 0) {
|
if (llama.params.sampling_params.n_probs > 0) {
|
||||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||||
|
@ -1596,7 +1618,7 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
std::vector<completion_token_output> probs_output = {};
|
std::vector<completion_token_output> probs_output = {};
|
||||||
|
|
||||||
if (llama.params.n_probs > 0) {
|
if (llama.params.sampling_params.n_probs > 0) {
|
||||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||||
|
|
|
@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
|
||||||
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
|
||||||
|
|
||||||
const auto t_dec_start = ggml_time_us();
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
|
||||||
|
|
||||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||||
last_tokens.erase(last_tokens.begin());
|
last_tokens.erase(last_tokens.begin());
|
||||||
|
@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
|
||||||
if (grammar_dft) {
|
if (grammar_dft) {
|
||||||
llama_grammar_free(grammar_dft);
|
llama_grammar_free(grammar_dft);
|
||||||
}
|
}
|
||||||
grammar_dft = llama_grammar_copy(grammar_tgt);
|
// Note: Hardcoded to sequence id 0, if this ever supports parallel generation
|
||||||
|
// that will need to change.
|
||||||
|
auto it = ctx_sampling.sequence_contexts.find(0);
|
||||||
|
GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
|
||||||
|
// This is necessary because each sequence id in sequence_contexts
|
||||||
|
// uses a copy of the original grammar.
|
||||||
|
grammar_dft = llama_grammar_copy(it->second.grammar);
|
||||||
|
|
||||||
LOG("copied target grammar to draft grammar\n");
|
LOG("copied target grammar to draft grammar\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,29 +88,31 @@ class MODEL_ARCH(IntEnum):
|
||||||
PERSIMMON : int = auto()
|
PERSIMMON : int = auto()
|
||||||
REFACT : int = auto()
|
REFACT : int = auto()
|
||||||
BERT : int = auto()
|
BERT : int = auto()
|
||||||
|
BLOOM : int = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
TOKEN_EMBD : int = auto()
|
TOKEN_EMBD : int = auto()
|
||||||
TOKEN_TYPES : int = auto()
|
TOKEN_EMBD_NORM : int = auto()
|
||||||
POS_EMBD : int = auto()
|
TOKEN_TYPES : int = auto()
|
||||||
OUTPUT : int = auto()
|
POS_EMBD : int = auto()
|
||||||
OUTPUT_NORM : int = auto()
|
OUTPUT : int = auto()
|
||||||
ROPE_FREQS : int = auto()
|
OUTPUT_NORM : int = auto()
|
||||||
ATTN_Q : int = auto()
|
ROPE_FREQS : int = auto()
|
||||||
ATTN_K : int = auto()
|
ATTN_Q : int = auto()
|
||||||
ATTN_V : int = auto()
|
ATTN_K : int = auto()
|
||||||
ATTN_QKV : int = auto()
|
ATTN_V : int = auto()
|
||||||
ATTN_OUT : int = auto()
|
ATTN_QKV : int = auto()
|
||||||
ATTN_NORM : int = auto()
|
ATTN_OUT : int = auto()
|
||||||
ATTN_NORM_2 : int = auto()
|
ATTN_NORM : int = auto()
|
||||||
ATTN_ROT_EMBD: int = auto()
|
ATTN_NORM_2 : int = auto()
|
||||||
FFN_GATE : int = auto()
|
ATTN_ROT_EMBD : int = auto()
|
||||||
FFN_DOWN : int = auto()
|
FFN_GATE : int = auto()
|
||||||
FFN_UP : int = auto()
|
FFN_DOWN : int = auto()
|
||||||
FFN_NORM : int = auto()
|
FFN_UP : int = auto()
|
||||||
ATTN_Q_NORM : int = auto()
|
FFN_NORM : int = auto()
|
||||||
ATTN_K_NORM : int = auto()
|
ATTN_Q_NORM : int = auto()
|
||||||
|
ATTN_K_NORM : int = auto()
|
||||||
|
|
||||||
|
|
||||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
|
@ -125,29 +127,31 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.PERSIMMON: "persimmon",
|
MODEL_ARCH.PERSIMMON: "persimmon",
|
||||||
MODEL_ARCH.REFACT: "refact",
|
MODEL_ARCH.REFACT: "refact",
|
||||||
MODEL_ARCH.BERT: "bert",
|
MODEL_ARCH.BERT: "bert",
|
||||||
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||||
MODEL_TENSOR.OUTPUT: "output",
|
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
MODEL_TENSOR.OUTPUT: "output",
|
||||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||||
|
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||||
}
|
}
|
||||||
|
|
||||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
|
@ -282,6 +286,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.BLOOM: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
],
|
||||||
MODEL_ARCH.GPT2: [
|
MODEL_ARCH.GPT2: [
|
||||||
# TODO
|
# TODO
|
||||||
],
|
],
|
||||||
|
@ -311,6 +327,7 @@ class TensorNameMap:
|
||||||
"gpt_neox.embed_in", # gptneox
|
"gpt_neox.embed_in", # gptneox
|
||||||
"transformer.wte", # gpt2 gpt-j mpt refact
|
"transformer.wte", # gpt2 gpt-j mpt refact
|
||||||
"transformer.word_embeddings", # falcon
|
"transformer.word_embeddings", # falcon
|
||||||
|
"word_embeddings", # bloom
|
||||||
"model.embed_tokens", # llama-hf
|
"model.embed_tokens", # llama-hf
|
||||||
"tok_embeddings", # llama-pth
|
"tok_embeddings", # llama-pth
|
||||||
"embeddings.word_embeddings", # bert
|
"embeddings.word_embeddings", # bert
|
||||||
|
@ -322,6 +339,11 @@ class TensorNameMap:
|
||||||
"embeddings.token_type_embeddings", # bert
|
"embeddings.token_type_embeddings", # bert
|
||||||
),
|
),
|
||||||
|
|
||||||
|
# Normalization of token embeddings
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||||
|
"word_embeddings_layernorm", # bloom
|
||||||
|
),
|
||||||
|
|
||||||
# Position embeddings
|
# Position embeddings
|
||||||
MODEL_TENSOR.POS_EMBD: (
|
MODEL_TENSOR.POS_EMBD: (
|
||||||
"transformer.wpe", # gpt2
|
"transformer.wpe", # gpt2
|
||||||
|
@ -332,7 +354,7 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.OUTPUT: (
|
MODEL_TENSOR.OUTPUT: (
|
||||||
"embed_out", # gptneox
|
"embed_out", # gptneox
|
||||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
||||||
"output", # llama-pth
|
"output", # llama-pth bloom
|
||||||
"word_embeddings_for_head", # persimmon
|
"word_embeddings_for_head", # persimmon
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -344,7 +366,7 @@ class TensorNameMap:
|
||||||
"norm", # llama-pth
|
"norm", # llama-pth
|
||||||
"embeddings.LayerNorm", # bert
|
"embeddings.LayerNorm", # bert
|
||||||
"transformer.norm_f", # mpt
|
"transformer.norm_f", # mpt
|
||||||
"ln_f", # refact
|
"ln_f", # refact bloom
|
||||||
"language_model.encoder.final_layernorm", # persimmon
|
"language_model.encoder.final_layernorm", # persimmon
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -361,6 +383,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
||||||
"transformer.blocks.{bid}.norm_1", # mpt
|
"transformer.blocks.{bid}.norm_1", # mpt
|
||||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||||
|
"h.{bid}.input_layernorm", # bloom
|
||||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||||
"layers.{bid}.attention_norm", # llama-pth
|
"layers.{bid}.attention_norm", # llama-pth
|
||||||
|
@ -379,6 +402,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||||
|
"h.{bid}.self_attention.query_key_value", # bloom
|
||||||
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -412,6 +436,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
||||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||||
|
"h.{bid}.self_attention.dense", # bloom
|
||||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||||
"layers.{bid}.attention.wo", # llama-pth
|
"layers.{bid}.attention.wo", # llama-pth
|
||||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||||
|
@ -429,6 +454,7 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.FFN_NORM: (
|
MODEL_TENSOR.FFN_NORM: (
|
||||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||||
"transformer.h.{bid}.ln_2", # gpt2 refact
|
"transformer.h.{bid}.ln_2", # gpt2 refact
|
||||||
|
"h.{bid}.post_attention_layernorm", # bloom
|
||||||
"transformer.blocks.{bid}.norm_2", # mpt
|
"transformer.blocks.{bid}.norm_2", # mpt
|
||||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||||
"layers.{bid}.ffn_norm", # llama-pth
|
"layers.{bid}.ffn_norm", # llama-pth
|
||||||
|
@ -442,6 +468,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||||
|
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
||||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
||||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||||
|
@ -461,6 +488,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
||||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||||
|
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
||||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||||
"encoder.layer.{bid}.output.dense", # bert
|
"encoder.layer.{bid}.output.dense", # bert
|
||||||
|
|
392
llama.cpp
392
llama.cpp
|
@ -189,6 +189,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_STARCODER,
|
LLM_ARCH_STARCODER,
|
||||||
LLM_ARCH_PERSIMMON,
|
LLM_ARCH_PERSIMMON,
|
||||||
LLM_ARCH_REFACT,
|
LLM_ARCH_REFACT,
|
||||||
|
LLM_ARCH_BLOOM,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -202,7 +203,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
||||||
{ LLM_ARCH_STARCODER, "starcoder" },
|
{ LLM_ARCH_STARCODER, "starcoder" },
|
||||||
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
||||||
{ LLM_ARCH_REFACT, "refact" },
|
{ LLM_ARCH_REFACT, "refact" },
|
||||||
|
{ LLM_ARCH_BLOOM, "bloom" },
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_kv {
|
enum llm_kv {
|
||||||
|
@ -305,6 +307,7 @@ struct LLM_KV {
|
||||||
|
|
||||||
enum llm_tensor {
|
enum llm_tensor {
|
||||||
LLM_TENSOR_TOKEN_EMBD,
|
LLM_TENSOR_TOKEN_EMBD,
|
||||||
|
LLM_TENSOR_TOKEN_EMBD_NORM,
|
||||||
LLM_TENSOR_POS_EMBD,
|
LLM_TENSOR_POS_EMBD,
|
||||||
LLM_TENSOR_OUTPUT,
|
LLM_TENSOR_OUTPUT,
|
||||||
LLM_TENSOR_OUTPUT_NORM,
|
LLM_TENSOR_OUTPUT_NORM,
|
||||||
|
@ -467,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_BLOOM,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
{
|
{
|
||||||
|
@ -1212,6 +1230,8 @@ struct llama_model {
|
||||||
|
|
||||||
struct ggml_tensor * tok_embeddings;
|
struct ggml_tensor * tok_embeddings;
|
||||||
struct ggml_tensor * pos_embeddings;
|
struct ggml_tensor * pos_embeddings;
|
||||||
|
struct ggml_tensor * tok_norm;
|
||||||
|
struct ggml_tensor * tok_norm_b;
|
||||||
|
|
||||||
struct ggml_tensor * output_norm;
|
struct ggml_tensor * output_norm;
|
||||||
struct ggml_tensor * output_norm_b;
|
struct ggml_tensor * output_norm_b;
|
||||||
|
@ -2062,13 +2082,13 @@ static void llm_load_hparams(
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_PERSIMMON:
|
case LLM_ARCH_PERSIMMON:
|
||||||
{
|
{
|
||||||
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 36: model.type = e_model::MODEL_8B; break;
|
case 36: model.type = e_model::MODEL_8B; break;
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_REFACT:
|
case LLM_ARCH_REFACT:
|
||||||
{
|
{
|
||||||
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
||||||
|
@ -2077,6 +2097,19 @@ static void llm_load_hparams(
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
default: model.type = e_model::MODEL_UNKNOWN;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BLOOM:
|
||||||
|
{
|
||||||
|
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
||||||
|
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
case 24: model.type = e_model::MODEL_1B; break;
|
||||||
|
case 30:
|
||||||
|
switch (hparams.n_embd) {
|
||||||
|
case 2560: model.type = e_model::MODEL_3B; break;
|
||||||
|
case 4096: model.type = e_model::MODEL_7B; break;
|
||||||
|
} break;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MPT:
|
case LLM_ARCH_MPT:
|
||||||
{
|
{
|
||||||
hparams.f_clamp_kqv = 0.0f;
|
hparams.f_clamp_kqv = 0.0f;
|
||||||
|
@ -2682,6 +2715,88 @@ static void llm_load_tensors(
|
||||||
layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
|
layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BLOOM:
|
||||||
|
{
|
||||||
|
// TODO: CPU-only for now
|
||||||
|
|
||||||
|
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||||
|
model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
|
||||||
|
model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
|
||||||
|
|
||||||
|
// output
|
||||||
|
{
|
||||||
|
ggml_backend_type backend_norm;
|
||||||
|
ggml_backend_type backend_output;
|
||||||
|
|
||||||
|
if (n_gpu_layers > int(n_layer)) {
|
||||||
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||||
|
// on Windows however this is detrimental unless everything is on the GPU
|
||||||
|
#ifndef _WIN32
|
||||||
|
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
||||||
|
#else
|
||||||
|
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||||
|
#endif // _WIN32
|
||||||
|
|
||||||
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||||
|
} else {
|
||||||
|
backend_norm = GGML_BACKEND_CPU;
|
||||||
|
backend_output = GGML_BACKEND_CPU;
|
||||||
|
}
|
||||||
|
|
||||||
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||||
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||||
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||||
|
|
||||||
|
if (backend_norm == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights += ggml_nbytes(model.output_norm);
|
||||||
|
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||||
|
}
|
||||||
|
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||||
|
vram_weights += ggml_nbytes(model.output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint32_t n_ff = hparams.n_ff;
|
||||||
|
|
||||||
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
|
||||||
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
|
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||||
|
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||||
|
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||||
|
|
||||||
|
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||||
|
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||||
|
|
||||||
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||||
|
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||||
|
|
||||||
|
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||||
|
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||||
|
|
||||||
|
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||||
|
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||||
|
|
||||||
|
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||||
|
|
||||||
|
if (backend == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights +=
|
||||||
|
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||||
|
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
||||||
|
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
||||||
|
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
|
||||||
|
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
|
||||||
|
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MPT:
|
case LLM_ARCH_MPT:
|
||||||
{
|
{
|
||||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||||
|
@ -5002,6 +5117,248 @@ static struct ggml_cgraph * llm_build_persimmon(
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct ggml_cgraph * llm_build_bloom(
|
||||||
|
llama_context & lctx,
|
||||||
|
const llama_batch & batch) {
|
||||||
|
const auto & model = lctx.model;
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
const auto & cparams = lctx.cparams;
|
||||||
|
|
||||||
|
const auto & kv_self = lctx.kv_self;
|
||||||
|
|
||||||
|
GGML_ASSERT(!!kv_self.ctx);
|
||||||
|
|
||||||
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
const int64_t n_layer = hparams.n_layer;
|
||||||
|
const int64_t n_ctx = cparams.n_ctx;
|
||||||
|
const int64_t n_head = hparams.n_head;
|
||||||
|
const int64_t n_head_kv = hparams.n_head_kv;
|
||||||
|
const int64_t n_embd_head = hparams.n_embd_head();
|
||||||
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||||
|
|
||||||
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
|
const float norm_eps = hparams.f_norm_eps;
|
||||||
|
|
||||||
|
const int32_t n_tokens = batch.n_tokens;
|
||||||
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||||
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||||
|
|
||||||
|
auto & buf_compute = lctx.buf_compute;
|
||||||
|
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
/*.mem_size =*/ buf_compute.size,
|
||||||
|
/*.mem_buffer =*/ buf_compute.data,
|
||||||
|
/*.no_alloc =*/ false,
|
||||||
|
};
|
||||||
|
|
||||||
|
params.no_alloc = true;
|
||||||
|
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
|
||||||
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * token;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
if (batch.token) {
|
||||||
|
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||||
|
|
||||||
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
||||||
|
}
|
||||||
|
ggml_set_name(inp_tokens, "inp_tokens");
|
||||||
|
|
||||||
|
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||||
|
} else {
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
GGML_ASSERT(false && "not implemented");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
||||||
|
|
||||||
|
ggml_allocr_alloc(lctx.alloc, token);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// KQ_scale
|
||||||
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||||
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||||
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
}
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||||
|
ggml_set_name(KQ_mask, "KQ_mask");
|
||||||
|
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
float * data = (float *) KQ_mask->data;
|
||||||
|
memset(data, 0, ggml_nbytes(KQ_mask));
|
||||||
|
|
||||||
|
for (int h = 0; h < 1; ++h) {
|
||||||
|
for (int j = 0; j < n_tokens; ++j) {
|
||||||
|
const llama_pos pos = batch.pos[j];
|
||||||
|
const llama_seq_id seq_id = batch.seq_id[j];
|
||||||
|
|
||||||
|
for (int i = 0; i < n_kv; ++i) {
|
||||||
|
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
||||||
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// norm
|
||||||
|
{
|
||||||
|
inpL = ggml_norm(ctx0, token, norm_eps);
|
||||||
|
inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_set_name(inpL, "inpL");
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
{
|
||||||
|
// Norm
|
||||||
|
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Self Attention
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
||||||
|
|
||||||
|
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||||
|
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
||||||
|
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
||||||
|
|
||||||
|
struct ggml_tensor * Qcur = tmpq;
|
||||||
|
struct ggml_tensor * Kcur = tmpk;
|
||||||
|
|
||||||
|
// store key and value to memory
|
||||||
|
{
|
||||||
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
||||||
|
ggml_set_name(Vcur, "Vcur");
|
||||||
|
|
||||||
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
||||||
|
ggml_set_name(k, "k");
|
||||||
|
|
||||||
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
||||||
|
( n_ctx)*ggml_element_size(kv_self.v),
|
||||||
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
||||||
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * Q =
|
||||||
|
ggml_permute(ctx0,
|
||||||
|
ggml_cpy(ctx0,
|
||||||
|
Qcur,
|
||||||
|
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
||||||
|
0, 2, 1, 3);
|
||||||
|
ggml_set_name(Q, "Q");
|
||||||
|
|
||||||
|
struct ggml_tensor * K =
|
||||||
|
ggml_view_3d(ctx0, kv_self.k,
|
||||||
|
n_embd_head, n_kv, n_head_kv,
|
||||||
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||||
|
ggml_element_size(kv_self.k)*n_embd_head,
|
||||||
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
||||||
|
ggml_set_name(K, "K");
|
||||||
|
|
||||||
|
// K * Q
|
||||||
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
ggml_set_name(KQ, "KQ");
|
||||||
|
|
||||||
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
||||||
|
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
||||||
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||||
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
||||||
|
|
||||||
|
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
|
||||||
|
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
||||||
|
|
||||||
|
// KQ_masked = mask_past(KQ_scaled)
|
||||||
|
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
||||||
|
ggml_set_name(KQ_masked, "KQ_masked");
|
||||||
|
|
||||||
|
// KQ = soft_max(KQ_masked)
|
||||||
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||||
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
||||||
|
|
||||||
|
// split cached V into n_head heads
|
||||||
|
struct ggml_tensor * V =
|
||||||
|
ggml_view_3d(ctx0, kv_self.v,
|
||||||
|
n_kv, n_embd_head, n_head_kv,
|
||||||
|
ggml_element_size(kv_self.v)*n_ctx,
|
||||||
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
||||||
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
||||||
|
ggml_set_name(V, "V");
|
||||||
|
|
||||||
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||||
|
ggml_set_name(KQV, "KQV");
|
||||||
|
|
||||||
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||||
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
ggml_set_name(KQV_merged, "KQV_merged");
|
||||||
|
|
||||||
|
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
||||||
|
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
||||||
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Projection
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
||||||
|
|
||||||
|
// Add the input
|
||||||
|
cur = ggml_add(ctx0, cur, inpL);
|
||||||
|
|
||||||
|
struct ggml_tensor * inpFF = cur;
|
||||||
|
|
||||||
|
// FF
|
||||||
|
{
|
||||||
|
// Norm
|
||||||
|
{
|
||||||
|
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
||||||
|
|
||||||
|
// GELU activation
|
||||||
|
cur = ggml_gelu(ctx0, cur);
|
||||||
|
|
||||||
|
// Projection
|
||||||
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
||||||
|
}
|
||||||
|
|
||||||
|
inpL = ggml_add(ctx0, cur, inpFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output Norm
|
||||||
|
{
|
||||||
|
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||||
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
||||||
|
}
|
||||||
|
ggml_set_name(cur, "result_norm");
|
||||||
|
|
||||||
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||||
|
ggml_set_name(cur, "result_output");
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
ggml_free(ctx0);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
static struct ggml_cgraph * llm_build_mpt(
|
static struct ggml_cgraph * llm_build_mpt(
|
||||||
llama_context & lctx,
|
llama_context & lctx,
|
||||||
const llama_batch & batch) {
|
const llama_batch & batch) {
|
||||||
|
@ -5031,9 +5388,6 @@ static struct ggml_cgraph * llm_build_mpt(
|
||||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||||
|
|
||||||
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
|
|
||||||
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
|
|
||||||
|
|
||||||
auto & buf_compute = lctx.buf_compute;
|
auto & buf_compute = lctx.buf_compute;
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
|
@ -5354,6 +5708,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm_build_refact(lctx, batch);
|
result = llm_build_refact(lctx, batch);
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_BLOOM:
|
||||||
|
{
|
||||||
|
result = llm_build_bloom(lctx, batch);
|
||||||
|
} break;
|
||||||
case LLM_ARCH_MPT:
|
case LLM_ARCH_MPT:
|
||||||
{
|
{
|
||||||
result = llm_build_mpt(lctx, batch);
|
result = llm_build_mpt(lctx, batch);
|
||||||
|
@ -5990,7 +6348,6 @@ private:
|
||||||
for (int i = 0; i < (int)text_utf.size(); i++) {
|
for (int i = 0; i < (int)text_utf.size(); i++) {
|
||||||
const std::string & utf_char = text_utf[i];
|
const std::string & utf_char = text_utf[i];
|
||||||
bool split_condition = false;
|
bool split_condition = false;
|
||||||
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
|
|
||||||
int bytes_remain = text_utf.size() - i;
|
int bytes_remain = text_utf.size() - i;
|
||||||
// forward backward lookups
|
// forward backward lookups
|
||||||
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
|
||||||
|
@ -6016,9 +6373,9 @@ private:
|
||||||
if (!split_condition && bytes_remain >= 3) {
|
if (!split_condition && bytes_remain >= 3) {
|
||||||
// 're|'ve|'ll
|
// 're|'ve|'ll
|
||||||
if (utf_char == "\'" && (
|
if (utf_char == "\'" && (
|
||||||
(utf_char_next == "r" || utf_char_next_next == "e") ||
|
(utf_char_next == "r" && utf_char_next_next == "e") ||
|
||||||
(utf_char_next == "v" || utf_char_next_next == "e") ||
|
(utf_char_next == "v" && utf_char_next_next == "e") ||
|
||||||
(utf_char_next == "l" || utf_char_next_next == "l"))
|
(utf_char_next == "l" && utf_char_next_next == "l"))
|
||||||
) {
|
) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
|
@ -6069,7 +6426,7 @@ private:
|
||||||
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
|
else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
|
||||||
split_condition = true;
|
split_condition = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7588,8 +7945,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
const std::string name = ggml_get_name(meta);
|
const std::string name = ggml_get_name(meta);
|
||||||
|
|
||||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||||
if (name.find("attn_v.weight") != std::string::npos ||
|
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
||||||
name.find("attn_qkv.weight") != std::string::npos) {
|
|
||||||
++n_attention_wv;
|
++n_attention_wv;
|
||||||
}
|
}
|
||||||
else if (name.find("ffn_down.weight") != std::string::npos) {
|
else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||||
|
|
93
prompts/mnemonics.txt
Normal file
93
prompts/mnemonics.txt
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
For each kanji character, write a Markdown‐formatted mnemonic that uses its keyword and the keyword of all its components.
|
||||||
|
|
||||||
|
Kanji: 欠 (lack of)
|
||||||
|
Components: 𠂊 (hook claw), 人 (person)
|
||||||
|
Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
|
||||||
|
|
||||||
|
Kanji: 類 (kind (of something))
|
||||||
|
Components: 米 (rice), 大 (large), 頁 (page)
|
||||||
|
Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
|
||||||
|
|
||||||
|
Kanji: 燃 (burn)
|
||||||
|
Components: 火 (fire), 然 (sort of thing)
|
||||||
|
Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
|
||||||
|
|
||||||
|
Kanji: 頂 (top of)
|
||||||
|
Components: 丁 (street), 頁 (page)
|
||||||
|
Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
|
||||||
|
|
||||||
|
Kanji: 険 (risky and steep)
|
||||||
|
Components: 阝 (small village), 㑒 (consensus)
|
||||||
|
Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
|
||||||
|
|
||||||
|
Kanji: 困 (distressed)
|
||||||
|
Components: 囗 (closed box), 木 (tree)
|
||||||
|
Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
|
||||||
|
|
||||||
|
Kanji: 頭 (head)
|
||||||
|
Components: 豆 (bean), 頁 (page)
|
||||||
|
Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
|
||||||
|
|
||||||
|
Kanji: 確 (certain)
|
||||||
|
Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
|
||||||
|
Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from a <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn’t really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
|
||||||
|
|
||||||
|
Kanji: 魚 (fish)
|
||||||
|
Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
|
||||||
|
Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
|
||||||
|
|
||||||
|
Kanji: 警 (to police (something))
|
||||||
|
Components: 敬 (respect), 言 (say)
|
||||||
|
Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
|
||||||
|
|
||||||
|
Kanji: 筆 (writing brush)
|
||||||
|
Components: 竹 (bamboo), 聿 (brush)
|
||||||
|
Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
|
||||||
|
|
||||||
|
Kanji: 獄 (prison)
|
||||||
|
Components: 犭 (animal), 言 (say), 犬 (dog)
|
||||||
|
Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It’s a **dog**‐eat‐dog world.
|
||||||
|
|
||||||
|
Kanji: 新 (new)
|
||||||
|
Components: 立 (standing up), 木 (tree), 斤 (axe)
|
||||||
|
Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
|
||||||
|
|
||||||
|
Kanji: 怪 (suspicious)
|
||||||
|
Components: 忄 (weak heart), 圣 (sacred)
|
||||||
|
Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
|
||||||
|
|
||||||
|
Kanji: 温 (warm (to the touch))
|
||||||
|
Components: 氵 (water drops), 日 (sun), 皿 (dish)
|
||||||
|
Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
|
||||||
|
|
||||||
|
Kanji: 階 (floor (of a building))
|
||||||
|
Components: 阝 (small village), 皆 (all)
|
||||||
|
Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It’s a village of skyscrapers!
|
||||||
|
|
||||||
|
Kanji: 多 (many)
|
||||||
|
Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
|
||||||
|
Mnemonic: Two **evenings** in a day would be one too ***many***.
|
||||||
|
|
||||||
|
Kanji: 別 (separate)
|
||||||
|
Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
|
||||||
|
Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**‐to‐anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
|
||||||
|
|
||||||
|
Kanji: 並 (line up)
|
||||||
|
Components: 䒑 (antlers on a wall), 业 (runway)
|
||||||
|
Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
|
||||||
|
|
||||||
|
Kanji: 姿 (figure)
|
||||||
|
Components: 次 (next), 女 (woman)
|
||||||
|
Mnemonic: The **next** **woman** that I date will have a perfect **figure**. Because I’m done with 3D women—it will *literally* be an anime figure!
|
||||||
|
|
||||||
|
Kanji: 実 (real)
|
||||||
|
Components: 宀 (roof with a chimney), 𡗗 (three people)
|
||||||
|
Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
|
||||||
|
|
||||||
|
Kanji: 謝 (apologize)
|
||||||
|
Components: 言 (say), 射 (shoot)
|
||||||
|
Mnemonic: **Shot** first, ***apologize*** (**say** you are sorry) later.
|
||||||
|
|
||||||
|
Kanji: 提 (propose)
|
||||||
|
Components: 扌 (left hand), 是 (go with)
|
||||||
|
Mnemonic:
|
|
@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||||
{ " Hello" , { 258, 23090, }, },
|
{ " Hello" , { 258, 23090, }, },
|
||||||
{ " Hello" , { 466, 23090, }, },
|
{ " Hello" , { 466, 23090, }, },
|
||||||
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
|
||||||
|
{ "\n =" , { 1212, 40, }, },
|
||||||
|
{ "' era" , { 18, 4932, }, },
|
||||||
};
|
};
|
||||||
|
|
||||||
return _k_tests;
|
return _k_tests;
|
||||||
|
@ -155,7 +157,7 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||||
|
|
||||||
const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
|
const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||||
|
|
||||||
|
@ -169,10 +171,8 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & tok : res) {
|
for (const auto & tok : res) {
|
||||||
ofs << tok << " ";
|
ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ofs << "\n";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
|
|
@ -41,6 +41,8 @@ tests = [
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello",
|
" Hello",
|
||||||
" Hello\n Hello",
|
" Hello\n Hello",
|
||||||
|
"\n =",
|
||||||
|
"' era",
|
||||||
]
|
]
|
||||||
|
|
||||||
for text in tests:
|
for text in tests:
|
||||||
|
@ -69,15 +71,14 @@ fname_tok = args.fname_tok
|
||||||
if fname_tok:
|
if fname_tok:
|
||||||
print('tokenizing file: ', fname_tok)
|
print('tokenizing file: ', fname_tok)
|
||||||
fname_out = fname_tok + '.tok'
|
fname_out = fname_tok + '.tok'
|
||||||
with open(fname_tok, 'r') as f:
|
with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
s = ''.join(lines)
|
s = ''.join(lines)
|
||||||
res = tokenizer.encode(s)
|
res = tokenizer.encode(s)
|
||||||
# write to file
|
# write to file
|
||||||
with open(fname_out, 'w') as f:
|
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||||
for x in res:
|
for x in res:
|
||||||
f.write(str(x) + ' ')
|
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||||
f.write('\n')
|
|
||||||
print('len(res): ', len(res))
|
print('len(res): ', len(res))
|
||||||
print('len(lines): ', len(lines))
|
print('len(lines): ', len(lines))
|
||||||
print('results written to: ', fname_out)
|
print('results written to: ', fname_out)
|
||||||
|
|
|
@ -174,10 +174,8 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & tok : res) {
|
for (const auto & tok : res) {
|
||||||
ofs << tok << " ";
|
ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ofs << "\n";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
|
|
@ -81,15 +81,14 @@ fname_tok = args.fname_tok
|
||||||
if fname_tok:
|
if fname_tok:
|
||||||
print('tokenizing file: ', fname_tok)
|
print('tokenizing file: ', fname_tok)
|
||||||
fname_out = fname_tok + '.tok'
|
fname_out = fname_tok + '.tok'
|
||||||
with open(fname_tok, 'r') as f:
|
with open(fname_tok, 'r', encoding='utf-8') as f:
|
||||||
lines = f.readlines()
|
lines = f.readlines()
|
||||||
s = ''.join(lines)
|
s = ''.join(lines)
|
||||||
res = tokenizer.encode(s, add_bos=True)
|
res = tokenizer.encode(s, add_bos=True)
|
||||||
# write to file
|
# write to file
|
||||||
with open(fname_out, 'w') as f:
|
with open(fname_out, 'w', encoding='utf-8') as f:
|
||||||
for x in res:
|
for x in res:
|
||||||
f.write(str(x) + ' ')
|
f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
|
||||||
f.write('\n')
|
|
||||||
print('len(res): ', len(res))
|
print('len(res): ', len(res))
|
||||||
print('len(lines): ', len(lines))
|
print('len(lines): ', len(lines))
|
||||||
print('results written to: ', fname_out)
|
print('results written to: ', fname_out)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue