Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	Makefile
#	README.md
#	build.zig
This commit is contained in:
Concedo 2023-10-12 16:32:00 +08:00
commit a6c3dbc351
34 changed files with 4181 additions and 2456 deletions

1
.gitignore vendored

@@ -45,6 +45,7 @@ models-mnt
 /server
 /simple
 /batched
+/batched-bench
 /export-lora
 /finetune
 /speculative

Package.swift

@@ -1,10 +1,10 @@
-// swift-tools-version:5.3
+// swift-tools-version:5.5
 import PackageDescription
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.v11),
+    .macOS(.v12),
     .iOS(.v14),
     .watchOS(.v4),
     .tvOS(.v14)
@@ -41,12 +41,13 @@ let package = Package(
         "ggml.c",
         "llama.cpp",
         "ggml-alloc.c",
+        "ggml-backend.c",
         "k_quants.c",
     ] + additionalSources,
     resources: resources,
     publicHeadersPath: "spm-headers",
     cSettings: [
-        .unsafeFlags(["-Wno-shorten-64-to-32"]),
+        .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
         .define("GGML_USE_K_QUANTS"),
         .define("GGML_USE_ACCELERATE")
         // NOTE: NEW_LAPACK will required iOS version 16.4+

common/CMakeLists.txt

@@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    sampling.h
+    sampling.cpp
     console.h
     console.cpp
     grammar-parser.h

common/common.cpp

@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sampling_params;

     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(argv[i]);
+            sparams.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(argv[i]);
+            sparams.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(argv[i]);
+            sparams.temp = std::stof(argv[i]);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.tfs_z = std::stof(argv[i]);
+            sparams.tfs_z = std::stof(argv[i]);
         } else if (arg == "--typical") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.typical_p = std::stof(argv[i]);
+            sparams.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat-last-n") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(argv[i]);
+            sparams.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "--frequency-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.frequency_penalty = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
         } else if (arg == "--presence-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.presence_penalty = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat = std::stoi(argv[i]);
+            sparams.mirostat = std::stoi(argv[i]);
         } else if (arg == "--mirostat-lr") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_eta = std::stof(argv[i]);
+            sparams.mirostat_eta = std::stof(argv[i]);
         } else if (arg == "--mirostat-ent") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_tau = std::stof(argv[i]);
+            sparams.mirostat_tau = std::stof(argv[i]);
         } else if (arg == "--cfg-negative-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_negative_prompt = argv[i];
+            sparams.cfg_negative_prompt = argv[i];
         } else if (arg == "--cfg-negative-prompt-file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
-                params.cfg_negative_prompt.pop_back();
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+                sparams.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_scale = std::stof(argv[i]);
+            sparams.cfg_scale = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -512,7 +513,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
+            sparams.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::string value_str;
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                 } else {
                     throw std::exception();
                 }
@@ -627,6 +628,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    const llama_sampling_params & sparams = params.sampling_params;
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -659,19 +662,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
     printf(" --mirostat N use Mirostat sampling.\n");
     printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
     printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
     printf(" modifies the likelihood of token appearing in the completion,\n");
     printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -682,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" negative prompt to use for guidance. (default: empty)\n");
     printf(" --cfg-negative-prompt-file FNAME\n");
     printf(" negative prompt file to use for guidance. (default: empty)\n");
-    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
     printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -690,7 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --no-penalize-nl do not penalize newline token\n");
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
+    printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
     printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
     printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -840,7 +843,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }

     if (params.ignore_eos) {
-        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }

     {
@@ -932,127 +935,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }

-//
-// Sampling utils
-//
-
-llama_token llama_sample_token(
-    struct llama_context * ctx,
-    struct llama_context * ctx_guidance,
-    struct llama_grammar * grammar,
-    const struct gpt_params & params,
-    const std::vector<llama_token> & last_tokens,
-    std::vector<llama_token_data> & candidates,
-    int idx) {
-    const int n_ctx = llama_n_ctx(ctx);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    const float temp = params.temp;
-    const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float top_p = params.top_p;
-    const float tfs_z = params.tfs_z;
-    const float typical_p = params.typical_p;
-    const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-    const float repeat_penalty = params.repeat_penalty;
-    const float alpha_presence = params.presence_penalty;
-    const float alpha_frequency = params.frequency_penalty;
-    const int mirostat = params.mirostat;
-    const float mirostat_tau = params.mirostat_tau;
-    const float mirostat_eta = params.mirostat_eta;
-    const bool penalize_nl = params.penalize_nl;
-
-    llama_token id = 0;
-
-    float * logits = llama_get_logits_ith(ctx, idx);
-
-    // Apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    candidates.clear();
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
-
-    if (ctx_guidance) {
-        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
-    }
-
-    // apply penalties
-    if (!last_tokens.empty()) {
-        const float nl_logit = logits[llama_token_nl(ctx)];
-        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
-
-        llama_sample_repetition_penalty(ctx, &cur_p,
-            last_tokens.data() + last_tokens.size() - last_n_repeat,
-            last_n_repeat, repeat_penalty);
-        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
-            last_tokens.data() + last_tokens.size() - last_n_repeat,
-            last_n_repeat, alpha_frequency, alpha_presence);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    if (grammar != NULL) {
-        llama_sample_grammar(ctx, &cur_p, grammar);
-    }
-
-    if (temp <= 0) {
-        // Greedy sampling
-        id = llama_sample_token_greedy(ctx, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-        } else if (mirostat == 2) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-        } else {
-            // Temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
-            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
-            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
-            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
-            llama_sample_temp(ctx, &cur_p, temp);
-
-            {
-                const int n_top = 10;
-                LOG("top %d candidates:\n", n_top);
-
-                for (int i = 0; i < n_top; i++) {
-                    const llama_token id = cur_p.data[i].id;
-                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
-                }
-            }
-
-            id = llama_sample_token(ctx, &cur_p);
-
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
-        }
-    }
-    // printf("`%d`", candidates_p.size);
-
-    if (grammar != NULL) {
-        llama_grammar_accept_token(ctx, grammar, id);
-    }
-
-    return id;
-}
-
 //
 // YAML utils
 //
@@ -1204,6 +1086,8 @@ std::string get_sortable_timestamp() {
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sampling_params;
     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1250,21 +1134,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
+    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);

-    const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
-    const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
+    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
     fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");

     dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1277,7 +1161,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

     fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : params.logit_bias) {
+    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
         if (ignore_eos && lb.first == logit_bias_eos->first) {
             continue;
         }
@@ -1301,30 +1185,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
-    fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
     dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
     fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
     fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);

     fprintf(stream, "reverse_prompt:\n");
     for (std::string ap : params.antiprompt) {
@@ -1342,15 +1226,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);

-    fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
+    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }

common/common.h

@@ -4,6 +4,8 @@
 #include "llama.h"

+#include "sampling.h"
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
@@ -49,31 +51,12 @@
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor

-    // sampling parameters
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typical_p = 1.00f; // 1.0 = disabled
-    float temp = 0.80f; // 1.0 = disabled
-    float repeat_penalty = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float frequency_penalty = 0.00f; // 0.0 = disabled
-    float presence_penalty = 0.00f; // 0.0 = disabled
-    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float cfg_scale = 1.f; // How strong is guidance
+    // // sampling parameters
+    struct llama_sampling_params sampling_params;

     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -115,7 +98,6 @@ struct gpt_params {
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos = false; // ignore generated EOS tokens
     bool instruct = false; // instruction mode (used for Alpaca models)
-    bool penalize_nl = true; // consider newlines as a repeatable token
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
@@ -180,36 +162,6 @@ std::string llama_detokenize_bpe(
     llama_context * ctx,
     const std::vector<llama_token> & tokens);

-//
-// Sampling utils
-//
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-//
-// required:
-//  - ctx: context to use for sampling
-//  - params: sampling parameters
-//
-// optional:
-//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
-//  - grammar: grammar to use for sampling, ignore if NULL
-//  - last_tokens: needed for repetition penalty, ignore if empty
-//  - idx: sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token: sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sample_token(
-    struct llama_context * ctx,
-    struct llama_context * ctx_guidance,
-    struct llama_grammar * grammar,
-    const struct gpt_params & params,
-    const std::vector<llama_token> & last_tokens,
-    std::vector<llama_token_data> & candidates,
-    int idx = 0);
-
 //
 // YAML utils
 //
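
The net effect of the two `common` changes above: every sampling-related knob now lives in `llama_sampling_params` (declared in the new `common/sampling.h` below) and is reached through `params.sampling_params`. A minimal illustrative sketch, not code from this commit:

```cpp
// Illustrative only: reading sampling settings through the new nested struct.
#include <cstdio>

#include "common.h"

static void print_sampling_summary(const gpt_params & params) {
    // previously these fields sat directly on gpt_params (params.top_k, params.temp, ...)
    const llama_sampling_params & sparams = params.sampling_params;

    printf("top_k=%d top_p=%.2f temp=%.2f mirostat=%d\n",
           sparams.top_k, (double) sparams.top_p, (double) sparams.temp, sparams.mirostat);
}
```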

166
common/sampling.cpp Normal file

@@ -0,0 +1,166 @@
#include "sampling.h"
llama_sampling_context::~llama_sampling_context() {
for (auto & it : sequence_contexts) {
if (it.second.grammar != NULL) {
llama_grammar_free(it.second.grammar);
it.second.grammar = NULL;
}
}
}
llama_sampling_context llama_sampling_context_init(
const struct gpt_params & params,
llama_grammar * grammar) {
llama_sampling_context result;
result.params = params.sampling_params;
result.grammar = grammar;
return result;
}
// Note: Creates the context if it doesn't exist, so this always returns something.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq) {
const auto it = ctx_sampling.sequence_contexts.find(seq);
if (it != ctx_sampling.sequence_contexts.end()) {
return it->second;
}
llama_sampler_sequence_context new_ctx = {
2.0f * ctx_sampling.params.mirostat_tau,
ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
};
return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
}
bool llama_sampling_context_reset(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq) {
const auto it = ctx_sampling.sequence_contexts.find(seq);
if (it == ctx_sampling.sequence_contexts.end()) return false;
if (it->second.grammar != NULL) {
llama_grammar_free(it->second.grammar);
it->second.grammar = NULL;
}
ctx_sampling.sequence_contexts.erase(it);
return true;
}
llama_token llama_sampling_sample(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_sampling_context & ctx_sampling,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
const int idx,
llama_seq_id seq) {
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_sampling_params & params = ctx_sampling.params;
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
float * logits = llama_get_logits_ith(ctx, idx);
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// apply penalties
if (!last_tokens.empty()) {
const float nl_logit = logits[llama_token_nl(ctx)];
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
if (ctx_seq.grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
}
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
} else {
// Temperature sampling
size_t min_keep = std::max(1, params.n_probs);
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
llama_sample_temp(ctx, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
(void)id; // To avoid a warning that id is unused when logging is disabled.
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
id = llama_sample_token(ctx, &cur_p);
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
if (ctx_seq.grammar != NULL) {
llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
}
return id;
}
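
For orientation, a rough usage sketch of the helpers defined above (hypothetical caller code; model setup, prompt evaluation and the `llama_decode` calls between samples are elided):

```cpp
#include "common.h"
#include "sampling.h"

// Hypothetical single-sequence generation loop built on llama_sampling_context.
static void sample_n_tokens(llama_context * ctx, const gpt_params & params, int n_tokens) {
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params);

    std::vector<llama_token>      last_tokens; // history consumed by the repetition penalties
    std::vector<llama_token_data> candidates;  // scratch buffer reused across calls

    for (int i = 0; i < n_tokens; ++i) {
        // defaults: no guidance context, logits row 0, sequence id 0
        const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, last_tokens, candidates);
        last_tokens.push_back(id);
        // ... feed `id` back through llama_decode() before sampling the next token ...
    }
}
```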

108
common/sampling.h Normal file

@@ -0,0 +1,108 @@
#pragma once
#include "llama.h"
#include <string>
#include <vector>
#include <unordered_map>
// sampling parameters
typedef struct llama_sampling_params {
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = true; // consider newlines as a repeatable token
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
} llama_sampling_params;
// per-sequence sampler context
typedef struct llama_sampler_sequence_context {
float mirostat_mu; // mirostat sampler state
llama_grammar * grammar;
} llama_sampler_sequence_context;
// general sampler context
typedef struct llama_sampling_context {
~llama_sampling_context();
// parameters that will be used for sampling and when creating
// new llama_sampler_sequence_context instances
llama_sampling_params params;
// map of sequence ids to sampler contexts
std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
// when non-NULL, new instances of llama_sampler_sequence_context
// will get a copy of the grammar here
// note: only the pointer is stored here, it is not a copy of
// the grammar and shouldn't be freed
llama_grammar * grammar;
} llama_sampling_context;
#include "common.h"
// Create a new sampling context instance.
llama_sampling_context llama_sampling_context_init(
const struct gpt_params & params,
llama_grammar * grammar = NULL);
// Fetches the sampler context for the specified sequence id (defaults to 0).
// If the context for that sequence id doesn't already exist, it will be created with
// default values based on the parameters in the ctx_sampling argument.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq = 0);
// Reset the sampler context for the supplied sequence id (defaults to 0).
// This is necessary to reuse a sequence id or free memory used by sequences
// that are no longer required.
bool llama_sampling_context_reset(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq = 0);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
// llama_sampling_context_reset when a sequence ends
//
// required:
// - ctx: context to use for sampling
// - ctx_sampling: sampling-specific context
//
// optional:
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits_ith(ctx, idx)
// - seq: sequence id to associate sampler state with
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sampling_sample(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_sampling_context & ctx_sampling,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
const int idx = 0,
llama_seq_id seq = 0);
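
A hedged sketch of the multi-sequence pattern the comments above describe (hypothetical caller code): each sequence id lazily gets its own mirostat state and grammar copy, and `llama_sampling_context_reset` releases that state once a sequence is finished so the id can be reused:

```cpp
#include "common.h"
#include "sampling.h"

// Hypothetical: sample one token for each of n_parallel sequences in a batch.
static void sample_parallel(llama_context * ctx, llama_sampling_context & ctx_sampling,
                            const std::vector<llama_token> & last_tokens,
                            std::vector<llama_token_data> & candidates, int n_parallel) {
    for (llama_seq_id seq = 0; seq < n_parallel; ++seq) {
        // here the logits index is assumed to match the sequence id
        const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, last_tokens, candidates, seq, seq);
        (void) id; // would normally be appended to this sequence and decoded

        // when a sequence hits EOS (not shown), release its sampler state:
        // llama_sampling_context_reset(ctx_sampling, seq);
    }
}
```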

238
convert-bloom-hf-to-gguf.py Executable file

@@ -0,0 +1,238 @@
#!/usr/bin/env python3
# HF bloom --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
# Supported Models:
# https://huggingface.co/bigscience/bloom-1b7
# https://huggingface.co/bigscience/bloom-3b
# https://huggingface.co/bigscience/bloom-7b1
# https://huggingface.co/Langboat/bloom-1b4-zh
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "BloomForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.BLOOM
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("Bloom")
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
gguf_writer.add_embedding_length(n_embed)
gguf_writer.add_feed_forward_length(4 * n_embed)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
# params for qkv transform
n_head_kv = hparams.get("n_head_kv", n_head)
head_dim = n_embed // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
has_lm_head = True
if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
has_lm_head = False
for original_name in model_part.keys():
data = model_part[original_name]
name = re.sub(r'transformer\.', '', original_name)
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
data = np.concatenate(
(qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
axis=0
)
print("re-format attention.linear_qkv.weight")
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
data = np.concatenate(
(qkv_bias[:, 0, :].reshape((n_embed,)),
qkv_bias[:, 1, :].reshape((n_embed,)),
qkv_bias[:, 2, :].reshape((n_embed,))),
axis=0
)
print("re-format attention.linear_qkv.bias")
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
if not has_lm_head and name == "word_embeddings.weight":
gguf_writer.add_tensor("output.weight", data)
print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

examples/CMakeLists.txt

@@ -25,6 +25,7 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(batched)
+    add_subdirectory(batched-bench)
     add_subdirectory(speculative)
     add_subdirectory(parallel)
     add_subdirectory(embd-input)

examples/batched-bench/CMakeLists.txt

@@ -0,0 +1,5 @@
set(TARGET batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/batched-bench/README.md

@@ -0,0 +1,51 @@
# llama.cpp/example/batched-bench
Benchmark the batched decoding performance of `llama.cpp`
## Usage
There are 2 modes of operation:
- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```
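
The `N_KV_MAX` argument has to cover the largest combination being measured; the sizing rule quoted above is the same check `batched-bench.cpp` performs before each run (a small C++ sketch for illustration, not part of the tool's interface):

```cpp
// KV cache cells needed for one (PP, TG, B) combination; combinations that
// exceed N_KV_MAX are skipped by the benchmark rather than reported.
static int required_kv_cells(bool is_pp_shared, int pp, int tg, int b) {
    return is_pp_shared ? pp + b*tg      // one shared prompt plus B generated tails
                        : b*(pp + tg);   // B independent prompts plus their tails
}
```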
## Sample results
- `PP` - prompt tokens per batch
- `TG` - generated tokens per batch
- `B` - number of batches
- `N_KV` - required KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
- `T` - total time
- `S` - total speed (i.e. all tokens / total time)
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
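
As a consistency check on the definitions above, take the `PP=128, TG=128, B=2` row: `N_KV = 2*(128+128) = 512`, `S_PP = (2*128)/0.198 ≈ 1293 t/s`, `S_TG = (2*128)/5.029 ≈ 50.9 t/s`, and `S = 512/5.227 ≈ 97.95 t/s`; the small gaps versus the table come from the times being rounded to three decimals.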

examples/batched-bench/batched-bench.cpp

@@ -0,0 +1,251 @@
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
// mutates the input string
static std::vector<int> parse_list(char * p) {
std::vector<int> ret;
char * q = p;
while (*p) {
if (*p == ',') {
*p = '\0';
ret.push_back(std::atoi(q));
q = p + 1;
}
++p;
}
ret.push_back(std::atoi(q));
return ret;
}
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
return 1 ;
}
int n_kv_max = 2048;
int is_pp_shared = 0;
int n_gpu_layers = 0;
int mmq = 0;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}
if (argc >= 4) {
is_pp_shared = std::atoi(argv[3]);
}
if (argc >= 5) {
n_gpu_layers = std::atoi(argv[4]);
}
if (argc >= 6) {
mmq = std::atoi(argv[5]);
}
if (argc >= 7) {
n_pp = parse_list(argv[6]);
}
if (argc >= 8) {
n_tg = parse_list(argv[7]);
}
if (argc >= 9) {
n_pl = parse_list(argv[8]);
}
// init LLM
llama_backend_init(params.numa);
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = n_gpu_layers;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
return 1;
}
llama_batch batch = llama_batch_init(n_kv_max, 0);
// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
}
return true;
};
// warm up
{
batch.n_tokens = 16;
for (int i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = 0;
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
}
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
for (int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
for (int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
const int pp = n_pp[i_pp];
const int tg = n_tg[i_tg];
const int pl = n_pl[i_pl];
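// KV cache requirement: with a shared prompt its PP tokens are stored only once and
// each of the PL sequences adds TG generated tokens; otherwise every sequence also
// keeps its own copy of the prompt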
const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
if (n_ctx_req > n_kv_max) {
continue;
}
batch.n_tokens = is_pp_shared ? pp : pl*pp;
for (int i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = 0;
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
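// during prompt processing only the last token needs logits; that is where generation starts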
batch.logits[batch.n_tokens - 1] = true;
const auto t_pp_start = ggml_time_us();
llama_kv_cache_tokens_rm(ctx, -1, -1);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
}
}
const auto t_pp_end = ggml_time_us();
const auto t_tg_start = ggml_time_us();
for (int i = 0; i < tg; ++i) {
batch.n_tokens = pl;
for (int j = 0; j < pl; ++j) {
batch.token[j] = 0;
batch.pos[j] = pp + i;
batch.seq_id[j] = j;
batch.logits[j] = true;
}
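// each of the pl sequences received exactly one token at position pp + i above,
// so a single decode pass advances all sequences by one step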
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
}
const auto t_tg_end = ggml_time_us();
const int32_t n_kv = n_ctx_req;
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
const float t = t_pp + t_tg;
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
const float speed_tg = pl*tg / t_tg;
const float speed = n_kv / t;
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}

9
examples/batched.swift/.gitignore vendored Normal file
View file

@ -0,0 +1,9 @@
.DS_Store
/.build
/Packages
xcuserdata/
DerivedData/
.swiftpm/configuration/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
batched_swift

View file

@ -0,0 +1,6 @@
.PHONY: build
build:
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./batched_swift
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift

View file

@ -0,0 +1,22 @@
// swift-tools-version: 5.5
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
let package = Package(
name: "batched_swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "batched_swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
),
]
)

View file

@ -0,0 +1,4 @@
This is a Swift clone of `examples/batched`.
$ `make`
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`

View file

@ -0,0 +1,255 @@
import Foundation
import llama
let arguments = CommandLine.arguments
// Check that we have at least one argument (the model path)
guard arguments.count > 1 else {
print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
exit(1)
}
let modelPath: String = arguments[1]
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
// total length of the sequences including the prompt
let n_len: Int = 32
// init LLM
llama_backend_init(false)
defer {
llama_backend_free()
}
let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model")
exit(1)
}
defer {
llama_free_model(model)
}
var tokens = tokenize(text: prompt, add_bos: true)
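// KV cache requirement: the prompt is shared between all sequences, so it is stored
// only once; each of the n_parallel sequences then adds its own generated tokens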
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
let context = llama_new_context_with_model(model, context_params)
guard context != nil else {
print("Failed to initialize context")
exit(1)
}
defer {
llama_free(context)
}
let n_ctx = llama_n_ctx(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}
var buffer: [CChar] = []
for id: llama_token in tokens {
print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
}
print("\n")
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
defer {
llama_batch_free(batch)
}
// evaluate the initial prompt
batch.n_tokens = Int32(tokens.count)
for (i, token) in tokens.enumerated() {
batch.token[i] = token
batch.pos[i] = Int32(i)
batch.seq_id[i] = 0
batch.logits[i] = 0
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[Int(batch.n_tokens) - 1] = 1
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
exit(1)
}
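// copy the KV cache entries of the prompt (sequence 0) to all other sequences,
// so the shared prompt only has to be decoded once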
for i in 1 ..< n_parallel {
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {
print("generating \(n_parallel) sequences ...\n")
}
var streams: [String] = .init(repeating: "", count: n_parallel)
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
var n_cur = batch.n_tokens
var n_decode = 0
let t_main_start = ggml_time_us()
while n_cur <= n_len {
// prepare the next batch
batch.n_tokens = 0
// sample the next token for each parallel sequence / stream
for i in 0 ..< n_parallel {
if i_batch[i] < 0 {
// the stream has already finished
continue
}
var n_vocab = llama_n_vocab(model)
var logits = llama_get_logits_ith(context, i_batch[i])
var candidates: [llama_token_data] = []
candidates.reserveCapacity(Int(n_vocab))
for token_id in 0 ..< n_vocab {
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
}
var candidates_p: llama_token_data_array = .init(
data: &candidates,
size: candidates.count,
sorted: false
)
let top_k: Int32 = 40
let top_p: Float = 0.9
let temp: Float = 0.4
llama_sample_top_k(context, &candidates_p, top_k, 1)
llama_sample_top_p(context, &candidates_p, top_p, 1)
llama_sample_temp(context, &candidates_p, temp)
let new_token_id = llama_sample_token(context, &candidates_p)
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if new_token_id == llama_token_eos(context) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
print("stream \(i) finished at n_cur = \(n_cur)")
}
continue
}
let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
// if there is only one stream, we print immediately to stdout
if n_parallel == 1 {
print(nextStringPiece, terminator: "")
}
streams[i] += nextStringPiece
// push this new token for next evaluation
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.seq_id[Int(batch.n_tokens)] = Int32(i)
batch.logits[Int(batch.n_tokens)] = 1
i_batch[i] = batch.n_tokens
batch.n_tokens += 1
n_decode += 1
}
// all streams are finished
if batch.n_tokens == 0 {
break
}
n_cur += 1
// evaluate the current batch with the transformer model
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
exit(1)
}
}
if n_parallel > 1 {
print("\n")
for (i, stream) in streams.enumerated() {
print("sequence \(i):\n\n\(prompt)\(stream)\n")
}
}
let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
llama_print_timings(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let n_tokens = text.count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])
}
tokens.deallocate()
return swiftTokens
}
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
if nTokens < 0 {
// the buffer was too small: resize it to the required length and convert again
result = [CChar](repeating: 0, count: -Int(nTokens))
let check = llama_token_to_piece(
model,
token,
&result,
Int32(result.count)
)
assert(check == -nTokens)
} else {
result.removeLast(result.count - Int(nTokens))
}
if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
return utfString
} else {
buffer.append(contentsOf: result)
let data = Data(buffer.map { UInt8(bitPattern: $0) })
if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
buffer = []
}
guard let bufferString = String(data: data, encoding: .utf8) else {
return nil
}
buffer = []
return bufferString
}
return nil
}

View file

@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234; ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads; ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_new_context_with_model(model, ctx_params);

View file

@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
llama_token sampling_id(struct MyModel* mymodel) { llama_token sampling_id(struct MyModel* mymodel) {
llama_context* ctx = mymodel->ctx; llama_context* ctx = mymodel->ctx;
gpt_params params = mymodel->params; gpt_params params = mymodel->params;
llama_sampling_params & sparams = params.sampling_params;
// int n_ctx = llama_n_ctx(ctx); // int n_ctx = llama_n_ctx(ctx);
// out of user input, sample next token // out of user input, sample next token
const float temp = params.temp; const float temp = sparams.temp;
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k; const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
const float top_p = params.top_p; const float top_p = sparams.top_p;
const float tfs_z = params.tfs_z; const float tfs_z = sparams.tfs_z;
const float typical_p = params.typical_p; const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
// const float repeat_penalty = params.repeat_penalty; // const float repeat_penalty = params.repeat_penalty;
// const float alpha_presence = params.presence_penalty; // const float alpha_presence = params.presence_penalty;
// const float alpha_frequency = params.frequency_penalty; // const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat; const int mirostat = sparams.mirostat;
const float mirostat_tau = params.mirostat_tau; const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = params.mirostat_eta; const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = params.penalize_nl; // const bool penalize_nl = params.penalize_nl;
llama_token id = 0; llama_token id = 0;
@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
auto n_vocab = llama_n_vocab(llama_get_model(ctx)); auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Apply params.logit_bias map // Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second; logits[it->first] += it->second;
} }

View file

@ -104,6 +104,7 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
llama_sampling_params & sparams = params.sampling_params;
g_params = &params; g_params = &params;
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
@ -206,7 +207,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (params.cfg_scale > 1.f) { if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params); struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams); ctx_guidance = llama_new_context_with_model(model, lparams);
} }
@ -269,9 +270,9 @@ int main(int argc, char ** argv) {
int guidance_offset = 0; int guidance_offset = 0;
int original_prompt_len = 0; int original_prompt_len = 0;
if (ctx_guidance) { if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@ -312,7 +313,7 @@ int main(int argc, char ** argv) {
if (ctx_guidance) { if (ctx_guidance) {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) { for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@ -358,7 +359,7 @@ int main(int argc, char ** argv) {
} }
} }
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n"); LOG_TEE("\n\n");
@ -376,8 +377,8 @@ int main(int argc, char ** argv) {
LOG_TEE("\n"); LOG_TEE("\n");
{ {
auto it = params.logit_bias.find(llama_token_eos(ctx)); auto it = sparams.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
} }
} }
@ -434,6 +435,7 @@ int main(int argc, char ** argv) {
const int n_vocab = llama_n_vocab(model); const int n_vocab = llama_n_vocab(model);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -552,7 +554,7 @@ int main(int argc, char ** argv) {
if ((int) embd_inp.size() <= n_consumed && !is_interacting) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
last_tokens.erase(last_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id); last_tokens.push_back(id);

View file

@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
return 1; return 1;
} }
llama_sampling_params & sparams = params.sampling_params;
#ifndef LOG_DISABLE_LOGS #ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log")); log_set_target(log_filename_generator("main", "log"));
@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any // load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__); LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (params.cfg_scale > 1.f) { if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params); struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams); ctx_guidance = llama_new_context_with_model(model, lparams);
} }
@ -257,9 +258,9 @@ int main(int argc, char ** argv) {
int guidance_offset = 0; int guidance_offset = 0;
int original_prompt_len = 0; int original_prompt_len = 0;
if (ctx_guidance) { if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@ -296,6 +297,9 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size()); __func__, n_matching_session_tokens, embd_inp.size());
} }
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
} }
LOGLN( LOGLN(
@ -343,7 +347,7 @@ int main(int argc, char ** argv) {
if (ctx_guidance) { if (ctx_guidance) {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) { for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@ -395,7 +399,7 @@ int main(int argc, char ** argv) {
} }
} }
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n"); LOG_TEE("\n\n");
@ -413,8 +417,8 @@ int main(int argc, char ** argv) {
LOG_TEE("\n"); LOG_TEE("\n");
{ {
auto it = params.logit_bias.find(llama_token_eos(ctx)); auto it = sparams.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
} }
} }
@ -469,6 +473,7 @@ int main(int argc, char ** argv) {
const int n_vocab = llama_n_vocab(model); const int n_vocab = llama_n_vocab(model);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab); candidates.reserve(n_vocab);
@ -543,9 +548,6 @@ int main(int argc, char ** argv) {
if (i > 0) { if (i > 0) {
embd.erase(embd.begin(), embd.begin() + i); embd.erase(embd.begin(), embd.begin() + i);
} }
// remove any "future" tokens that we might have inherited from the session from the KV cache
llama_kv_cache_tokens_rm(ctx, n_past, -1);
} }
// evaluate tokens in batches // evaluate tokens in batches
@ -625,7 +627,7 @@ int main(int argc, char ** argv) {
LOG("saved session to %s\n", path_session.c_str()); LOG("saved session to %s\n", path_session.c_str());
} }
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
last_tokens.erase(last_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id); last_tokens.push_back(id);

View file

@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
params.logits_all = true; params.logits_all = true;
std::tie(model, ctx) = llama_init_from_gpt_params(params); std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
// load the prompts from an external file if there are any // load the prompts from an external file if there are any
if (params.prompt.empty()) { if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@ -339,7 +341,7 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n", //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i); const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
if (client.n_decoded == 1) { if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients // start measuring generation time after the first token to make sure all concurrent clients
@ -384,7 +386,7 @@ int main(int argc, char ** argv) {
n_total_prompt += client.n_prompt; n_total_prompt += client.n_prompt;
n_total_gen += client.n_decoded; n_total_gen += client.n_decoded;
llama_sampling_context_reset(ctx_sampling, client.seq_id);
client.seq_id = -1; client.seq_id = -1;
} }

View file

@ -8,9 +8,10 @@
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
llama_sampling_params & sparams = params.sampling_params;
params.seed = 42; params.seed = 42;
params.n_threads = 4; params.n_threads = 4;
params.repeat_last_n = 64; sparams.repeat_last_n = 64;
params.prompt = "The quick brown fox"; params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) { if (!gpt_params_parse(argc, argv, params)) {
@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
} }
auto n_past = 0; auto n_past = 0;
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0); auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
// init // init
llama_model * model; llama_model * model;

File diff suppressed because it is too large

View file

@ -136,6 +136,11 @@
display: block; display: block;
} }
fieldset label.slim {
margin: 0 0.5em;
display: inline;
}
header, footer { header, footer {
text-align: center; text-align: center;
} }
@ -145,6 +150,14 @@
color: #888; color: #888;
} }
.mode-chat textarea[name=prompt] {
height: 4.5em;
}
.mode-completion textarea[name=prompt] {
height: 10em;
}
@keyframes loading-bg-wipe { @keyframes loading-bg-wipe {
0% { 0% {
@ -187,7 +200,7 @@
template: "{{prompt}}\n\n{{history}}\n{{char}}:", template: "{{prompt}}\n\n{{history}}\n{{char}}:",
historyTemplate: "{{name}}: {{message}}", historyTemplate: "{{name}}: {{message}}",
transcript: [], transcript: [],
type: "chat", type: "chat", // "chat" | "completion"
char: "Llama", char: "Llama",
user: "User", user: "User",
}) })
@ -365,13 +378,44 @@
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key])); return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
} }
async function runLlama(prompt, llamaParams, char) {
const currentMessages = [];
const history = session.value.transcript;
if (controller.value) {
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
const data = chunk.data;
if (data.stop) {
while (
currentMessages.length > 0 &&
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
) {
currentMessages.pop();
}
transcriptUpdate([...history, [char, currentMessages]])
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
} else {
currentMessages.push(data);
transcriptUpdate([...history, [char, currentMessages]])
}
if (data.timings) {
llamaStats.value = data.timings;
}
}
controller.value = null;
}
// send message to server // send message to server
const chat = async (msg) => { const chat = async (msg) => {
if (controller.value) { if (controller.value) {
console.log('already running...'); console.log('already running...');
return; return;
} }
controller.value = new AbortController();
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]]) transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
@ -391,55 +435,41 @@
).join("\n"), ).join("\n"),
}); });
const currentMessages = []; await runLlama(prompt, {
const history = session.value.transcript
const llamaParams = {
...params.value, ...params.value,
stop: ["</s>", template("{{char}}:"), template("{{user}}:")], stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
}, "{{char}}");
}
const runCompletion = async () => {
if (controller.value) {
console.log('already running...');
return;
} }
const {prompt} = session.value;
transcriptUpdate([...session.value.transcript, ["", prompt]]);
await runLlama(prompt, {
...params.value,
stop: [],
}, "");
}
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) { const stop = (e) => {
const data = chunk.data; e.preventDefault();
if (controller.value) {
if (data.stop) { controller.value.abort();
while ( controller.value = null;
currentMessages.length > 0 &&
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
) {
currentMessages.pop();
}
transcriptUpdate([...history, ["{{char}}", currentMessages]])
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
} else {
currentMessages.push(data);
transcriptUpdate([...history, ["{{char}}", currentMessages]])
}
if (data.timings) {
llamaStats.value = data.timings;
}
} }
}
controller.value = null; const reset = (e) => {
stop(e);
transcriptUpdate([]);
} }
function MessageInput() { function MessageInput() {
const message = useSignal("") const message = useSignal("")
const stop = (e) => {
e.preventDefault();
if (controller.value) {
controller.value.abort();
controller.value = null;
}
}
const reset = (e) => {
stop(e);
transcriptUpdate([]);
}
const submit = (e) => { const submit = (e) => {
stop(e); stop(e);
chat(message.value); chat(message.value);
@ -474,6 +504,19 @@
` `
} }
function CompletionControls() {
const submit = (e) => {
stop(e);
runCompletion();
}
return html`
<div>
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>`;
}
const ChatLog = (props) => { const ChatLog = (props) => {
const messages = session.value.transcript; const messages = session.value.transcript;
const container = useRef(null) const container = useRef(null)
@ -497,7 +540,11 @@
data; data;
message = html`<${Markdownish} text=${template(text)} />` message = html`<${Markdownish} text=${template(text)} />`
} }
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>` if(user) {
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
} else {
return html`<p key=${index}>${message}</p>`
}
}; };
return html` return html`
@ -574,18 +621,31 @@
userTemplateAutosave() userTemplateAutosave()
}, [session.value, params.value]) }, [session.value, params.value])
return html` const GrammarControl = () => (
<form> html`
<fieldset> <div>
<${UserTemplateResetButton}/> <label for="template">Grammar</label>
</fieldset> <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
</div>
`
);
<fieldset> const PromptControlFieldSet = () => (
<div> html`
<label for="prompt">Prompt</label> <fieldset>
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/> <div>
</div> <label htmlFor="prompt">Prompt</label>
</fieldset> <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
</div>
</fieldset>
`
);
const ChatConfigForm = () => (
html`
${PromptControlFieldSet()}
<fieldset class="two"> <fieldset class="two">
<div> <div>
@ -609,15 +669,30 @@
<label for="template">Chat history template</label> <label for="template">Chat history template</label>
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/> <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
</div> </div>
${GrammarControl()}
</fieldset>
`
);
const CompletionConfigForm = () => (
html`
${PromptControlFieldSet()}
<fieldset>${GrammarControl()}</fieldset>
`
);
return html`
<form>
<fieldset class="two">
<${UserTemplateResetButton}/>
<div> <div>
<label for="template">Grammar</label> <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/> <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
</div> </div>
</fieldset> </fieldset>
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
<fieldset class="two"> <fieldset class="two">
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})} ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})} ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
@ -851,7 +926,7 @@
function App(props) { function App(props) {
return html` return html`
<div> <div class="mode-${session.value.type}">
<header> <header>
<h1>llama.cpp</h1> <h1>llama.cpp</h1>
</header> </header>
@ -861,7 +936,7 @@
</main> </main>
<section id="write"> <section id="write">
<${MessageInput} /> <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
</section> </section>
<footer> <footer>

View file

@ -200,6 +200,7 @@ struct llama_server_context
llama_model *model = nullptr; llama_model *model = nullptr;
llama_context *ctx = nullptr; llama_context *ctx = nullptr;
gpt_params params; gpt_params params;
llama_sampling_context ctx_sampling;
int n_ctx; int n_ctx;
grammar_parser::parse_state parsed_grammar; grammar_parser::parse_state parsed_grammar;
@ -254,6 +255,7 @@ struct llama_server_context
if (grammar != nullptr) { if (grammar != nullptr) {
llama_grammar_free(grammar); llama_grammar_free(grammar);
grammar = nullptr; grammar = nullptr;
ctx_sampling = llama_sampling_context_init(params, NULL);
} }
} }
@ -329,8 +331,8 @@ struct llama_server_context
grammar_parser::print_grammar(stderr, parsed_grammar); grammar_parser::print_grammar(stderr, parsed_grammar);
{ {
auto it = params.logit_bias.find(llama_token_eos(ctx)); auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) { if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
} }
} }
@ -339,6 +341,7 @@ struct llama_server_context
grammar = llama_grammar_init( grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
} }
ctx_sampling = llama_sampling_context_init(params, grammar);
return true; return true;
} }
@ -402,6 +405,7 @@ struct llama_server_context
// compare the evaluated prompt with the new prompt // compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens); n_past = common_part(embd, prompt_tokens);
embd = prompt_tokens; embd = prompt_tokens;
if (n_past == num_prompt_tokens) if (n_past == num_prompt_tokens)
{ {
// we have to evaluate at least 1 token to generate logits. // we have to evaluate at least 1 token to generate logits.
@ -409,6 +413,9 @@ struct llama_server_context
n_past--; n_past--;
} }
// since #3228 we now have to manually manage the KV cache
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
LOG_VERBOSE("prompt ingested", { LOG_VERBOSE("prompt ingested", {
{"n_past", n_past}, {"n_past", n_past},
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@ -458,9 +465,6 @@ struct llama_server_context
// compare the evaluated prompt with the new prompt // compare the evaluated prompt with the new prompt
n_past = common_part(embd, prompt_tokens); n_past = common_part(embd, prompt_tokens);
// since #3228 we now have to manually manage the KV cache
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
embd = prompt_tokens; embd = prompt_tokens;
if (n_past == num_prompt_tokens) if (n_past == num_prompt_tokens)
{ {
@ -468,6 +472,9 @@ struct llama_server_context
n_past--; n_past--;
} }
// since #3228 we now have to manually manage the KV cache
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
LOG_VERBOSE("prompt ingested", { LOG_VERBOSE("prompt ingested", {
{"n_past", n_past}, {"n_past", n_past},
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)}, {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@ -550,12 +557,12 @@ struct llama_server_context
std::vector<llama_token_data> candidates; std::vector<llama_token_data> candidates;
candidates.reserve(llama_n_vocab(model)); candidates.reserve(llama_n_vocab(model));
result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates); result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
const int32_t n_probs = params.n_probs; const int32_t n_probs = params.sampling_params.n_probs;
if (params.temp <= 0 && n_probs > 0) if (params.sampling_params.temp <= 0 && n_probs > 0)
{ {
// For llama_sample_token_greedy we need to sort candidates // For llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &candidates_p); llama_sample_softmax(ctx, &candidates_p);
@ -630,7 +637,7 @@ struct llama_server_context
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok); const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
generated_text += token_text; generated_text += token_text;
if (params.n_probs > 0) if (params.sampling_params.n_probs > 0)
{ {
generated_token_probs.push_back(token_with_probs); generated_token_probs.push_back(token_with_probs);
} }
@ -711,15 +718,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf("usage: %s [options]\n", argv0); printf("usage: %s [options]\n", argv0);
printf("\n"); printf("\n");
printf("options:\n"); printf("options:\n");
printf(" -h, --help show this help message and exit\n"); printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_mlock_supported()) if (llama_mlock_supported())
{ {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
@ -864,6 +872,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_threads = std::stoi(argv[i]); params.n_threads = std::stoi(argv[i]);
} }
else if (arg == "--threads-batch" || arg == "-tb")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size") else if (arg == "-b" || arg == "--batch-size")
{ {
if (++i >= argc) if (++i >= argc)
@ -1018,34 +1035,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
static json format_generation_settings(llama_server_context &llama) static json format_generation_settings(llama_server_context &llama)
{ {
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx)); const auto & sparams = llama.params.sampling_params;
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second); eos_bias->second < 0.0f && std::isinf(eos_bias->second);
return json{ return json{
{"n_ctx", llama.n_ctx}, {"n_ctx", llama.n_ctx},
{"model", llama.params.model_alias}, {"model", llama.params.model_alias},
{"seed", llama.params.seed}, {"seed", llama.params.seed},
{"temp", llama.params.temp}, {"temp", sparams.temp},
{"top_k", llama.params.top_k}, {"top_k", sparams.top_k},
{"top_p", llama.params.top_p}, {"top_p", sparams.top_p},
{"tfs_z", llama.params.tfs_z}, {"tfs_z", sparams.tfs_z},
{"typical_p", llama.params.typical_p}, {"typical_p", sparams.typical_p},
{"repeat_last_n", llama.params.repeat_last_n}, {"repeat_last_n", sparams.repeat_last_n},
{"repeat_penalty", llama.params.repeat_penalty}, {"repeat_penalty", sparams.repeat_penalty},
{"presence_penalty", llama.params.presence_penalty}, {"presence_penalty", sparams.presence_penalty},
{"frequency_penalty", llama.params.frequency_penalty}, {"frequency_penalty", sparams.frequency_penalty},
{"mirostat", llama.params.mirostat}, {"mirostat", sparams.mirostat},
{"mirostat_tau", llama.params.mirostat_tau}, {"mirostat_tau", sparams.mirostat_tau},
{"mirostat_eta", llama.params.mirostat_eta}, {"mirostat_eta", sparams.mirostat_eta},
{"penalize_nl", llama.params.penalize_nl}, {"penalize_nl", sparams.penalize_nl},
{"stop", llama.params.antiprompt}, {"stop", llama.params.antiprompt},
{"n_predict", llama.params.n_predict}, {"n_predict", llama.params.n_predict},
{"n_keep", llama.params.n_keep}, {"n_keep", llama.params.n_keep},
{"ignore_eos", ignore_eos}, {"ignore_eos", ignore_eos},
{"stream", llama.stream}, {"stream", llama.stream},
{"logit_bias", llama.params.logit_bias}, {"logit_bias", sparams.logit_bias},
{"n_probs", llama.params.n_probs}, {"n_probs", sparams.n_probs},
{"grammar", llama.params.grammar}, {"grammar", llama.params.grammar},
}; };
} }
@ -1094,7 +1112,7 @@ static json format_final_response(llama_server_context &llama, const std::string
{"timings", format_timings(llama)}, {"timings", format_timings(llama)},
}; };
if (llama.params.n_probs > 0) if (llama.params.sampling_params.n_probs > 0)
{ {
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
} }
@ -1110,7 +1128,7 @@ static json format_partial_response(
{"stop", false}, {"stop", false},
}; };
if (llama.params.n_probs > 0) if (llama.params.sampling_params.n_probs > 0)
{ {
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
} }
@ -1142,26 +1160,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
static void parse_options_completion(const json &body, llama_server_context &llama) static void parse_options_completion(const json &body, llama_server_context &llama)
{ {
gpt_params default_params; gpt_params default_params;
const auto & default_sparams = default_params.sampling_params;
auto & sparams = llama.params.sampling_params;
llama.stream = json_value(body, "stream", false); llama.stream = json_value(body, "stream", false);
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict); llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
llama.params.top_k = json_value(body, "top_k", default_params.top_k); sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
llama.params.top_p = json_value(body, "top_p", default_params.top_p); sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z); sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p); sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n); sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
llama.params.temp = json_value(body, "temperature", default_params.temp); sparams.temp = json_value(body, "temperature", default_sparams.temp);
llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty); sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty); sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty); sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat); sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau); sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta); sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl); sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
llama.params.seed = json_value(body, "seed", default_params.seed); llama.params.seed = json_value(body, "seed", default_params.seed);
llama.params.grammar = json_value(body, "grammar", default_params.grammar); llama.params.grammar = json_value(body, "grammar", default_params.grammar);
llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs); sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
if (body.count("prompt") != 0) if (body.count("prompt") != 0)
{ {
@ -1172,10 +1192,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
llama.prompt = ""; llama.prompt = "";
} }
llama.params.logit_bias.clear(); sparams.logit_bias.clear();
if (json_value(body, "ignore_eos", false)) if (json_value(body, "ignore_eos", false))
{ {
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
} }
const auto &logit_bias = body.find("logit_bias"); const auto &logit_bias = body.find("logit_bias");
@ -1191,11 +1211,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
{ {
if (el[1].is_number()) if (el[1].is_number())
{ {
llama.params.logit_bias[tok] = el[1].get<float>(); sparams.logit_bias[tok] = el[1].get<float>();
} }
else if (el[1].is_boolean() && !el[1].get<bool>()) else if (el[1].is_boolean() && !el[1].get<bool>())
{ {
llama.params.logit_bias[tok] = -INFINITY; sparams.logit_bias[tok] = -INFINITY;
} }
} }
} }
@ -1215,6 +1235,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla
} }
} }
llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
} }
@ -1423,7 +1445,7 @@ int main(int argc, char **argv)
} }
auto probs = llama.generated_token_probs; auto probs = llama.generated_token_probs;
if (llama.params.n_probs > 0 && llama.stopped_word) { if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
} }
@ -1475,7 +1497,7 @@ int main(int argc, char **argv)
std::vector<completion_token_output> probs_output = {}; std::vector<completion_token_output> probs_output = {};
if (llama.params.n_probs > 0) { if (llama.params.sampling_params.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false); const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@ -1596,7 +1618,7 @@ int main(int argc, char **argv)
std::vector<completion_token_output> probs_output = {}; std::vector<completion_token_output> probs_output = {};
if (llama.params.n_probs > 0) { if (llama.params.sampling_params.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false); const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());

View file

@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
} }
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
const auto t_dec_start = ggml_time_us(); const auto t_dec_start = ggml_time_us();
while (true) { while (true) {
@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
// sample from the target model // sample from the target model
llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin()); last_tokens.erase(last_tokens.begin());
@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
if (grammar_dft) { if (grammar_dft) {
llama_grammar_free(grammar_dft); llama_grammar_free(grammar_dft);
} }
grammar_dft = llama_grammar_copy(grammar_tgt); // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
// that will need to change.
auto it = ctx_sampling.sequence_contexts.find(0);
GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
// This is necessary because each sequence id in sequence_contexts
// uses a copy of the original grammar.
grammar_dft = llama_grammar_copy(it->second.grammar);
LOG("copied target grammar to draft grammar\n"); LOG("copied target grammar to draft grammar\n");
} }
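Because sampling state is now tracked per sequence, the draft grammar has to be cloned from the sequence-0 context rather than from the pristine target grammar; otherwise the draft would forget the tokens already accepted. A rough Python sketch of the idea (the context layout here is an assumption for illustration, not llama.cpp's actual types):

import copy

class SequenceContext:
    def __init__(self, grammar):
        self.grammar = grammar          # evolves as tokens are accepted for this sequence

sequence_contexts = {0: SequenceContext(grammar={"stack": ["root"]})}

# hardcoded to sequence id 0 until the speculative example drafts in parallel
draft_grammar = copy.deepcopy(sequence_contexts[0].grammar)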

View file

@ -88,29 +88,31 @@ class MODEL_ARCH(IntEnum):
PERSIMMON : int = auto() PERSIMMON : int = auto()
REFACT : int = auto() REFACT : int = auto()
BERT : int = auto() BERT : int = auto()
BLOOM : int = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
TOKEN_EMBD : int = auto() TOKEN_EMBD : int = auto()
TOKEN_TYPES : int = auto() TOKEN_EMBD_NORM : int = auto()
POS_EMBD : int = auto() TOKEN_TYPES : int = auto()
OUTPUT : int = auto() POS_EMBD : int = auto()
OUTPUT_NORM : int = auto() OUTPUT : int = auto()
ROPE_FREQS : int = auto() OUTPUT_NORM : int = auto()
ATTN_Q : int = auto() ROPE_FREQS : int = auto()
ATTN_K : int = auto() ATTN_Q : int = auto()
ATTN_V : int = auto() ATTN_K : int = auto()
ATTN_QKV : int = auto() ATTN_V : int = auto()
ATTN_OUT : int = auto() ATTN_QKV : int = auto()
ATTN_NORM : int = auto() ATTN_OUT : int = auto()
ATTN_NORM_2 : int = auto() ATTN_NORM : int = auto()
ATTN_ROT_EMBD: int = auto() ATTN_NORM_2 : int = auto()
FFN_GATE : int = auto() ATTN_ROT_EMBD : int = auto()
FFN_DOWN : int = auto() FFN_GATE : int = auto()
FFN_UP : int = auto() FFN_DOWN : int = auto()
FFN_NORM : int = auto() FFN_UP : int = auto()
ATTN_Q_NORM : int = auto() FFN_NORM : int = auto()
ATTN_K_NORM : int = auto() ATTN_Q_NORM : int = auto()
ATTN_K_NORM : int = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -125,29 +127,31 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.PERSIMMON: "persimmon",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.BLOOM: "bloom",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.TOKEN_TYPES: "token_types",
MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT: "output", MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs", MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
} }
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -282,6 +286,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.BLOOM: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GPT2: [ MODEL_ARCH.GPT2: [
# TODO # TODO
], ],
@ -311,6 +327,7 @@ class TensorNameMap:
"gpt_neox.embed_in", # gptneox "gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact "transformer.wte", # gpt2 gpt-j mpt refact
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf "model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert "embeddings.word_embeddings", # bert
@ -322,6 +339,11 @@ class TensorNameMap:
"embeddings.token_type_embeddings", # bert "embeddings.token_type_embeddings", # bert
), ),
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
),
# Position embeddings # Position embeddings
MODEL_TENSOR.POS_EMBD: ( MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2 "transformer.wpe", # gpt2
@ -332,7 +354,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan "lm_head", # gpt2 mpt falcon llama-hf baichuan
"output", # llama-pth "output", # llama-pth bloom
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
), ),
@ -344,7 +366,7 @@ class TensorNameMap:
"norm", # llama-pth "norm", # llama-pth
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"transformer.norm_f", # mpt "transformer.norm_f", # mpt
"ln_f", # refact "ln_f", # refact bloom
"language_model.encoder.final_layernorm", # persimmon "language_model.encoder.final_layernorm", # persimmon
), ),
@ -361,6 +383,7 @@ class TensorNameMap:
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
"transformer.blocks.{bid}.norm_1", # mpt "transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf "model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
@ -379,6 +402,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.attn.c_attn", # gpt2
"transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.h.{bid}.self_attention.query_key_value", # falcon "transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
), ),
@ -412,6 +436,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.c_proj", # gpt2 refact "transformer.h.{bid}.attn.c_proj", # gpt2 refact
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf "model.layers.{bid}.self_attn.o_proj", # llama-hf
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
@ -429,6 +454,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_NORM: ( MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact "transformer.h.{bid}.ln_2", # gpt2 refact
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf "model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
@ -442,6 +468,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.c_fc", # gpt2
"transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact "model.layers.{bid}.mlp.up_proj", # llama-hf refact
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
@ -461,6 +488,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact "transformer.h.{bid}.mlp.c_proj", # gpt2 refact
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf "model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert

392
llama.cpp
View file

@ -189,6 +189,7 @@ enum llm_arch {
LLM_ARCH_STARCODER, LLM_ARCH_STARCODER,
LLM_ARCH_PERSIMMON, LLM_ARCH_PERSIMMON,
LLM_ARCH_REFACT, LLM_ARCH_REFACT,
LLM_ARCH_BLOOM,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@ -202,7 +203,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_PERSIMMON, "persimmon" }, { LLM_ARCH_PERSIMMON, "persimmon" },
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BLOOM, "bloom" },
}; };
enum llm_kv { enum llm_kv {
@ -305,6 +307,7 @@ struct LLM_KV {
enum llm_tensor { enum llm_tensor {
LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_TOKEN_EMBD_NORM,
LLM_TENSOR_POS_EMBD, LLM_TENSOR_POS_EMBD,
LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT,
LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT_NORM,
@ -467,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_BLOOM,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@ -1212,6 +1230,8 @@ struct llama_model {
struct ggml_tensor * tok_embeddings; struct ggml_tensor * tok_embeddings;
struct ggml_tensor * pos_embeddings; struct ggml_tensor * pos_embeddings;
struct ggml_tensor * tok_norm;
struct ggml_tensor * tok_norm_b;
struct ggml_tensor * output_norm; struct ggml_tensor * output_norm;
struct ggml_tensor * output_norm_b; struct ggml_tensor * output_norm_b;
@ -2062,13 +2082,13 @@ static void llm_load_hparams(
} }
} break; } break;
case LLM_ARCH_PERSIMMON: case LLM_ARCH_PERSIMMON:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 36: model.type = e_model::MODEL_8B; break; case 36: model.type = e_model::MODEL_8B; break;
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_REFACT: case LLM_ARCH_REFACT:
{ {
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
@ -2077,6 +2097,19 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_BLOOM:
{
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_1B; break;
case 30:
switch (hparams.n_embd) {
case 2560: model.type = e_model::MODEL_3B; break;
case 4096: model.type = e_model::MODEL_7B; break;
} break;
}
} break;
case LLM_ARCH_MPT: case LLM_ARCH_MPT:
{ {
hparams.f_clamp_kqv = 0.0f; hparams.f_clamp_kqv = 0.0f;
@ -2682,6 +2715,88 @@ static void llm_load_tensors(
layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
} }
} break; } break;
case LLM_ARCH_BLOOM:
{
// TODO: CPU-only for now
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
// output
{
ggml_backend_type backend_norm;
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = LLAMA_BACKEND_OFFLOAD;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
}
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.output_norm);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
vram_weights += ggml_nbytes(model.output);
}
}
const uint32_t n_ff = hparams.n_ff;
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
}
}
} break;
case LLM_ARCH_MPT: case LLM_ARCH_MPT:
{ {
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
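BLOOM has no grouped-query attention, so n_embd_gqa equals n_embd and the fused attn_qkv weight created above comes out as {n_embd, 3*n_embd}. A quick sanity sketch with bloom-560m-like numbers (the concrete values are assumptions for illustration):

n_embd, n_head, n_ff = 1024, 16, 4096       # bloom-560m-ish; treat as illustrative
n_embd_gqa = n_embd                          # n_head_kv == n_head for BLOOM

shapes = {
    "blk.N.attn_qkv.weight":    (n_embd, n_embd + 2 * n_embd_gqa),  # (1024, 3072): Q, K, V fused
    "blk.N.attn_qkv.bias":      (n_embd + 2 * n_embd_gqa,),
    "blk.N.attn_output.weight": (n_embd, n_embd),
    "blk.N.ffn_up.weight":      (n_embd, n_ff),
    "blk.N.ffn_down.weight":    (n_ff, n_embd),
}
for name, shape in shapes.items():
    print(name, shape)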
@ -5002,6 +5117,248 @@ static struct ggml_cgraph * llm_build_persimmon(
return gf; return gf;
} }
static struct ggml_cgraph * llm_build_bloom(
llama_context & lctx,
const llama_batch & batch) {
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
const auto & kv_self = lctx.kv_self;
GGML_ASSERT(!!kv_self.ctx);
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = cparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
GGML_ASSERT(n_embd_head == hparams.n_rot);
const float norm_eps = hparams.f_norm_eps;
const int32_t n_tokens = batch.n_tokens;
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.data,
/*.no_alloc =*/ false,
};
params.no_alloc = true;
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
struct ggml_tensor * token;
struct ggml_tensor * inpL;
if (batch.token) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
}
ggml_set_name(inp_tokens, "inp_tokens");
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
ggml_allocr_alloc(lctx.alloc, token);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
}
}
// KQ_scale
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_set_name(KQ_mask, "KQ_mask");
ggml_allocr_alloc(lctx.alloc, KQ_mask);
if (!ggml_allocr_is_measure(lctx.alloc)) {
float * data = (float *) KQ_mask->data;
memset(data, 0, ggml_nbytes(KQ_mask));
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
const llama_pos pos = batch.pos[j];
const llama_seq_id seq_id = batch.seq_id[j];
for (int i = 0; i < n_kv; ++i) {
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
}
}
}
}
}
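// Each row j of the mask corresponds to one token in the batch: key slots that belong to a
// different sequence, or that sit at a later position than that token, are set to -INF so the
// soft-max below gives them zero weight.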
// norm
{
inpL = ggml_norm(ctx0, token, norm_eps);
inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
}
ggml_set_name(inpL, "inpL");
for (int il = 0; il < n_layer; ++il) {
{
// Norm
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
}
{
// Self Attention
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
struct ggml_tensor * Qcur = tmpq;
struct ggml_tensor * Kcur = tmpk;
// store key and value to memory
{
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
0, 2, 1, 3);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
ggml_view_3d(ctx0, kv_self.k,
n_embd_head, n_kv, n_head_kv,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd_head)
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled");
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
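// ggml_alibi adds BLOOM's linear position bias to the raw scores in place of rotary/learned
// position embeddings: with max_bias = 8 (the last argument) and a power-of-two head count,
// head k is penalised by roughly 2^(-8*(k+1)/n_head) per step of query/key distance.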
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
n_kv, n_embd_head, n_head_kv,
ggml_element_size(kv_self.v)*n_ctx,
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
ggml_set_name(V, "V");
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
ggml_set_name(KQV, "KQV");
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
ggml_set_name(cur, "KQV_merged_contiguous");
}
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
// Add the input
cur = ggml_add(ctx0, cur, inpL);
struct ggml_tensor * inpFF = cur;
// FF
{
// Norm
{
cur = ggml_norm(ctx0, inpFF, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
}
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
// GELU activation
cur = ggml_gelu(ctx0, cur);
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
}
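// The block above is BLOOM's (typically 4x-wide) MLP: ffn_up (w3, b3) expands n_embd -> n_ff,
// GELU, then ffn_down (w2, b2) projects back to n_embd; the pre-norm residual is added next.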
inpL = ggml_add(ctx0, cur, inpFF);
}
// Output Norm
{
cur = ggml_norm(ctx0, inpL, norm_eps);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
}
ggml_set_name(cur, "result_norm");
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
return gf;
}
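For reference, here is the attention math the graph above encodes, written out in plain numpy. This is a sketch, not the ggml code path: it assumes a single sequence and a power-of-two head count for the ALiBi slopes, and it applies the bias as -slope * (query_pos - key_pos), which differs from ggml_alibi only by a per-row constant that the soft-max cancels.

import numpy as np

def bloom_attention(q, k, v, n_head, max_bias=8.0):
    # q, k, v: (n_tokens, n_embd), already split out of the fused QKV projection
    n_tokens, n_embd = q.shape
    d = n_embd // n_head
    q = q.reshape(n_tokens, n_head, d).transpose(1, 0, 2)     # (n_head, n_tokens, d)
    k = k.reshape(n_tokens, n_head, d).transpose(1, 0, 2)
    v = v.reshape(n_tokens, n_head, d).transpose(1, 0, 2)

    scores = q @ k.transpose(0, 2, 1) / np.sqrt(d)            # KQ_scaled

    # ALiBi: head h gets slope 2^(-max_bias*(h+1)/n_head); penalty grows with distance
    slopes = 2.0 ** (-max_bias * (np.arange(n_head) + 1) / n_head)
    pos = np.arange(n_tokens)
    dist = pos[:, None] - pos[None, :]                        # query_pos - key_pos
    scores = scores - slopes[:, None, None] * dist            # KQ_scaled_alibi

    # causal mask: keys after the query position contribute nothing (KQ_mask)
    mask = np.triu(np.ones((n_tokens, n_tokens), dtype=bool), k=1)
    scores = np.where(mask, -np.inf, scores)

    # soft-max and weighted sum over the values (KQ_soft_max, KQV)
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    out = probs @ v                                           # (n_head, n_tokens, d)
    return out.transpose(1, 0, 2).reshape(n_tokens, n_embd)   # KQV_merged_contiguous

x = np.random.default_rng(0).standard_normal((5, 64))
print(bloom_attention(x, x, x, n_head=8).shape)               # (5, 64)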
static struct ggml_cgraph * llm_build_mpt( static struct ggml_cgraph * llm_build_mpt(
llama_context & lctx, llama_context & lctx,
const llama_batch & batch) { const llama_batch & batch) {
@ -5031,9 +5388,6 @@ static struct ggml_cgraph * llm_build_mpt(
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
//printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
// kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
auto & buf_compute = lctx.buf_compute; auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = { struct ggml_init_params params = {
@ -5354,6 +5708,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm_build_refact(lctx, batch); result = llm_build_refact(lctx, batch);
} break; } break;
case LLM_ARCH_BLOOM:
{
result = llm_build_bloom(lctx, batch);
} break;
case LLM_ARCH_MPT: case LLM_ARCH_MPT:
{ {
result = llm_build_mpt(lctx, batch); result = llm_build_mpt(lctx, batch);
@ -5990,7 +6348,6 @@ private:
for (int i = 0; i < (int)text_utf.size(); i++) { for (int i = 0; i < (int)text_utf.size(); i++) {
const std::string & utf_char = text_utf[i]; const std::string & utf_char = text_utf[i];
bool split_condition = false; bool split_condition = false;
// const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
int bytes_remain = text_utf.size() - i; int bytes_remain = text_utf.size() - i;
// forward backward lookups // forward backward lookups
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@ -6016,9 +6373,9 @@ private:
if (!split_condition && bytes_remain >= 3) { if (!split_condition && bytes_remain >= 3) {
// 're|'ve|'ll // 're|'ve|'ll
if (utf_char == "\'" && ( if (utf_char == "\'" && (
(utf_char_next == "r" || utf_char_next_next == "e") || (utf_char_next == "r" && utf_char_next_next == "e") ||
(utf_char_next == "v" || utf_char_next_next == "e") || (utf_char_next == "v" && utf_char_next_next == "e") ||
(utf_char_next == "l" || utf_char_next_next == "l")) (utf_char_next == "l" && utf_char_next_next == "l"))
) { ) {
split_condition = true; split_condition = true;
} }
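The fixed condition requires both letters of the suffix, whereas the old one (with ||) fired whenever either character matched. A tiny simplified Python check of the corrected rule, ignoring the surrounding byte/offset handling:

def is_contraction_start(text, i):
    # an apostrophe followed by exactly "re", "ve" or "ll"
    return text[i] == "'" and text[i + 1:i + 3] in ("re", "ve", "ll")

print(is_contraction_start("we're", 2))   # True
print(is_contraction_start("we'ra", 2))   # False -- the old ||-based test also matched this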
@ -6069,7 +6426,7 @@ private:
else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
split_condition = true; split_condition = true;
} }
else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) { else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
split_condition = true; split_condition = true;
} }
} }
@ -7588,8 +7945,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const std::string name = ggml_get_name(meta); const std::string name = ggml_get_name(meta);
// TODO: avoid hardcoded tensor names - use the TN_* constants // TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos || if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
name.find("attn_qkv.weight") != std::string::npos) {
++n_attention_wv; ++n_attention_wv;
} }
else if (name.find("ffn_down.weight") != std::string::npos) { else if (name.find("ffn_down.weight") != std::string::npos) {

93
prompts/mnemonics.txt Normal file
View file

@ -0,0 +1,93 @@
For each kanji character, write a Markdown-formatted mnemonic that uses its keyword and the keywords of all its components.
Kanji: 欠 (lack of)
Components: 𠂊 (hook claw), 人 (person)
Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
Kanji: 類 (kind (of something))
Components: 米 (rice), 大 (large), 頁 (page)
Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
Kanji: 燃 (burn)
Components: 火 (fire), 然 (sort of thing)
Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
Kanji: 頂 (top of)
Components: 丁 (street), 頁 (page)
Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
Kanji: 険 (risky and steep)
Components: 阝 (small village), 㑒 (consensus)
Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
Kanji: 困 (distressed)
Components: 囗 (closed box), 木 (tree)
Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
Kanji: 頭 (head)
Components: 豆 (bean), 頁 (page)
Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
Kanji: 確 (certain)
Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn't really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
Kanji: 魚 (fish)
Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
Kanji: 警 (to police (something))
Components: 敬 (respect), 言 (say)
Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
Kanji: 筆 (writing brush)
Components: 竹 (bamboo), 聿 (brush)
Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
Kanji: 獄 (prison)
Components: 犭 (animal), 言 (say), 犬 (dog)
Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It's a **dog**-eat-dog world.
Kanji: 新 (new)
Components: 立 (standing up), 木 (tree), 斤 (axe)
Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
Kanji: 怪 (suspicious)
Components: 忄 (weak heart), 圣 (sacred)
Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
Kanji: 温 (warm (to the touch))
Components: 氵 (water drops), 日 (sun), 皿 (dish)
Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
Kanji: 階 (floor (of a building))
Components: 阝 (small village), 皆 (all)
Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It's a village of skyscrapers!
Kanji: 多 (many)
Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
Mnemonic: Two **evenings** in a day would be one too ***many***.
Kanji: 別 (separate)
Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**-to-anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
Kanji: 並 (line up)
Components: 䒑 (antlers on a wall), 业 (runway)
Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
Kanji: 姿 (figure)
Components: 次 (next), 女 (woman)
Mnemonic: The **next** **woman** that I date will have a perfect ***figure***. Because I'm done with 3D women—it will *literally* be an anime figure!
Kanji: 実 (real)
Components: 宀 (roof with a chimney), 𡗗 (three people)
Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
Kanji: 謝 (apologize)
Components: 言 (say), 射 (shoot)
Mnemonic: **Shoot** first, ***apologize*** (**say** you are sorry) later.
Kanji: 提 (propose)
Components: 扌 (left hand), 是 (go with)
Mnemonic:

View file

@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
{ " Hello" , { 258, 23090, }, }, { " Hello" , { 258, 23090, }, },
{ " Hello" , { 466, 23090, }, }, { " Hello" , { 466, 23090, }, },
{ " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
{ "\n =" , { 1212, 40, }, },
{ "' era" , { 18, 4932, }, },
}; };
return _k_tests; return _k_tests;
@ -155,7 +157,7 @@ int main(int argc, char **argv) {
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, true); const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
@ -169,10 +171,8 @@ int main(int argc, char **argv) {
} }
for (const auto & tok : res) { for (const auto & tok : res) {
ofs << tok << " "; ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
} }
ofs << "\n";
} }
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

View file

@ -41,6 +41,8 @@ tests = [
" Hello", " Hello",
" Hello", " Hello",
" Hello\n Hello", " Hello\n Hello",
"\n =",
"' era",
] ]
for text in tests: for text in tests:
@ -69,15 +71,14 @@ fname_tok = args.fname_tok
if fname_tok: if fname_tok:
print('tokenizing file: ', fname_tok) print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok' fname_out = fname_tok + '.tok'
with open(fname_tok, 'r') as f: with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines() lines = f.readlines()
s = ''.join(lines) s = ''.join(lines)
res = tokenizer.encode(s) res = tokenizer.encode(s)
# write to file # write to file
with open(fname_out, 'w') as f: with open(fname_out, 'w', encoding='utf-8') as f:
for x in res: for x in res:
f.write(str(x) + ' ') f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write('\n')
print('len(res): ', len(res)) print('len(res): ', len(res))
print('len(lines): ', len(lines)) print('len(lines): ', len(lines))
print('results written to: ', fname_out) print('results written to: ', fname_out)
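Both the C++ tests and this script now write one `<id> '<piece>'` pair per line, so the two dumps line up and can be diffed token by token. A rough sketch of such a comparison, comparing ids only and assuming no piece spans multiple lines (the wiki.test.raw base name is just an example; the suffixes match the ones used above):

def load_ids(path):
    with open(path, encoding='utf-8') as f:
        # each line is "<id> '<piece>'"; the quoted piece is only for eyeballing
        return [int(line.split(' ', 1)[0]) for line in f if line.strip()]

cpp = load_ids('wiki.test.raw.tokcpp')   # written by the C++ tokenizer test
py  = load_ids('wiki.test.raw.tok')      # written by this script
for i, (a, b) in enumerate(zip(cpp, py)):
    if a != b:
        print(f'first mismatch at token {i}: cpp={a} py={b}')
        break
else:
    print('all', min(len(cpp), len(py)), 'compared tokens match')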

View file

@ -174,10 +174,8 @@ int main(int argc, char **argv) {
} }
for (const auto & tok : res) { for (const auto & tok : res) {
ofs << tok << " "; ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
} }
ofs << "\n";
} }
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());

View file

@ -81,15 +81,14 @@ fname_tok = args.fname_tok
if fname_tok: if fname_tok:
print('tokenizing file: ', fname_tok) print('tokenizing file: ', fname_tok)
fname_out = fname_tok + '.tok' fname_out = fname_tok + '.tok'
with open(fname_tok, 'r') as f: with open(fname_tok, 'r', encoding='utf-8') as f:
lines = f.readlines() lines = f.readlines()
s = ''.join(lines) s = ''.join(lines)
res = tokenizer.encode(s, add_bos=True) res = tokenizer.encode(s, add_bos=True)
# write to file # write to file
with open(fname_out, 'w') as f: with open(fname_out, 'w', encoding='utf-8') as f:
for x in res: for x in res:
f.write(str(x) + ' ') f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
f.write('\n')
print('len(res): ', len(res)) print('len(res): ', len(res))
print('len(lines): ', len(lines)) print('len(lines): ', len(lines))
print('results written to: ', fname_out) print('results written to: ', fname_out)