common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Diego Devesa 2024-10-10 22:57:42 +02:00 committed by GitHub
parent 0e9f760eb1
commit 7eee341bee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
45 changed files with 1284 additions and 1284 deletions

File diff suppressed because it is too large Load diff

View file

@ -10,7 +10,7 @@
// CLI argument parsing
//
struct llama_arg {
struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
@ -18,60 +18,60 @@ struct llama_arg {
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
void (*handler_void) (common_params & params) = nullptr;
void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (common_params & params, int) = nullptr;
llama_arg(
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
void (*handler)(common_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
void (*handler)(common_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
common_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
void (*handler)(common_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
void (*handler)(common_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
llama_arg & set_env(const char * env);
llama_arg & set_sparam();
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
common_arg & set_env(const char * env);
common_arg & set_sparam();
bool in_example(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};
struct gpt_params_context {
struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
gpt_params & params;
std::vector<llama_arg> options;
common_params & params;
std::vector<common_arg> options;
void(*print_usage)(int, char **) = nullptr;
gpt_params_context(gpt_params & params) : params(params) {}
common_params_context(common_params & params) : params(params) {}
};
// parse input arguments from CLI
// if an argument has an invalid value, the usage of that specific argument is displayed automatically (instead of the full usage message)
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

View file

@ -362,10 +362,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
return true;
}
void gpt_init() {
void common_init() {
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
gpt_log_add(gpt_log_main(), level, "%s", text);
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
common_log_add(common_log_main(), level, "%s", text);
}
}, NULL);
@ -378,7 +378,7 @@ void gpt_init() {
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
std::string gpt_params_get_system_info(const gpt_params & params) {
std::string common_params_get_system_info(const common_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.cpuparams.n_threads;
@ -493,7 +493,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
first = false;
}
auto detokenized = llama_token_to_piece(ctx, token);
auto detokenized = common_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
@ -524,7 +524,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
@ -819,16 +819,16 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@ -863,7 +863,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
}
}
auto cparams = llama_context_params_from_gpt_params(params);
auto cparams = common_context_params_to_llama(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
@ -876,7 +876,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
const auto cvec = llama_control_vector_load(params.control_vectors);
const auto cvec = common_control_vector_load(params.control_vectors);
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
@ -900,7 +900,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_lora_adapter_container loaded_la;
common_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@ -913,7 +913,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
common_lora_adapters_apply(lctx, iparams.lora_adapters);
}
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@ -961,7 +961,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
return iparams;
}
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
if (la.scale != 0.0f) {
@ -970,7 +970,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
}
}
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
struct llama_model_params common_model_params_to_llama(const common_params & params) {
auto mparams = llama_model_default_params();
if (params.n_gpu_layers != -1) {
@ -1022,7 +1022,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
throw std::runtime_error("Invalid cache type: " + s);
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
struct llama_context_params common_context_params_to_llama(const common_params & params) {
auto cparams = llama_context_default_params();
cparams.n_ctx = params.n_ctx;
@ -1112,7 +1112,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
return false;
}
static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@ -1182,15 +1182,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct llama_load_model_from_url_headers {
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
};
llama_load_model_from_url_headers headers;
common_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
@ -1326,7 +1326,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
return true;
}
struct llama_model * llama_load_model_from_url(
struct llama_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
@ -1337,7 +1337,7 @@ struct llama_model * llama_load_model_from_url(
return NULL;
}
if (!llama_download_file(model_url, path_model, hf_token)) {
if (!common_download_file(model_url, path_model, hf_token)) {
return NULL;
}
@ -1390,7 +1390,7 @@ struct llama_model * llama_load_model_from_url(
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
return llama_download_file(split_url, split_path, hf_token);
return common_download_file(split_url, split_path, hf_token);
}, idx));
}
@ -1405,7 +1405,7 @@ struct llama_model * llama_load_model_from_url(
return llama_load_model_from_file(path_model, params);
}
struct llama_model * llama_load_model_from_hf(
struct llama_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
@ -1425,12 +1425,12 @@ struct llama_model * llama_load_model_from_hf(
model_url += "/resolve/main/";
model_url += model;
return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
}
#else
struct llama_model * llama_load_model_from_url(
struct llama_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
@ -1439,7 +1439,7 @@ struct llama_model * llama_load_model_from_url(
return nullptr;
}
struct llama_model * llama_load_model_from_hf(
struct llama_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
@ -1455,11 +1455,11 @@ struct llama_model * llama_load_model_from_hf(
// Batch utils
//
void llama_batch_clear(struct llama_batch & batch) {
void common_batch_clear(struct llama_batch & batch) {
batch.n_tokens = 0;
}
void llama_batch_add(
void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
@ -1482,15 +1482,15 @@ void llama_batch_add(
// Vocab utils
//
std::vector<llama_token> llama_tokenize(
std::vector<llama_token> common_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special) {
return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
}
std::vector<llama_token> llama_tokenize(
std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
@ -1509,7 +1509,7 @@ std::vector<llama_token> llama_tokenize(
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
std::string piece;
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@ -1525,7 +1525,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
return piece;
}
std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@ -1545,15 +1545,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
// Chat template utils
//
bool llama_chat_verify_template(const std::string & tmpl) {
bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string llama_chat_apply_template(const struct llama_model * model,
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & msgs,
const std::vector<common_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
@ -1595,42 +1595,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
return formatted_chat;
}
std::string llama_chat_format_single(const struct llama_model * model,
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
std::vector<common_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}
std::string llama_chat_format_example(const struct llama_model * model,
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<llama_chat_msg> msgs = {
std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
return llama_chat_apply_template(model, tmpl, msgs, true);
return common_chat_apply_template(model, tmpl, msgs, true);
}
//
// KV cache utils
//
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@ -1653,7 +1653,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n");
}
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@ -1705,7 +1705,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
// Embedding utils
//
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
double sum = 0.0;
switch (embd_norm) {
@ -1739,7 +1739,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
}
}
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
double sum = 0.0;
double sum1 = 0.0;
double sum2 = 0.0;
@ -1765,8 +1765,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
// Control vector utils
//
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
llama_control_vector_data result = { -1, {} };
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
common_control_vector_data result = { -1, {} };
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
@ -1850,11 +1850,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
return result;
}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
llama_control_vector_data result = { -1, {} };
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
common_control_vector_data result = { -1, {} };
for (const auto & info : load_infos) {
auto cur = llama_control_vector_load_one(info);
auto cur = common_control_vector_load_one(info);
if (cur.n_embd == -1) {
result.n_embd = -1;
@ -1946,7 +1946,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
}
}
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const auto & sparams = params.sparams;

View file

@ -24,12 +24,12 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct llama_lora_adapter_info {
struct common_lora_adapter_info {
std::string path;
float scale;
};
struct llama_lora_adapter_container : llama_lora_adapter_info {
struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
};
@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
struct common_control_vector_load_info;
//
// CPU utils
@ -82,14 +82,14 @@ enum llama_example {
LLAMA_EXAMPLE_COUNT,
};
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
enum common_sampler_type {
COMMON_SAMPLER_TYPE_NONE = 0,
COMMON_SAMPLER_TYPE_TOP_K = 1,
COMMON_SAMPLER_TYPE_TOP_P = 2,
COMMON_SAMPLER_TYPE_MIN_P = 3,
COMMON_SAMPLER_TYPE_TFS_Z = 4,
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
};
// dimensionality reduction methods, used by cvector-generator
@ -99,7 +99,7 @@ enum dimre_method {
};
// sampler parameters
struct gpt_sampler_params {
struct common_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
@ -124,13 +124,13 @@ struct gpt_sampler_params {
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
std::vector<enum common_sampler_type> samplers = {
COMMON_SAMPLER_TYPE_TOP_K,
COMMON_SAMPLER_TYPE_TFS_Z,
COMMON_SAMPLER_TYPE_TYPICAL_P,
COMMON_SAMPLER_TYPE_TOP_P,
COMMON_SAMPLER_TYPE_MIN_P,
COMMON_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
@ -141,7 +141,7 @@ struct gpt_sampler_params {
std::string print() const;
};
struct gpt_params {
struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -183,7 +183,7 @@ struct gpt_params {
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
struct gpt_sampler_params sparams;
struct common_sampler_params sparams;
std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@ -208,9 +208,9 @@ struct gpt_params {
std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
@ -348,9 +348,9 @@ struct gpt_params {
// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void gpt_init();
void common_init();
std::string gpt_params_get_system_info(const gpt_params & params);
std::string common_params_get_system_info(const common_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
@ -404,29 +404,29 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//
struct llama_init_result {
struct common_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
std::vector<llama_lora_adapter_container> lora_adapters;
std::vector<common_lora_adapter_container> lora_adapters;
};
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct common_init_result common_init_from_params(common_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct llama_model_params common_model_params_to_llama (const common_params & params);
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
// clear LoRA adapters from context, then apply new list of adapters
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
void common_batch_clear(struct llama_batch & batch);
void llama_batch_add(
void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
@ -439,13 +439,13 @@ void llama_batch_add(
// tokenizes a string into a vector of tokens
// should work similarly to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
std::vector<llama_token> common_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);
std::vector<llama_token> llama_tokenize(
std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,
@ -453,7 +453,7 @@ std::vector<llama_token> llama_tokenize(
// converts a token into a piece, optionally rendering special/control tokens
// should work similarly to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
std::string common_token_to_piece(
const struct llama_context * ctx,
llama_token token,
bool special = true);
@ -461,7 +461,7 @@ std::string llama_token_to_piece(
// detokenizes a vector of tokens into a string
// should work similarly to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string llama_detokenize(
std::string common_detokenize(
llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
@ -471,31 +471,31 @@ std::string llama_detokenize(
//
// same as llama_chat_message, but uses std::string
struct llama_chat_msg {
struct common_chat_msg {
std::string role;
std::string content;
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
bool common_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(const struct llama_model * model,
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,
const std::vector<common_chat_msg> & chat,
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
std::string llama_chat_format_single(const struct llama_model * model,
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass);
// Returns an example of formatted chat
std::string llama_chat_format_example(const struct llama_model * model,
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
//
@ -503,31 +503,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
//
// Dump the KV cache view with the number of sequences per cell.
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
//
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
//
// Control vector utils
//
struct llama_control_vector_data {
struct common_control_vector_data {
int n_embd;
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
std::vector<float> data;
};
struct llama_control_vector_load_info {
struct common_control_vector_load_info {
float strength;
std::string fname;
@ -535,7 +535,7 @@ struct llama_control_vector_load_info {
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);
//
// Split utils
@ -554,5 +554,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

View file

@ -8,10 +8,10 @@
#include <thread>
#include <vector>
// global verbosity threshold, initialised to LOG_DEFAULT_LLAMA
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
// overwrite the global verbosity threshold with `verbosity` (plain assignment, no synchronisation)
void gpt_log_set_verbosity_thold(int verbosity) {
gpt_log_verbosity_thold = verbosity;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
#define LOG_COL_DEFAULT "\033[0m"
@ -29,16 +29,16 @@ static int64_t t_us() {
}
// colors
// color identifiers; the enumerators are consecutive integers starting at 0
// and are used as indices into the g_col table of escape sequences
enum gpt_log_col : int {
GPT_LOG_COL_DEFAULT = 0,
GPT_LOG_COL_BOLD,
GPT_LOG_COL_RED,
GPT_LOG_COL_GREEN,
GPT_LOG_COL_YELLOW,
GPT_LOG_COL_BLUE,
GPT_LOG_COL_MAGENTA,
GPT_LOG_COL_CYAN,
GPT_LOG_COL_WHITE,
enum common_log_col : int {
COMMON_LOG_COL_DEFAULT = 0,
COMMON_LOG_COL_BOLD,
COMMON_LOG_COL_RED,
COMMON_LOG_COL_GREEN,
COMMON_LOG_COL_YELLOW,
COMMON_LOG_COL_BLUE,
COMMON_LOG_COL_MAGENTA,
COMMON_LOG_COL_CYAN,
COMMON_LOG_COL_WHITE,
};
// disable colors by default
@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
"",
};
struct gpt_log_entry {
struct common_log_entry {
enum ggml_log_level level;
bool prefix;
@ -71,7 +71,7 @@ struct gpt_log_entry {
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}
@ -86,19 +86,19 @@ struct gpt_log_entry {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
g_col[GPT_LOG_COL_BLUE],
g_col[COMMON_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
g_col[GPT_LOG_COL_DEFAULT]);
g_col[COMMON_LOG_COL_DEFAULT]);
}
switch (level) {
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break;
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break;
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break;
default:
break;
}
@ -107,18 +107,18 @@ struct gpt_log_entry {
fprintf(fcur, "%s", msg.data());
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
}
fflush(fcur);
}
};
struct gpt_log {
struct common_log {
// default capacity - will be expanded if needed
gpt_log() : gpt_log(256) {}
common_log() : common_log(256) {}
gpt_log(size_t capacity) {
common_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;
@ -137,7 +137,7 @@ struct gpt_log {
resume();
}
~gpt_log() {
~common_log() {
pause();
if (file) {
fclose(file);
@ -158,12 +158,12 @@ private:
int64_t t_start;
// ring buffer of entries
std::vector<gpt_log_entry> entries;
std::vector<common_log_entry> entries;
size_t head;
size_t tail;
// worker thread copies into this
gpt_log_entry cur;
common_log_entry cur;
public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {
@ -219,7 +219,7 @@ public:
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<gpt_log_entry> new_entries(2*entries.size());
std::vector<common_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
@ -320,15 +320,15 @@ public:
pause();
if (colors) {
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD;
g_col[COMMON_LOG_COL_RED] = LOG_COL_RED;
g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN;
g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW;
g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE;
g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";
@ -355,47 +355,47 @@ public:
// public API
//
// allocate a new, default-constructed logger on the heap
// the returned pointer is released with the matching *_log_free() call, which deletes it
struct gpt_log * gpt_log_init() {
return new gpt_log;
struct common_log * common_log_init() {
return new common_log;
}
// return a pointer to the shared logger instance
// the instance is a function-local static, so every call yields the same object
struct gpt_log * gpt_log_main() {
static struct gpt_log log;
struct common_log * common_log_main() {
static struct common_log log;
return &log;
}
// forward to log->pause(); `log` is dereferenced without a null check
void gpt_log_pause(struct gpt_log * log) {
void common_log_pause(struct common_log * log) {
log->pause();
}
// forward to log->resume(); `log` is dereferenced without a null check
void gpt_log_resume(struct gpt_log * log) {
void common_log_resume(struct common_log * log) {
log->resume();
}
// destroy a logger with `delete`
// NOTE(review): only valid for heap-allocated loggers (e.g. from *_log_init()); the static
// instance returned by *_log_main() must not be passed here
void gpt_log_free(struct gpt_log * log) {
void common_log_free(struct common_log * log) {
delete log;
}
// printf-style entry point: collects the variadic arguments that follow `fmt` into a va_list
// and hands them, together with `level` and `fmt`, to log->add()
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
// every va_start must be paired with va_end before returning
va_end(args);
}
// forward the file path to log->set_file()
void gpt_log_set_file(struct gpt_log * log, const char * file) {
void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
// forward the colors flag to log->set_colors()
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
}
// forward the prefix flag to log->set_prefix()
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
void common_log_set_prefix(struct common_log * log, bool prefix) {
log->set_prefix(prefix);
}
// forward the timestamps flag to log->set_timestamps()
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}

View file

@ -14,23 +14,23 @@
#define LOG_DEFAULT_LLAMA 0
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity threshold is lower than the message's verbosity
// set via gpt_log_set_verbosity()
extern int gpt_log_verbosity_thold;
// set via common_log_set_verbosity_thold()
extern int common_log_verbosity_thold;
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
void common_log_set_verbosity_thold(int verbosity); // not thread-safe
// the gpt_log uses an internal worker thread to print/write log messages
// the common_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;
struct common_log;
struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void gpt_log_free (struct gpt_log * log);
struct common_log * common_log_init();
struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
void common_log_free (struct common_log * log);
LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
//
// LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
//
// LOG_TMPL(level, verbosity, fmt, ...):
//   the message is passed to the main logger only when `verbosity` does not exceed the global threshold;
//   the arguments in __VA_ARGS__ sit inside the `if`, so they are not evaluated when the check fails
//   the do { ... } while (0) wrapper makes the expansion behave as a single statement
#define LOG_TMPL(level, verbosity, ...) \
do { \
if ((verbosity) <= gpt_log_verbosity_thold) { \
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
if ((verbosity) <= common_log_verbosity_thold) { \
common_log_add(common_log_main(), (level), __VA_ARGS__); \
} \
} while (0)

View file

@ -8,7 +8,7 @@
#include <fstream>
#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();
@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
llama_ngram ngram(&inp[ngram_start], ngram_size);
common_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
llama_ngram_cache_part part;
common_ngram_cache_part part;
part.emplace(token, 1);
ngram_cache.emplace(ngram, part);
} else {
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
if (token_count_it == part_it->second.end()) {
part_it->second.emplace(token, 1);
} else {
@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
return -1;
}
const llama_ngram_cache_part part_static = part_static_it->second;
const common_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0;
int sum_count_static = 0;
@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const llama_ngram ngram_primary = ngrams_primary[i];
const common_ngram ngram_primary = ngrams_primary[i];
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
if (part_primary_it == nc_primary.end()) {
continue;
}
const llama_ngram_cache_part part_primary = part_primary_it->second;
const common_ngram_cache_part part_primary = part_primary_it->second;
int max_count_primary = 0;
int max_count_static = 0;
@ -117,7 +117,7 @@ static llama_token try_draft(
for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first;
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
const int32_t count_primary = token_count_primary.second;
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@ -142,9 +142,9 @@ static llama_token try_draft(
return drafted_token;
}
void llama_ngram_cache_draft(
void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
) {
GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size();
@ -157,21 +157,21 @@ void llama_ngram_cache_draft(
llama_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
llama_ngram ngram_static;
common_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
}
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
llama_ngram_cache_part part_static;
common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
common_ngram_cache_part part_static;
if (part_static_it != nc_static.end()) {
part_static = part_static_it->second;
}
// cd = context + dynamic
std::vector<llama_ngram> ngrams_cd;
std::vector<common_ngram> ngrams_cd;
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
llama_ngram ngram_cd;
common_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
}
@ -196,16 +196,16 @@ void llama_ngram_cache_draft(
}
}
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
const llama_ngram ngram = item.first;
llama_ngram_cache_part token_counts = item.second;
for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
const common_ngram ngram = item.first;
common_ngram_cache_part token_counts = item.second;
GGML_ASSERT(!token_counts.empty());
const int32_t ntokens = token_counts.size();
GGML_ASSERT(ntokens > 0);
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first;
@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
}
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
common_ngram_cache common_ngram_cache_load(std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
}
llama_ngram_cache ngram_cache;
common_ngram_cache ngram_cache;
llama_ngram ngram;
common_ngram ngram;
int32_t ntokens;
llama_token token;
int32_t count;
@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
char * ntokensc = reinterpret_cast<char*>(&ntokens);
char * tokenc = reinterpret_cast<char*>(&token);
char * countc = reinterpret_cast<char*>(&count);
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
GGML_ASSERT(ntokens > 0);
llama_ngram_cache_part token_counts;
common_ngram_cache_part token_counts;
for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof());
@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
return ngram_cache;
}
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
const llama_ngram ngram = ngram_part.first;
llama_ngram_cache_part part = ngram_part.second;
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
const common_ngram ngram = ngram_part.first;
common_ngram_cache_part part = ngram_part.second;
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
if (part_merged_it == ngram_cache_target.end()) {
ngram_cache_target.emplace(ngram, part);
continue;
@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
const int32_t count = token_count.second;
GGML_ASSERT(count > 0);
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
if (token_count_merged_it == part_merged_it->second.end()) {
part_merged_it->second.emplace(token, count);
continue;

View file

@ -12,22 +12,22 @@
// Data structures to map n-grams to empirical token probabilities:
struct llama_ngram {
struct common_ngram {
llama_token tokens[LLAMA_NGRAM_MAX];
llama_ngram() {
common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = -1;
}
}
llama_ngram(const llama_token * input, const int ngram_size) {
common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1;
}
}
bool operator==(const llama_ngram & other) const {
bool operator==(const common_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) {
return false;
@ -37,28 +37,28 @@ struct llama_ngram {
}
};
// Hash functor for a single token: multiplies the token value by a fixed 64-bit constant
// (the multiplier comes from the Fibonacci-hashing article linked below).
struct llama_token_hash_function {
struct common_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};
// Hash functor for an n-gram: XORs together the per-token hashes of all LLAMA_NGRAM_MAX slots.
// NOTE(review): XOR does not depend on operand order, so n-grams holding the same tokens in a
// different order get the same hash value.
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
struct common_ngram_hash_function {
size_t operator()(const common_ngram & ngram) const {
// start from the hash of the first slot ...
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
// ... and fold in the remaining slots
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
hash ^= common_token_hash_function{}(ngram.tokens[i]);
}
return hash;
}
};
// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
// Update an ngram cache with tokens.
@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
void common_ngram_cache_update(
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
@ -81,21 +81,21 @@ void llama_ngram_cache_update(
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
// Load an ngram cache saved with llama_ngram_cache_save.
// Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
common_ngram_cache common_ngram_cache_load(std::string & filename);
// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);

View file

@ -98,8 +98,8 @@ struct ring_buffer {
std::vector<T> data;
};
struct gpt_sampler {
gpt_sampler_params params;
struct common_sampler {
common_sampler_params params;
struct llama_sampler * grmr;
struct llama_sampler * chain;
@ -125,7 +125,7 @@ struct gpt_sampler {
}
};
std::string gpt_sampler_params::print() const {
std::string common_sampler_params::print() const {
char result[1024];
snprintf(result, sizeof(result),
@ -139,12 +139,12 @@ std::string gpt_sampler_params::print() const {
return std::string(result);
}
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf;
auto * result = new gpt_sampler {
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams),
@ -175,22 +175,22 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K:
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case GPT_SAMPLER_TYPE_TOP_P:
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_MIN_P:
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TFS_Z:
case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TYPICAL_P:
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case GPT_SAMPLER_TYPE_TEMPERATURE:
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
default:
@ -224,7 +224,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
return result;
}
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
void common_sampler_free(struct common_sampler * gsmpl) {
if (gsmpl) {
llama_sampler_free(gsmpl->grmr);
@ -234,7 +234,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
}
}
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
@ -244,14 +244,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
gsmpl->prev.push_back(token);
}
// reset both underlying llama samplers (grammar and chain) via llama_sampler_reset()
// only `grmr` and `chain` are touched here; `gsmpl` is dereferenced without a null check
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
void common_sampler_reset(struct common_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr);
llama_sampler_reset(gsmpl->chain);
}
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
return new gpt_sampler {
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
/* .chain = */ llama_sampler_clone(gsmpl->chain),
@ -261,7 +261,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
};
}
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance
if (gsmpl) {
@ -272,7 +272,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
}
}
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr;
@ -318,21 +318,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
return cur_p.data[cur_p.selected].id;
}
// return the seed reported by llama_sampler_get_seed() for the sampler chain
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
return llama_sampler_get_seed(gsmpl->chain);
}
// helpers
// return a pointer to the sampler's own `cur_p` member (not a copy)
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p;
}
// return gsmpl->prev.rat(0)
// NOTE(review): presumably the most recently accepted token - confirm against ring_buffer::rat
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
return gsmpl->prev.rat(0);
}
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
std::string common_sampler_print(const struct common_sampler * gsmpl) {
std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
@ -343,7 +343,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
return result;
}
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
n = std::min(n, (int) gsmpl->prev.size());
if (n <= 0) {
@ -358,63 +358,63 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
result += llama_token_to_piece(ctx_main, id);
result += common_token_to_piece(ctx_main, id);
}
return result;
}
// map a sampler type to its one-character code
// returns '?' for any value that is not listed below
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return 'k';
case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
case COMMON_SAMPLER_TYPE_TFS_Z: return 'f';
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
default : return '?';
}
}
// map a sampler type to its name string
// returns an empty string for any value that is not listed below
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
switch (cnstr) {
case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z";
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
{ "top_k", GPT_SAMPLER_TYPE_TOP_K },
{ "top_p", GPT_SAMPLER_TYPE_TOP_P },
{ "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
};
// sampler names can be written in several ways,
// so accept both the canonical (system) names and alternative input spellings
std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
{ "top-k", GPT_SAMPLER_TYPE_TOP_K },
{ "top-p", GPT_SAMPLER_TYPE_TOP_P },
{ "nucleus", GPT_SAMPLER_TYPE_TOP_P },
{ "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", GPT_SAMPLER_TYPE_MIN_P },
{ "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z },
{ "tfs", COMMON_SAMPLER_TYPE_TFS_Z },
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
};
std::vector<gpt_sampler_type> samplers;
std::vector<common_sampler_type> samplers;
samplers.reserve(names.size());
for (const auto & name : names) {
@ -434,17 +434,17 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
return samplers;
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
std::unordered_map<char, common_sampler_type> sampler_name_map = {
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
};
std::vector<gpt_sampler_type> samplers;
std::vector<common_sampler_type> samplers;
samplers.reserve(chars.size());
for (const auto & c : chars) {

View file

@ -7,7 +7,7 @@
#include <string>
#include <vector>
// gpt_sampler extends llama_sampler with additional functionality:
// common_sampler extends llama_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters
@ -23,30 +23,30 @@
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//
struct gpt_sampler;
struct common_sampler;
// llama_sampler API overloads
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
void gpt_sampler_free(struct gpt_sampler * gsmpl);
void common_sampler_free(struct common_sampler * gsmpl);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
void gpt_sampler_reset (struct gpt_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
// arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
// extended sampling implementation:
//
@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
llama_token common_sampler_last(const struct common_sampler * gsmpl);
// print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
std::string common_sampler_print(const struct common_sampler * gsmpl);
// get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);