Merge branch 'ggerganov:master' into prompt-lookup
This commit is contained in:
commit
340484161f
62 changed files with 10127 additions and 2195 deletions
|
@ -278,8 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.yarn_beta_slow = std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--samplers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.samplers_sequence = parse_samplers_input(argv[i]);
|
||||
} else if (arg == "--sampling-seq") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.samplers_sequence = argv[i];
|
||||
} else if (arg == "--top-p") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -498,6 +508,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.infill = true;
|
||||
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
||||
params.dump_kv_cache = true;
|
||||
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||
params.no_kv_offload = true;
|
||||
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||
params.cache_type_k = argv[++i];
|
||||
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
||||
params.cache_type_v = argv[++i];
|
||||
} else if (arg == "--multiline-input") {
|
||||
params.multiline_input = true;
|
||||
} else if (arg == "--simple-io") {
|
||||
|
@ -640,6 +656,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
} else if (arg == "-h" || arg == "--help") {
|
||||
return false;
|
||||
|
||||
} else if (arg == "--version") {
|
||||
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
} else if (arg == "--in-prefix-bos") {
|
||||
|
@ -678,6 +698,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(sparams.grammar)
|
||||
);
|
||||
} else if (arg == "--override-kv") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
char * sep = strchr(argv[i], '=');
|
||||
if (sep == nullptr || sep - argv[i] >= 128) {
|
||||
fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
struct llama_model_kv_override kvo;
|
||||
std::strncpy(kvo.key, argv[i], sep - argv[i]);
|
||||
kvo.key[sep - argv[i]] = 0;
|
||||
sep++;
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.kv_overrides.push_back(kvo);
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
// Parse args for logging parameters
|
||||
} else if ( log_param_single_parse( argv[i] ) ) {
|
||||
|
@ -721,6 +782,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
}
|
||||
}
|
||||
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -732,6 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
|
@ -761,6 +828,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
|
||||
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
|
||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||
|
@ -798,8 +867,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
|
@ -840,6 +907,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
printf(" -dkvc, --dump-kv-cache\n");
|
||||
printf(" verbose print of the KV cache\n");
|
||||
printf(" -nkvo, --no-kv-offload\n");
|
||||
printf(" disable KV offload\n");
|
||||
printf(" -ctk TYPE, --cache-type-k TYPE\n");
|
||||
printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
|
||||
printf(" -ctv TYPE, --cache-type-v TYPE\n");
|
||||
printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
|
||||
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
||||
|
@ -850,6 +923,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
|
||||
printf(" -ld LOGDIR, --logdir LOGDIR\n");
|
||||
printf(" path under which to save YAML logs (no logging if unset)\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf("\n");
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_print_usage();
|
||||
|
@ -886,6 +962,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
|||
GGML_UNREACHABLE();
|
||||
}
|
||||
|
||||
//
|
||||
// String parsing
|
||||
//
|
||||
|
||||
// Convert a ';'-separated list of sampler names into the compact sampler
// sequence string, one character per recognized sampler, in input order.
//
// Several spellings are accepted for each sampler (see the table below).
// Segments that match no known name - including empty segments produced by
// leading, trailing or doubled ';' - are silently skipped.
//
// expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
std::string parse_samplers_input(std::string input) {
    // since samplers names are written multiple ways
    // make it ready for both system names and input names
    // (built once; the table is read-only)
    static const std::unordered_map<std::string, char> samplers_symbols {
        {"top_k",       'k'},
        {"top-k",       'k'},
        {"top_p",       'p'},
        {"top-p",       'p'},
        {"nucleus",     'p'},
        {"typical_p",   'y'},
        {"typical-p",   'y'},
        {"typical",     'y'},
        {"min_p",       'm'},
        {"min-p",       'm'},
        {"tfs_z",       'f'},
        {"tfs-z",       'f'},
        {"tfs",         'f'},
        {"temp",        't'},
        {"temperature", 't'}
    };

    std::string output;
    size_t begin = 0;
    while (true) {
        const size_t separator = input.find(';', begin);
        const bool   is_last   = separator == std::string::npos;
        // the final segment runs to the end of the string
        const size_t length    = is_last ? std::string::npos : separator - begin;

        // single lookup: reuse the iterator instead of find + operator[]
        const auto it = samplers_symbols.find(input.substr(begin, length));
        if (it != samplers_symbols.end()) {
            output += it->second;
        }

        if (is_last) {
            break;
        }
        begin = separator + 1;
    }
    return output;
}
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
@ -900,10 +1018,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
|||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
|
||||
mparams.kv_overrides = params.kv_overrides.data();
|
||||
}
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
// Resolve a KV cache type name to the matching ggml_type.
// Recognized names: "f16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1".
// Any other string results in a std::runtime_error.
static ggml_type kv_cache_type_from_str(const std::string & s) {
    struct cache_type_entry {
        const char * name;
        ggml_type    type;
    };

    static const cache_type_entry known_types[] = {
        { "f16",  GGML_TYPE_F16  },
        { "q8_0", GGML_TYPE_Q8_0 },
        { "q4_0", GGML_TYPE_Q4_0 },
        { "q4_1", GGML_TYPE_Q4_1 },
        { "q5_0", GGML_TYPE_Q5_0 },
        { "q5_1", GGML_TYPE_Q5_1 },
    };

    for (const auto & entry : known_types) {
        if (s == entry.name) {
            return entry.type;
        }
    }

    throw std::runtime_error("Invalid cache type: " + s);
}
|
||||
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
auto cparams = llama_context_default_params();
|
||||
|
||||
|
@ -913,7 +1060,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
cparams.seed = params.seed;
|
||||
cparams.f16_kv = params.memory_f16;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embedding = params.embedding;
|
||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||
|
@ -924,6 +1070,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||
cparams.offload_kqv = !params.no_kv_offload;
|
||||
|
||||
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
|
||||
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
|
||||
|
||||
return cparams;
|
||||
}
|
||||
|
@ -1336,7 +1486,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
}
|
||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||
|
|
|
@ -86,6 +86,8 @@ struct gpt_params {
|
|||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
|
||||
// TODO: avoid tuple, use struct
|
||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
@ -98,7 +100,6 @@ struct gpt_params {
|
|||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
|
@ -123,10 +124,14 @@ struct gpt_params {
|
|||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool infill = false; // use infill mode
|
||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
|
||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::string image = ""; // path to an image file
|
||||
};
|
||||
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
|
||||
|
@ -141,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
|
|||
|
||||
void process_escapes(std::string& input);
|
||||
|
||||
//
|
||||
// String parsing
|
||||
//
|
||||
|
||||
std::string parse_samplers_input(std::string input);
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
|
|
@ -61,13 +61,13 @@
|
|||
// #define LOG_TARGET stderr
|
||||
// #include "log.h"
|
||||
//
|
||||
// The log target can also be redirected to a diffrent function
|
||||
// The log target can also be redirected to a different function
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_diffrent()
|
||||
// #define LOG_TARGET log_handler_different()
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_diffrent()
|
||||
// FILE* log_handler_different()
|
||||
// {
|
||||
// return stderr;
|
||||
// }
|
||||
|
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
|
|||
|
||||
// Disables logs entirely at runtime.
|
||||
// Makes LOG() and LOG_TEE() produce no output,
|
||||
// untill enabled back.
|
||||
// until enabled back.
|
||||
#define log_disable() log_disable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
|
|
|
@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
|
|||
return std::string(result);
|
||||
}
|
||||
|
||||
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||
std::string result = "CFG -> Penalties ";
|
||||
if (params.mirostat == 0) {
|
||||
for (auto s : params.samplers_sequence) {
|
||||
switch (s) {
|
||||
case 'k': result += "-> top_k "; break;
|
||||
case 'f': result += "-> tfs_z "; break;
|
||||
case 'y': result += "-> typical_p "; break;
|
||||
case 'p': result += "-> top_p "; break;
|
||||
case 'm': result += "-> min_p "; break;
|
||||
case 't': result += "-> temp "; break;
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result += "-> mirostat ";
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// no reasons to expose this function in header
|
||||
static void sampler_queue(
|
||||
struct llama_context * ctx_main,
|
||||
const llama_sampling_params & params,
|
||||
llama_token_data_array & cur_p,
|
||||
size_t & min_keep) {
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const std::string & samplers_sequence = params.samplers_sequence;
|
||||
|
||||
for (auto s : samplers_sequence) {
|
||||
switch (s){
|
||||
case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
||||
case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
||||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llama_token llama_sampling_sample(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
|
@ -109,11 +159,6 @@ llama_token llama_sampling_sample(
|
|||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
||||
const float penalty_repeat = params.penalty_repeat;
|
||||
const float penalty_freq = params.penalty_freq;
|
||||
|
@ -188,12 +233,7 @@ llama_token llama_sampling_sample(
|
|||
// temperature sampling
|
||||
size_t min_keep = std::max(1, params.n_probs);
|
||||
|
||||
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
|
||||
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
|
||||
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
|
||||
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
|
||||
llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
|
||||
llama_sample_temp (ctx_main, &cur_p, temp);
|
||||
sampler_queue(ctx_main, params, cur_p, min_keep);
|
||||
|
||||
id = llama_sample_token(ctx_main, &cur_p);
|
||||
|
||||
|
|
|
@ -10,22 +10,23 @@
|
|||
|
||||
// sampling parameters
|
||||
typedef struct llama_sampling_params {
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
|
@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
|
|||
// Print sampling parameters into a string
|
||||
std::string llama_sampling_print(const llama_sampling_params & params);
|
||||
|
||||
// Print sampling order into a string
|
||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||
|
|
|
@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
|
|||
|
||||
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
|
||||
float scale = 1.0f; // xavier
|
||||
switch (tensor->n_dims) {
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
scale /= sqrtf((float) tensor->ne[0]);
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
|
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
|
|||
}
|
||||
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
|
||||
switch (tensor->n_dims) {
|
||||
switch (ggml_n_dims(tensor)) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
|
||||
|
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
|
|||
}
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
GGML_ASSERT(tensor->n_dims == 1);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == 1);
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
}
|
||||
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
GGML_ASSERT(tensor->n_dims == 2);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
}
|
||||
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
GGML_ASSERT(tensor->n_dims == 3);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
}
|
||||
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
GGML_ASSERT(tensor->n_dims == 4);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
|
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
|
|||
bool sample_random_offsets
|
||||
) {
|
||||
GGML_ASSERT(samples_count > 0);
|
||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||
GGML_ASSERT(target_probs->n_dims == 3);
|
||||
GGML_ASSERT(ggml_is_matrix(tokens_input));
|
||||
GGML_ASSERT(ggml_is_3d(target_probs));
|
||||
int64_t n_vocab = target_probs->ne[0];
|
||||
int64_t n_tokens = tokens_input->ne[0];
|
||||
int64_t n_batch = tokens_input->ne[1];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue