llama : remove sampling from llama_context
ggml-ci
parent cc53500f65
commit ae9d3f68e9
25 changed files with 75 additions and 137 deletions
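With this change a llama_context no longer owns the sampling state: the llama_get_sampling() accessor and the seed field in llama_context_params are removed, the RNG seed lives in the sampling parameters (sparams.seed), and callers create and free their own llama_sampling object. A minimal caller-side sketch of the migration, using only the APIs visible in the diff below; the surrounding model loading and decode loop are assumed to exist:

    // before (removed):
    //   llama_context_params cparams = llama_context_default_params();
    //   cparams.seed = 1234;                              // field no longer exists
    //   llama_context  * ctx  = llama_new_context_with_model(model, cparams);
    //   llama_sampling * smpl = llama_get_sampling(ctx);  // API no longer exists

    // after: the caller owns the sampling state explicitly
    llama_context_params cparams = llama_context_default_params();
    llama_context  * ctx  = llama_new_context_with_model(model, cparams);
    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr); // no grammar

    // ... tokenize, llama_decode, sample ...

    llama_print_timings(ctx, smpl); // timings now take the sampling instance (nullptr is allowed)

    llama_sampling_free(smpl);
    llama_free(ctx);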
@@ -264,6 +264,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         params.kv_overrides.back().key[0] = 0;
     }

+    if (params.sparams.seed == LLAMA_DEFAULT_SEED) {
+        params.sparams.seed = time(NULL);
+    }
+
     return true;
 }

@@ -294,8 +298,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa

     if (arg == "-s" || arg == "--seed") {
         CHECK_ARG
-        // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
-        params.seed = std::stoul(argv[i]);
         sparams.seed = std::stoul(argv[i]);
         return true;
     }
@@ -1414,7 +1416,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
-    options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
     options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
     options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
@@ -1465,6 +1466,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

     options.push_back({ "sampling" });
+    options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", sparams.seed });
     options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
     "(default: %s)", sampler_type_names.c_str() });
     options.push_back({ "*", " --sampling-seq SEQUENCE",
@@ -2239,7 +2241,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
@@ -3249,7 +3250,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l

     fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
     fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
@@ -68,8 +68,6 @@ enum dimre_method {
 };

 struct gpt_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
-
     int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
@@ -3,19 +3,10 @@
 #include <random>

 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, const struct llama_model * model) {
-    auto result = llama_sampling_init(params, llama_sampling_init(model, params.grammar.c_str(), "root"));
-
-    result->owned = true;
-
-    return result;
-}
-
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl) {
     struct llama_sampling_context * result = new llama_sampling_context();

     result->params = params;
-    result->owned = false;
-    result->smpl = smpl;
+    result->smpl = llama_sampling_init(model, params.grammar.c_str(), "root");

     result->prev.resize(params.n_prev);

@@ -27,9 +18,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 }

 void llama_sampling_free(struct llama_sampling_context * ctx) {
-    if (ctx->owned) {
-        llama_sampling_free(ctx->smpl);
-    }
+    llama_sampling_free(ctx->smpl);

     delete ctx;
 }
@@ -71,8 +71,6 @@ struct llama_sampling_context {
     // mirostat sampler state
     float mirostat_mu;

-    bool owned;
-
     llama_sampling * smpl;

     // TODO: replace with ring-buffer
@@ -86,7 +84,6 @@ struct llama_sampling_context {

 // Create a new sampling context instance.
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, const struct llama_model * model);
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params, struct llama_sampling * smpl);

 void llama_sampling_free(struct llama_sampling_context * ctx);

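For code that goes through the common-layer helpers above, llama_sampling_context is now built from the model rather than from a llama_sampling fetched out of the context, and llama_sampling_accept() drops the llama_context argument. A short sketch of the updated flow, assuming a gpt_params `params`, a loaded `model`, and a `ctx` as in the example programs below:

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model);

    const llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL);
    llama_sampling_accept(ctx_sampling, id, true); // no llama_context argument anymore

    llama_sampling_free(ctx_sampling);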
@@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
     print("Failed to load model")
     exit(1)
 }
-
 defer {
     llama_free_model(model)
 }
@@ -37,24 +36,29 @@ var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

 var context_params = llama_context_default_params()
-context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8

 let context = llama_new_context_with_model(model, context_params)
-let smpl = llama_get_sampling(context)
-
 guard context != nil else {
     print("Failed to initialize context")
     exit(1)
 }

 defer {
     llama_free(context)
 }
-
+let smpl = llama_sampling_init(model, nil, nil)
+guard smpl != nil else {
+    print("Failed to initialize sampling")
+    exit(1)
+}
+defer {
+    llama_sampling_free(smpl)
+}
+
 let n_ctx = llama_n_ctx(context)

 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_batch = std::max(n_predict, n_parallel);

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);

     if (ctx == NULL) {
         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@@ -90,13 +90,7 @@ int main(int argc, char ** argv) {

     print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

     llama_backend_init();
     llama_numa_init(params.numa);
@@ -151,8 +151,6 @@ int main(int argc, char ** argv) {

     print_build_info();

-    std::mt19937 rng(params.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -92,11 +92,10 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     return result;
 }

-static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
+static std::string generate(llama_context * ctx, llama_sampling * smpl, const std::string & prompt, bool stream) {
     std::string result;

     const llama_model * model = llama_get_model(ctx);
-    llama_sampling * smpl = llama_get_sampling(ctx);
     llama_token eos_token = llama_token_eos(model);

     llama_kv_cache_clear(ctx);
@@ -117,7 +116,7 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
         inputs.clear();

         llama_decode(ctx, bat);
-        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
+        auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

         auto candidates = std::vector<llama_token_data>(llama_n_vocab(model));
         auto n_candidates = (int32_t)candidates.size();
@@ -173,6 +172,8 @@ int main(int argc, char * argv[]) {
     // create generation context
     llama_context * ctx = llama_new_context_with_model(model, cparams);

+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);
+
     // ### Embedding/Representation ###
     // samples taken from: https://github.com/ContextualAI/gritlm#basic
     {
@@ -209,9 +210,10 @@ int main(int argc, char * argv[]) {
     // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {
         const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
-        std::string response = generate(ctx, prompt, true);
+        std::string response = generate(ctx, smpl, prompt, true);
     }

+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();
@@ -156,16 +156,9 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
@@ -351,7 +344,7 @@ int main(int argc, char ** argv) {

     std::vector<llama_token> embd;

-    ctx_sampling = llama_sampling_init(sparams, llama_get_sampling(ctx));
+    ctx_sampling = llama_sampling_init(sparams, model);

     while (n_remain != 0 || params.interactive) {
         // predict
@@ -120,7 +120,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
     LOGi("Using %d threads", n_threads);

     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed = 1234;
     ctx_params.n_ctx = 2048;
     ctx_params.n_threads = n_threads;
     ctx_params.n_threads_batch = n_threads;
@@ -380,12 +379,13 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
+        jlong sampling_pointer,
         jlong batch_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto sampling = reinterpret_cast<llama_sampling *>(llama_get_sampling(context));
+    const auto sampling = reinterpret_cast<llama_sampling *>(sampling_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
     const auto model = llama_get_model(context);

@@ -43,14 +43,14 @@ actor LlamaContext {
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
         self.temporary_invalid_cchars = []
-        self.sampling = llama_get_sampling(context)
+        self.sampling = llama_sampling_init(context, nil, nil);
     }

     deinit {
+        llama_sampling_free(sampling)
         llama_batch_free(batch)
         llama_free(context)
         llama_free_model(model)
-        llama_sampling_free(sampling)
         llama_backend_free()
     }

@@ -72,7 +72,6 @@ actor LlamaContext {
         print("Using \(n_threads) threads")

         var ctx_params = llama_context_default_params()
-        ctx_params.seed = 1234
         ctx_params.n_ctx = 2048
         ctx_params.n_threads = UInt32(n_threads)
         ctx_params.n_threads_batch = UInt32(n_threads)
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

     LOG_TEE("\n");

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, llama_get_sampling(ctx_llava->ctx_llama));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, ctx_llava->model);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
@@ -161,7 +161,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
                            struct llama_context * ctx_llama,
                            int * n_past) {
     const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    llama_sampling_accept(ctx_sampling, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";
@@ -218,7 +218,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla

     LOG_TEE("\n");

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams, ctx_llava->model);
     return ctx_sampling;
 }

@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
         }
     }
     printf("\n");
-    llama_print_timings(ctx_llava->ctx_llama);
+    llama_print_timings(ctx_llava->ctx_llama, nullptr);

     ctx_llava->model = NULL;
     llava_free(ctx_llava);
@@ -1,7 +1,6 @@
 #include "common.h"
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -118,7 +117,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model);

     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
@@ -3,13 +3,11 @@
 #include "common.h"
 #include "ngram-cache.h"

-#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>

 int main(int argc, char ** argv){
     gpt_params params;
@@ -106,7 +104,7 @@ int main(int argc, char ** argv){

     bool has_eos = false;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model);

     std::vector<llama_token> draft;

@@ -183,16 +183,9 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
@@ -535,7 +528,7 @@ int main(int argc, char ** argv) {
         antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
     }

-    ctx_sampling = llama_sampling_init(sparams, llama_get_sampling(ctx));
+    ctx_sampling = llama_sampling_init(sparams, model);
     if (!ctx_sampling) {
         fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
@@ -26,8 +26,6 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
-
     int n_junk = params.n_junk;
     int n_keep = params.n_keep;
     int n_grp = params.grp_attn_n;
@@ -85,7 +83,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);

     // tokenize the prompt
     std::vector<llama_token> tokens_list;
@@ -274,6 +272,7 @@ int main(int argc, char ** argv) {

     llama_batch_free(batch);

+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);

@@ -2007,13 +2007,7 @@ int main(int argc, char ** argv) {

     print_build_info();

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);

     llama_backend_init();
     llama_numa_init(params.numa);
@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
     }

     auto cparams = llama_context_default_params();
     cparams.n_ctx = 256;
-    cparams.seed = 1;

     ctx = llama_new_context_with_model(model, cparams);

@@ -3,7 +3,6 @@

 #include <vector>
 #include <cstdio>
-#include <chrono>

 int main(int argc, char ** argv) {
     gpt_params params;
@@ -38,7 +37,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);

     // tokenize prompt
     auto tokens = llama_tokenize(ctx, params.prompt, true);
@@ -98,7 +97,7 @@ int main(int argc, char ** argv) {
     // make new context
     auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampling * smpl2 = llama_get_sampling(ctx2);
+    llama_sampling * smpl2 = llama_sampling_init(model, nullptr, nullptr);

     printf("\nsecond run: %s", params.prompt.c_str());

@@ -163,7 +162,7 @@ int main(int argc, char ** argv) {
     // make new context
     auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampling * smpl3 = llama_get_sampling(ctx3);
+    llama_sampling * smpl3 = llama_sampling_init(model, nullptr, nullptr);

     printf("\nsingle seq run: %s", params.prompt.c_str());

@@ -246,6 +245,10 @@ int main(int argc, char ** argv) {

     printf("\n");

+    llama_sampling_free(smpl);
+    llama_sampling_free(smpl2);
+    llama_sampling_free(smpl3);
+
     llama_free(ctx3);
     llama_free_model(model);

@@ -55,7 +55,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    llama_sampling * smpl = llama_get_sampling(ctx);
+    llama_sampling * smpl = llama_sampling_init(model, nullptr, nullptr);

     // tokenize the prompt

@@ -168,6 +168,7 @@ int main(int argc, char ** argv) {

     llama_batch_free(batch);

+    llama_sampling_free(smpl);
     llama_free(ctx);
     llama_free_model(model);

@@ -43,10 +43,7 @@ int main(int argc, char ** argv) {
     // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
     const float p_split = params.p_split;

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-    std::default_random_engine rng(params.seed);
+    std::default_random_engine rng(params.sparams.seed);
     std::uniform_real_distribution<> u_dist;

 #ifndef LOG_DISABLE_LOGS
@@ -179,7 +176,7 @@ int main(int argc, char ** argv) {
     bool has_eos = false;

     // target model sampling context (reuse the llama_context's sampling instance)
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, llama_get_sampling(ctx_tgt));
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams, model_tgt);

     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
@@ -300,7 +300,6 @@ extern "C" {
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
-        uint32_t seed;     // RNG seed, -1 for random
         uint32_t n_ctx;    // text context, 0 = from model
         uint32_t n_batch;  // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch; // physical maximum batch size
@@ -407,6 +406,7 @@ extern "C" {

     LLAMA_API void llama_free_model(struct llama_model * model);

+    // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
                      struct llama_model * model,
             struct llama_context_params   params);
@@ -432,8 +432,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer (const struct llama_model * model);

-    LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
-    LLAMA_API struct llama_sampling * llama_get_sampling( struct llama_context * ctx);
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
     LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
@@ -663,7 +662,7 @@ extern "C" {
     //

     // Returns the *actual* size in bytes of the state
-    // (rng, logits, embedding and kv_cache)
+    // (logits, embedding and kv_cache)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -2673,7 +2673,6 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model)
         : model(model)
-        , sampling(model.vocab, nullptr, nullptr) // by default, no grammar
         , t_start_us(model.t_start_us)
         , t_load_us(model.t_load_us) {}

@@ -2690,7 +2689,6 @@ struct llama_context {
     const struct llama_model & model;

     struct llama_cparams cparams;
-    struct llama_sampling sampling;
     struct llama_kv_cache kv_self;
     struct llama_control_vector cvec;

@@ -16442,7 +16440,6 @@ struct llama_model_params llama_model_default_params() {

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed     =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx    =*/ 512,
         /*.n_batch  =*/ 2048,
         /*.n_ubatch =*/ 512,
@@ -16721,10 +16718,6 @@ struct llama_context * llama_new_context_with_model(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
     LLAMA_LOG_INFO("%s: n_ctx    = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_batch  = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
@@ -16735,8 +16728,6 @@ struct llama_context * llama_new_context_with_model(
     ctx->abort_callback      = params.abort_callback;
     ctx->abort_callback_data = params.abort_callback_data;

-    llama_sampling_set_rng_seed_impl(ctx->sampling, params.seed);
-
     ctx->logits_all = params.logits_all;

     // build worst-case graph for encoder if a model contains encoder
@@ -17056,10 +17047,6 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }

-struct llama_sampling * llama_get_sampling(struct llama_context * ctx) {
-    return &ctx->sampling;
-}
-
 enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
     return ctx->cparams.pooling_type;
 }
@@ -17532,14 +17519,14 @@ struct llama_data_write {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }

-    void write_rng(const std::mt19937 & rng) {
-        std::ostringstream rng_ss;
-        rng_ss << rng;
+    //void write_rng(const std::mt19937 & rng) {
+    //    std::ostringstream rng_ss;
+    //    rng_ss << rng;

-        const std::string & rng_str = rng_ss.str();
+    //    const std::string & rng_str = rng_ss.str();

-        write_string(rng_str);
-    }
+    //    write_string(rng_str);
+    //}

     void write_output_ids(const struct llama_context * ctx) {
         const uint32_t n_outputs = ctx->n_outputs;
@@ -17757,17 +17744,17 @@ struct llama_data_read {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }

-    void read_rng(std::mt19937 & rng) {
-        std::string rng_str;
-        read_string(rng_str);
+    //void read_rng(std::mt19937 & rng) {
+    //    std::string rng_str;
+    //    read_string(rng_str);

-        std::istringstream rng_ss(rng_str);
-        rng_ss >> rng;
+    //    std::istringstream rng_ss(rng_str);
+    //    rng_ss >> rng;

-        if (rng_ss.fail()) {
-            throw std::runtime_error("failed to load RNG state");
-        }
-    }
+    //    if (rng_ss.fail()) {
+    //        throw std::runtime_error("failed to load RNG state");
+    //    }
+    //}

     void read_output_ids(struct llama_context * ctx) {
         std::vector<int32_t> output_pos;
@@ -18181,8 +18168,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da

     data_ctx.write_model_info(ctx);

-    data_ctx.write_rng(ctx->sampling.rng);
-
     // copy outputs
     data_ctx.write_output_ids(ctx);
     data_ctx.write_logits(ctx);
@@ -18220,9 +18205,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da

     data_ctx.read_model_info(ctx);

-    // set rng
-    data_ctx.read_rng(ctx->sampling.rng);
-
     // set outputs
     data_ctx.read_output_ids(ctx);
     data_ctx.read_logits(ctx);
@@ -19261,12 +19243,12 @@ void llama_print_timings(struct llama_context * ctx, struct llama_sampling * smp
         /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
         /*.t_end_ms   =*/ 1.00 * ggml_time_ms(),
         /*.t_load_ms  =*/ 1e-3 * ctx->t_load_us,
-        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : ctx->sampling.t_total_us),
+        /*.t_sampling_ms =*/ 1e-3 * (smpl ? smpl->t_total_us : 0.0),
         /*.t_grammar_ms  =*/ 1e-3 * (smpl && smpl->grammar ? smpl->grammar->t_total_us : 0.0),
         /*.t_p_eval_ms   =*/ 1e-3 * ctx->t_p_eval_us,
         /*.t_eval_ms     =*/ 1e-3 * ctx->t_eval_us,

-        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample : ctx->sampling.n_sample),
+        /*.n_sampling =*/ std::max(0, smpl ? smpl->n_sample : 0),
         /*.n_grammar_sample =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_sample : 0),
         /*.n_grammar_accept =*/ std::max(0, smpl && smpl->grammar ? smpl->grammar->n_accept : 0),
         /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),