diff --git a/common/common.cpp b/common/common.cpp
index ce14d66b8..eb5ebb20f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -572,7 +572,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.grammar = argv[i];
+            sparams.grammar = argv[i];
         } else if (arg == "--grammar-file") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
@@ -587,7 +587,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.grammar)
+                std::back_inserter(sparams.grammar)
             );
 #ifndef LOG_DISABLE_LOGS
         // Parse args for logging parameters
@@ -640,7 +640,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    const llama_sampling_params & sparams = params.sampling_params;
+    const llama_sampling_params & sparams = params.sparams;
 
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -878,7 +878,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }
 
     if (params.ignore_eos) {
-        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }
 
     {
@@ -1123,28 +1123,28 @@ std::string get_sortable_timestamp() {
 
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const llama_sampling_params & sparams = params.sampling_params;
+    const llama_sampling_params & sparams = params.sparams;
 
     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
 
 #ifdef NDEBUG
     fprintf(stream, "debug: false\n");
@@ -1179,7 +1179,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
     fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
-    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
+    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
diff --git a/common/common.h b/common/common.h
index 65d3d20cd..84523a4fb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -56,7 +56,7 @@ struct gpt_params {
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
 
     // // sampling parameters
-    struct llama_sampling_params sampling_params;
+    struct llama_sampling_params sparams;
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
@@ -66,7 +66,6 @@ struct gpt_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
-    std::string grammar = ""; // optional BNF-like grammar to constrain sampling
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
     std::string logdir = ""; // directory in which to save YAML log files
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 0b2466581..7a10ddbd0 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,9 +1,9 @@
 #include "sampling.h"
 
-struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
-    result->params = params.sampling_params;
+    result->params = params;
     result->grammar = nullptr;
 
     // if there is a grammar, parse it
@@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa
                 grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
     }
 
-    result->prev.resize(params.n_ctx);
+    result->prev.resize(params.n_prev);
 
     return result;
 }
diff --git a/common/sampling.h b/common/sampling.h
index 50afcbc12..80b6a6b57 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -22,6 +22,7 @@ typedef struct llama_sampling_params {
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
+    int32_t n_prev = 256; // number of previous tokens to remember
 
     bool penalize_nl = true; // consider newlines as a repeatable token
 
@@ -34,6 +35,8 @@ typedef struct llama_sampling_params {
 
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 
+    std::string grammar = ""; // optional BNF-like grammar to constrain sampling
+
 } llama_sampling_params;
 
 // general sampler context
@@ -58,7 +61,7 @@ struct llama_sampling_context {
 
 #include "common.h"
 
 // Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);
+struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
 
 void llama_sampling_free(struct llama_sampling_context * ctx);
 
diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp
index 3ce33842c..c150844cf 100644
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -128,7 +128,7 @@ bool eval_string(struct MyModel * mymodel,const char* str){
 llama_token sampling_id(struct MyModel* mymodel) {
     llama_context* ctx = mymodel->ctx;
     gpt_params params = mymodel->params;
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
     // int n_ctx = llama_n_ctx(ctx);
 
     // out of user input, sample next token
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 128d67080..00843fef7 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -39,8 +39,8 @@ static gpt_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-static bool is_interacting = false;
+static bool is_interacting = false;
 
 static void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
@@ -104,7 +104,7 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
     gpt_params params;
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
     g_params = &params;
 
     if (!gpt_params_parse(argc, argv, params)) {
@@ -363,31 +363,6 @@ int main(int argc, char ** argv) {
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 
-    struct llama_grammar * grammar = NULL;
-    grammar_parser::parse_state parsed_grammar;
-
-    if (!params.grammar.empty()) {
-        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-        // will be empty (default) if there are parse errors
-        if (parsed_grammar.rules.empty()) {
-            return 1;
-        }
-        LOG_TEE("%s: grammar:\n", __func__);
-        grammar_parser::print_grammar(stderr, parsed_grammar);
-        LOG_TEE("\n");
-
-        {
-            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
-            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
-                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
-            }
-        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-        grammar = llama_grammar_init(
-            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-    }
-
     LOG_TEE("\n##### Infill mode #####\n\n");
     if (params.infill) {
         printf("\n************\n");
@@ -430,7 +405,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd;
     std::vector<llama_token> embd_guidance;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
     while (n_remain != 0 || params.interactive) {
         // predict
@@ -740,15 +715,7 @@ int main(int argc, char ** argv) {
 
         if (n_past > 0) {
             if (is_interacting) {
-                // reset grammar state if we're restarting generation
-                if (grammar != NULL) {
-                    llama_grammar_free(grammar);
-
-                    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-                    grammar = llama_grammar_init(
-                        grammar_rules.data(), grammar_rules.size(),
-                        parsed_grammar.symbol_ids.at("root"));
-                }
+                llama_sampling_reset(ctx_sampling);
             }
             is_interacting = false;
         }
@@ -778,9 +745,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
     llama_free_model(model);
 
-    if (grammar != NULL) {
-        llama_grammar_free(grammar);
-    }
+    llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
 #ifndef LOG_DISABLE_LOGS
diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h
index e050b59be..45b2b1ad3 100644
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n
 
 // TODO: use common/sampling.h
 inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
-    // out of user input, sample next token
-    const float temp = params.sampling_params.temp;
-    const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
-    const float top_p = params.sampling_params.top_p;
-    const float tfs_z = params.sampling_params.tfs_z;
-    const float typical_p = params.sampling_params.typical_p;
-    // const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
-    // const float repeat_penalty = params.sampling_params.repeat_penalty;
-    // const float alpha_presence = params.sampling_params.presence_penalty;
-    // const float alpha_frequency = params.sampling_params.frequency_penalty;
-    const int mirostat = params.sampling_params.mirostat;
-    const float mirostat_tau = params.sampling_params.mirostat_tau;
-    const float mirostat_eta = params.sampling_params.mirostat_eta;
-    // const bool penalize_nl = params.sampling_params.penalize_nl;
+    auto & sparams = params.sparams;
+
+    // out of user input, sample next token
+    const float temp = sparams.temp;
+    const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
+    const float top_p = sparams.top_p;
+    const float tfs_z = sparams.tfs_z;
+    const float typical_p = sparams.typical_p;
+    // const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
+    // const float repeat_penalty = sparams.repeat_penalty;
+    // const float alpha_presence = sparams.presence_penalty;
+    // const float alpha_frequency = sparams.frequency_penalty;
+    const int mirostat = sparams.mirostat;
+    const float mirostat_tau = sparams.mirostat_tau;
+    const float mirostat_eta = sparams.mirostat_eta;
+    // const bool penalize_nl = sparams.penalize_nl;
 
     llama_token id = 0;
     {
         auto logits = llama_get_logits(ctx_llama);
         auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
 
-        // Apply params.logit_bias map
-        for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
+        // Apply params.logit_bias map
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
             logits[it->first] += it->second;
         }
 
@@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl(ctx)];
-        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        // llama_sample_repetition_penalty(ctx, &candidates_p,
-        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //     last_n_repeat, repeat_penalty);
-        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //     last_n_repeat, alpha_frequency, alpha_presence);
-        // if (!penalize_nl) {
-        //     logits[llama_token_nl(ctx)] = nl_logit;
-        // }
+        // TODO: Apply penalties
+        // float nl_logit = logits[llama_token_nl(ctx)];
+        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+        // llama_sample_repetition_penalty(ctx, &candidates_p,
+        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //     last_n_repeat, repeat_penalty);
+        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //     last_n_repeat, alpha_frequency, alpha_presence);
+        // if (!penalize_nl) {
+        //     logits[llama_token_nl(ctx)] = nl_logit;
+        // }
 
         if (temp <= 0) {
             // Greedy sampling
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 1a5911c56..540708ef1 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
     if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
-    llama_sampling_params & sparams = params.sampling_params;
+    llama_sampling_params & sparams = params.sparams;
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
@@ -459,7 +459,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd;
     std::vector<llama_token> embd_guidance;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 69f9526a4..3af36ed58 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.ctx_sampling = llama_sampling_init(params);
+        client.ctx_sampling = llama_sampling_init(params.sparams);
     }
 
     std::vector<llama_token> tokens_system;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0471528a3..18fe901dc 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -232,7 +232,7 @@ struct llama_server_context
     void rewind()
     {
         params.antiprompt.clear();
-        params.grammar.clear();
+        params.sparams.grammar.clear();
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
@@ -250,7 +250,7 @@ struct llama_server_context
         if (ctx_sampling != nullptr) {
             llama_sampling_free(ctx_sampling);
         }
-        ctx_sampling = llama_sampling_init(params);
+        ctx_sampling = llama_sampling_init(params.sparams);
     }
 
     bool loadModel(const gpt_params &params_)
@@ -313,7 +313,7 @@ struct llama_server_context
 
     bool loadGrammar()
    {
-        ctx_sampling = llama_sampling_init(params);
+        ctx_sampling = llama_sampling_init(params.sparams);
         return true;
     }
 
@@ -530,8 +530,8 @@ struct llama_server_context
 
         llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
 
-        const int32_t n_probs = params.sampling_params.n_probs;
-        if (params.sampling_params.temp <= 0 && n_probs > 0)
+        const int32_t n_probs = params.sparams.n_probs;
+        if (params.sparams.temp <= 0 && n_probs > 0)
         {
             // For llama_sample_token_greedy we need to sort candidates
             llama_sample_softmax(ctx, &cur_p);
@@ -606,7 +606,7 @@ struct llama_server_context
         const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
         generated_text += token_text;
 
-        if (params.sampling_params.n_probs > 0)
+        if (params.sparams.n_probs > 0)
         {
             generated_token_probs.push_back(token_with_probs);
         }
@@ -1004,7 +1004,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto & sparams = llama.params.sampling_params;
+    const auto & sparams = llama.params.sparams;
     const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
     const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
                             eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@@ -1033,7 +1033,7 @@ static json format_generation_settings(llama_server_context &llama)
         {"stream", llama.stream},
         {"logit_bias", sparams.logit_bias},
         {"n_probs", sparams.n_probs},
-        {"grammar", llama.params.grammar},
+        {"grammar", llama.params.sparams.grammar},
     };
 }
 
@@ -1081,7 +1081,7 @@ static json format_final_response(llama_server_context &llama, const std::string
         {"timings", format_timings(llama)},
     };
 
-    if (llama.params.sampling_params.n_probs > 0)
+    if (llama.params.sparams.n_probs > 0)
     {
         res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }
@@ -1097,7 +1097,7 @@ static json format_partial_response(
         {"stop", false},
     };
 
-    if (llama.params.sampling_params.n_probs > 0)
+    if (llama.params.sparams.n_probs > 0)
     {
         res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
     }
@@ -1129,11 +1129,13 @@ static T json_value(const json &body, const std::string &key, const T &default_v
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
     gpt_params default_params;
-    const auto & default_sparams = default_params.sampling_params;
-    auto & sparams = llama.params.sampling_params;
+    const auto & default_sparams = default_params.sparams;
+
+    auto & params = llama.params;
+    auto & sparams = llama.params.sparams;
 
     llama.stream = json_value(body, "stream", false);
-    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
+    params.n_predict = json_value(body, "n_predict", default_params.n_predict);
     sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
     sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
     sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
@@ -1147,9 +1149,9 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
     sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
     sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
-    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
-    llama.params.seed = json_value(body, "seed", default_params.seed);
-    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
+    params.n_keep = json_value(body, "n_keep", default_params.n_keep);
+    params.seed = json_value(body, "seed", default_params.seed);
+    sparams.grammar = json_value(body, "grammar", default_sparams.grammar);
     sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
 
     if (body.count("prompt") != 0)
@@ -1204,7 +1206,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
         }
     }
 
-    llama.ctx_sampling = llama_sampling_init(llama.params);
+    llama.ctx_sampling = llama_sampling_init(llama.params.sparams);
 
     LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
 }
@@ -1414,7 +1416,7 @@ int main(int argc, char **argv)
             }
 
             auto probs = llama.generated_token_probs;
-            if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
+            if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
                 const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
                 probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
             }
@@ -1466,7 +1468,7 @@ int main(int argc, char **argv)
 
                     std::vector<completion_token_output> probs_output = {};
 
-                    if (llama.params.sampling_params.n_probs > 0) {
+                    if (llama.params.sparams.n_probs > 0) {
                         const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                         size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                         size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@@ -1587,7 +1589,7 @@ int main(int argc, char **argv)
 
                     std::vector<completion_token_output> probs_output = {};
 
-                    if (llama.params.sampling_params.n_probs > 0) {
+                    if (llama.params.sparams.n_probs > 0) {
                         const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                         size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                         size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 24f49012a..92fb9a43b 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -112,16 +112,16 @@ int main(int argc, char ** argv) {
     bool has_eos = false;
 
     // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
 
     // draft sequence data
     std::vector<seq_draft> drafts(n_seq_dft);
 
-    params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp);
+    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
+    params.sparams.temp = std::max(0.01f, params.sparams.temp);
 
     for (int s = 0; s < n_seq_dft; ++s) {
-        drafts[s].ctx_sampling = llama_sampling_init(params);
+        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
     }
 
     llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
diff --git a/llama.cpp b/llama.cpp
index ec8ffad33..a2391e44e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1042,21 +1042,21 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
         if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_head != other.n_head) return true;
-        if (this->n_head_kv != other.n_head_kv) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_ff != other.n_ff) return true;
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;
 
         const float EPSILON = 1e-9;
-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
         return false;
@@ -1195,11 +1195,11 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;
 
-    id linefeed_id = 13;
+    id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
     id special_suffix_id = 32008;
-    id special_eot_id = 32010;
+    id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left, " ", "\u0120");
@@ -1359,10 +1359,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    // TODO: this should be:
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-    // change it and test that it works
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
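
Illustrative usage (editor's sketch, not part of the patch): the changes to main.cpp, infill.cpp and server.cpp all converge on the same call pattern, so a caller after this refactor looks roughly like the fragment below. The function name run_sampling_demo is hypothetical, and model/context setup plus the actual generation loop are assumed and elided; only llama_sampling_init/llama_sampling_reset/llama_sampling_free and the params.sparams member come from the patch itself.

    #include "common.h" // brings in common/sampling.h via gpt_params::sparams

    static void run_sampling_demo(gpt_params & params) {
        // the sampler is now built from the sampling sub-struct, not the whole gpt_params
        struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

        // ... token generation loop using ctx_sampling would go here ...

        // restarting an interactive generation resets grammar and previous-token state
        llama_sampling_reset(ctx_sampling);

        // release grammar and sampler state when done
        llama_sampling_free(ctx_sampling);
    }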