diff --git a/common/arg.cpp b/common/arg.cpp index 506f7185d..07cc33762 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -145,7 +145,7 @@ static std::string format(const char * fmt, ...) { return std::string(buf.data(), size); } -static void gpt_params_handle_model_default(gpt_params & params) { +static void common_params_handle_model_default(common_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (params.hf_file.empty()) { @@ -171,10 +171,10 @@ static void gpt_params_handle_model_default(gpt_params & params) { // CLI argument parsing functions // -static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { +static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { std::string arg; const std::string arg_prefix = "--"; - gpt_params & params = ctx_arg.params; + common_params & params = ctx_arg.params; std::unordered_map arg_to_options; for (auto & opt : ctx_arg.options) { @@ -268,7 +268,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - gpt_params_handle_model_default(params); + common_params_handle_model_default(params); if (params.escape) { string_process_escapes(params.prompt); @@ -291,7 +291,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx return true; } -static void gpt_params_print_usage(gpt_params_context & ctx_arg) { +static void common_params_print_usage(common_params_context & ctx_arg) { auto print_options = [](std::vector & options) { for (common_arg * opt : options) { printf("%s", opt->to_string().c_str()); @@ -320,17 +320,17 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) { print_options(specific_options); } -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { - auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); - const gpt_params params_org = ctx_arg.params; // the example can modify the default params +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = common_params_parser_init(params, ex, print_usage); + const common_params params_org = ctx_arg.params; // the example can modify the default params try { - if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + if (!common_params_parse_ex(argc, argv, ctx_arg)) { ctx_arg.params = params_org; return false; } if (ctx_arg.params.usage) { - gpt_params_print_usage(ctx_arg); + common_params_print_usage(ctx_arg); if (ctx_arg.print_usage) { ctx_arg.print_usage(argc, argv); } @@ -345,16 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example return true; } -gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { - gpt_params_context ctx_arg(params); +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; std::string sampler_type_chars; std::string sampler_type_names; for (const auto & sampler : params.sparams.samplers) { - sampler_type_chars += gpt_sampler_type_to_chr(sampler); - sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + sampler_type_chars += common_sampler_type_to_chr(sampler); + sampler_type_names += common_sampler_type_to_str(sampler) + ";"; } sampler_type_names.pop_back(); @@ -376,14 +376,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](gpt_params & params) { + [](common_params & params) { params.usage = true; } )); add_opt(common_arg( {"--version"}, "show version and build info", - [](gpt_params &) { + [](common_params &) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); @@ -392,28 +392,28 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.verbose_prompt = true; } )); add_opt(common_arg( {"--no-display-prompt"}, format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-co", "--color"}, format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -423,7 +423,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -433,7 +433,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); @@ -443,7 +443,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); @@ -453,7 +453,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); @@ -463,7 +463,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. Complements --cpu-mask", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); @@ -473,14 +473,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--cpu-strict"}, "<0|1>", format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(common_arg( {"--prio"}, "N", format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](gpt_params & params, int prio) { + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } @@ -490,14 +490,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--poll"}, "<0...100>", format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); add_opt(common_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); @@ -507,7 +507,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid range"); @@ -517,14 +517,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.strict_cpu = value; } )); add_opt(common_arg( {"--prio-batch"}, "N", format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](gpt_params & params, int prio) { + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } @@ -534,14 +534,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.poll = value; } )); add_opt(common_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); @@ -551,7 +551,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); @@ -561,14 +561,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](gpt_params & params, int prio) { + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } @@ -578,14 +578,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); @@ -595,7 +595,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); @@ -605,14 +605,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](gpt_params & params, int prio) { + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } @@ -622,91 +622,91 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft"}, "N", format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-ps", "--p-split"}, "N", format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(common_arg( {"--keep"}, "N", format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_keep = value; } )); add_opt(common_arg( {"--no-context-shift"}, format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](gpt_params & params) { + [](common_params & params) { params.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); @@ -715,14 +715,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ex == LLAMA_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.prompt = value; } )); add_opt(common_arg( {"--no-perf"}, format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.no_perf = true; params.sparams.no_perf = true; } @@ -730,7 +730,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -746,7 +746,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -757,7 +757,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -773,56 +773,56 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-e", "--escape"}, format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.escape = true; } )); add_opt(common_arg( {"--no-escape"}, "do not process escape sequences", - [](gpt_params & params) { + [](common_params & params) { params.escape = false; } )); add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params) { + [](common_params & params) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](gpt_params & params) { + [](common_params & params) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); @@ -835,35 +835,35 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "(default: %s)", params.conversation ? "true" : "false" ), - [](gpt_params & params) { + [](common_params & params) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params) { + [](common_params & params) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params) { + [](common_params & params) { params.input_prefix_bos = true; params.enable_chat_template = false; } @@ -871,7 +871,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } @@ -879,7 +879,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } @@ -887,7 +887,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](gpt_params & params) { + [](common_params & params) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); @@ -897,50 +897,50 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" ), - [](gpt_params & params) { + [](common_params & params) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.samplers = gpt_sampler_types_from_chars(value); + [](common_params & params, const std::string & value) { + params.sparams.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params) { + [](common_params & params) { params.sparams.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--penalize-nl"}, format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.sparams.penalize_nl = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", format("temperature (default: %.1f)", (double)params.sparams.temp), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.temp = std::stof(value); params.sparams.temp = std::max(params.sparams.temp, 0.0f); } @@ -948,42 +948,42 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--top-k"}, "N", format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.sparams.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--tfs"}, "N", format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.tfs_z = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.sparams.penalty_last_n = value; params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); } @@ -991,35 +991,35 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--repeat-penalty"}, "N", format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.dynatemp_exponent = std::stof(value); } ).set_sparam()); @@ -1027,21 +1027,21 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"--mirostat"}, "N", format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.sparams.mirostat = value; } ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.mirostat_tau = std::stof(value); } ).set_sparam()); @@ -1050,7 +1050,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1070,14 +1070,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--grammar"}, "GRAMMAR", format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.grammar = value; } ).set_sparam()); add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1092,14 +1092,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); add_opt(common_arg( {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } @@ -1111,7 +1111,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--attention"}, "{causal,non,causal}", "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } @@ -1120,7 +1120,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } @@ -1130,91 +1130,91 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } ).set_env("LLAMA_ARG_ROPE_SCALE")); add_opt(common_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.grp_attn_n = value; } ).set_env("LLAMA_ARG_GRP_ATTN_N")); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.grp_attn_w = value; } ).set_env("LLAMA_ARG_GRP_ATTN_W")); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](gpt_params & params) { + [](common_params & params) { params.dump_kv_cache = true; } )); add_opt(common_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [](gpt_params & params) { + [](common_params & params) { params.no_kv_offload = true; } ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } @@ -1222,7 +1222,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } @@ -1230,126 +1230,126 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--perplexity", "--all-logits"}, format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande-tasks"}, "N", format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](gpt_params & params) { + [](common_params & params) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.logits_file = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-stride"}, "N", format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(common_arg( {"-ns", "--sequences"}, "N", format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.cont_batching = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](gpt_params & params) { + [](common_params & params) { params.cont_batching = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); @@ -1357,7 +1357,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rpc_servers = value; } ).set_env("LLAMA_ARG_RPC")); @@ -1365,14 +1365,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params) { + [](common_params & params) { params.use_mlock = true; } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params) { + [](common_params & params) { params.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); @@ -1384,7 +1384,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } @@ -1394,7 +1394,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1405,7 +1405,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1419,7 +1419,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1442,7 +1442,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1469,7 +1469,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.main_gpu = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); @@ -1479,7 +1479,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--check-tensors"}, format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.check_tensors = true; } )); @@ -1487,7 +1487,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); } @@ -1496,7 +1496,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg @@ -1504,7 +1504,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & fname, const std::string & scale) { + [](common_params & params, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg @@ -1512,7 +1512,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); @@ -1520,14 +1520,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, const std::string & fname, const std::string & scale) { + [](common_params & params, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(common_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, const std::string & start, const std::string & end) { + [](common_params & params, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } @@ -1535,7 +1535,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); @@ -1547,49 +1547,49 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(common_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1600,28 +1600,28 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--chunk-size"}, "N", format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-separator"}, "STRING", format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); @@ -1633,7 +1633,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; @@ -1642,49 +1642,49 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), - [](gpt_params & params) { + [](common_params & params) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } @@ -1692,7 +1692,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } @@ -1700,7 +1700,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } @@ -1708,70 +1708,70 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--embd-normalize"}, "N", format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", format("port to listen (default: %d)", params.port), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); add_opt(common_arg( {"--embedding", "--embeddings"}, format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.reranking = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1788,21 +1788,21 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.timeout_read = value; params.timeout_write = value; } @@ -1810,14 +1810,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--threads-http"}, "N", format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(common_arg( {"-spf", "--system-prompt-file"}, "FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); @@ -1834,35 +1834,35 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.endpoint_slots = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.endpoint_props = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--no-slots"}, "disables slots monitoring endpoint", - [](gpt_params & params) { + [](common_params & params) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -1875,7 +1875,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(format( "error: the supplied chat template is not supported: %s\n" @@ -1889,28 +1889,28 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](gpt_params & params) { + [](common_params & params) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params) { + [](common_params & params) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -1921,35 +1921,35 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--positive-file"}, "FNAME", format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } @@ -1958,7 +1958,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } @@ -1967,52 +1967,52 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, add_opt(common_arg( {"--log-disable"}, "Log disable", - [](gpt_params &) { - gpt_log_pause(gpt_log_main()); + [](common_params &) { + common_log_pause(common_log_main()); } )); add_opt(common_arg( {"--log-file"}, "FNAME", "Log to file", - [](gpt_params &, const std::string & value) { - gpt_log_set_file(gpt_log_main(), value.c_str()); + [](common_params &, const std::string & value) { + common_log_set_file(common_log_main(), value.c_str()); } )); add_opt(common_arg( {"--log-colors"}, "Enable colored logging", - [](gpt_params &) { - gpt_log_set_colors(gpt_log_main(), true); + [](common_params &) { + common_log_set_colors(common_log_main(), true); } ).set_env("LLAMA_LOG_COLORS")); add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](gpt_params & params) { + [](common_params & params) { params.verbosity = INT_MAX; - gpt_log_set_verbosity_thold(INT_MAX); + common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.verbosity = value; - gpt_log_set_verbosity_thold(value); + common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); add_opt(common_arg( {"--log-prefix"}, "Enable prefx in log messages", - [](gpt_params &) { - gpt_log_set_prefix(gpt_log_main(), true); + [](common_params &) { + common_log_set_prefix(common_log_main(), true); } ).set_env("LLAMA_LOG_PREFIX")); add_opt(common_arg( {"--log-timestamps"}, "Enable timestamps in log messages", - [](gpt_params &) { - gpt_log_set_timestamps(gpt_log_main(), true); + [](common_params &) { + common_log_set_timestamps(common_log_main(), true); } ).set_env("LLAMA_LOG_TIMESTAMPS")); diff --git a/common/arg.h b/common/arg.h index ffb47a762..a6700d323 100644 --- a/common/arg.h +++ b/common/arg.h @@ -18,29 +18,29 @@ struct common_arg { const char * env = nullptr; std::string help; bool is_sparam = false; // is current arg a sampling param? - void (*handler_void) (gpt_params & params) = nullptr; - void (*handler_string) (gpt_params & params, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, int) = nullptr; + void (*handler_void) (common_params & params) = nullptr; + void (*handler_string) (common_params & params, const std::string &) = nullptr; + void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (common_params & params, int) = nullptr; common_arg( const std::initializer_list & args, const char * value_hint, const std::string & help, - void (*handler)(gpt_params & params, const std::string &) + void (*handler)(common_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} common_arg( const std::initializer_list & args, const char * value_hint, const std::string & help, - void (*handler)(gpt_params & params, int) + void (*handler)(common_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} common_arg( const std::initializer_list & args, const std::string & help, - void (*handler)(gpt_params & params) + void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg @@ -49,7 +49,7 @@ struct common_arg { const char * value_hint, const char * value_hint_2, const std::string & help, - void (*handler)(gpt_params & params, const std::string &, const std::string &) + void (*handler)(common_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} common_arg & set_examples(std::initializer_list examples); @@ -61,17 +61,17 @@ struct common_arg { std::string to_string(); }; -struct gpt_params_context { +struct common_params_context { enum llama_example ex = LLAMA_EXAMPLE_COMMON; - gpt_params & params; + common_params & params; std::vector options; void(*print_usage)(int, char **) = nullptr; - gpt_params_context(gpt_params & params) : params(params) {} + common_params_context(common_params & params) : params(params) {} }; // parse input arguments from CLI // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); // function to be used by test-arg-parser -gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 678451945..e43615f31 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -362,10 +362,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -void gpt_init() { +void common_init() { llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { - if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { - gpt_log_add(gpt_log_main(), level, "%s", text); + if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { + common_log_add(common_log_main(), level, "%s", text); } }, NULL); @@ -378,7 +378,7 @@ void gpt_init() { LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); } -std::string gpt_params_get_system_info(const gpt_params & params) { +std::string common_params_get_system_info(const common_params & params) { std::ostringstream os; os << "system_info: n_threads = " << params.cpuparams.n_threads; @@ -819,9 +819,9 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // -struct common_init_result llama_init_from_gpt_params(gpt_params & params) { +struct common_init_result common_init_from_common_params(common_params & params) { common_init_result iparams; - auto mparams = common_model_params_from_gpt_params(params); + auto mparams = common_model_params_from_common_params(params); llama_model * model = nullptr; @@ -863,7 +863,7 @@ struct common_init_result llama_init_from_gpt_params(gpt_params & params) { } } - auto cparams = common_context_params_from_gpt_params(params); + auto cparams = common_context_params_from_common_params(params); llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { @@ -970,7 +970,7 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector & prompt_tokens, const char * model_desc) { const auto & sparams = params.sparams; diff --git a/common/common.h b/common/common.h index 12449618d..a4c23a466 100644 --- a/common/common.h +++ b/common/common.h @@ -82,14 +82,14 @@ enum llama_example { LLAMA_EXAMPLE_COUNT, }; -enum gpt_sampler_type { - GPT_SAMPLER_TYPE_NONE = 0, - GPT_SAMPLER_TYPE_TOP_K = 1, - GPT_SAMPLER_TYPE_TOP_P = 2, - GPT_SAMPLER_TYPE_MIN_P = 3, - GPT_SAMPLER_TYPE_TFS_Z = 4, - GPT_SAMPLER_TYPE_TYPICAL_P = 5, - GPT_SAMPLER_TYPE_TEMPERATURE = 6, +enum common_sampler_type { + COMMON_SAMPLER_TYPE_NONE = 0, + COMMON_SAMPLER_TYPE_TOP_K = 1, + COMMON_SAMPLER_TYPE_TOP_P = 2, + COMMON_SAMPLER_TYPE_MIN_P = 3, + COMMON_SAMPLER_TYPE_TFS_Z = 4, + COMMON_SAMPLER_TYPE_TYPICAL_P = 5, + COMMON_SAMPLER_TYPE_TEMPERATURE = 6, }; // dimensionality reduction methods, used by cvector-generator @@ -99,7 +99,7 @@ enum dimre_method { }; // sampler parameters -struct gpt_sampler_params { +struct common_sampler_params { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler int32_t n_prev = 64; // number of previous tokens to remember @@ -124,13 +124,13 @@ struct gpt_sampler_params { bool ignore_eos = false; bool no_perf = false; // disable performance metrics - std::vector samplers = { - GPT_SAMPLER_TYPE_TOP_K, - GPT_SAMPLER_TYPE_TFS_Z, - GPT_SAMPLER_TYPE_TYPICAL_P, - GPT_SAMPLER_TYPE_TOP_P, - GPT_SAMPLER_TYPE_MIN_P, - GPT_SAMPLER_TYPE_TEMPERATURE + std::vector samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TFS_Z, + COMMON_SAMPLER_TYPE_TYPICAL_P, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_MIN_P, + COMMON_SAMPLER_TYPE_TEMPERATURE }; std::string grammar; // optional BNF-like grammar to constrain sampling @@ -141,7 +141,7 @@ struct gpt_sampler_params { std::string print() const; }; -struct gpt_params { +struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -183,7 +183,7 @@ struct gpt_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct gpt_sampler_params sparams; + struct common_sampler_params sparams; std::string model = ""; // model path // NOLINT std::string model_draft = ""; // draft model for speculative decoding // NOLINT @@ -348,9 +348,9 @@ struct gpt_params { // call once at the start of a program if it uses libcommon // initializes the logging system and prints info about the build -void gpt_init(); +void common_init(); -std::string gpt_params_get_system_info(const gpt_params & params); +std::string common_params_get_system_info(const common_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); @@ -410,10 +410,10 @@ struct common_init_result { std::vector lora_adapters; }; -struct common_init_result llama_init_from_gpt_params(gpt_params & params); +struct common_init_result common_init_from_common_params(common_params & params); -struct llama_model_params common_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params common_context_params_from_gpt_params (const gpt_params & params); +struct llama_model_params common_model_params_from_common_params (const common_params & params); +struct llama_context_params common_context_params_from_common_params(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); @@ -554,5 +554,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); void yaml_dump_non_result_info( - FILE * stream, const gpt_params & params, const llama_context * lctx, + FILE * stream, const common_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/log.cpp b/common/log.cpp index 5a844ed59..04c7c0ed1 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -8,10 +8,10 @@ #include #include -int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; +int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; -void gpt_log_set_verbosity_thold(int verbosity) { - gpt_log_verbosity_thold = verbosity; +void common_log_set_verbosity_thold(int verbosity) { + common_log_verbosity_thold = verbosity; } #define LOG_COL_DEFAULT "\033[0m" @@ -29,16 +29,16 @@ static int64_t t_us() { } // colors -enum gpt_log_col : int { - GPT_LOG_COL_DEFAULT = 0, - GPT_LOG_COL_BOLD, - GPT_LOG_COL_RED, - GPT_LOG_COL_GREEN, - GPT_LOG_COL_YELLOW, - GPT_LOG_COL_BLUE, - GPT_LOG_COL_MAGENTA, - GPT_LOG_COL_CYAN, - GPT_LOG_COL_WHITE, +enum common_log_col : int { + COMMON_LOG_COL_DEFAULT = 0, + COMMON_LOG_COL_BOLD, + COMMON_LOG_COL_RED, + COMMON_LOG_COL_GREEN, + COMMON_LOG_COL_YELLOW, + COMMON_LOG_COL_BLUE, + COMMON_LOG_COL_MAGENTA, + COMMON_LOG_COL_CYAN, + COMMON_LOG_COL_WHITE, }; // disable colors by default @@ -54,7 +54,7 @@ static std::vector g_col = { "", }; -struct gpt_log_entry { +struct common_log_entry { enum ggml_log_level level; bool prefix; @@ -71,7 +71,7 @@ struct gpt_log_entry { if (!fcur) { // stderr displays DBG messages only when their verbosity level is not higher than the threshold // these messages will still be logged to a file - if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) { return; } @@ -86,19 +86,19 @@ struct gpt_log_entry { if (timestamp) { // [M.s.ms.us] fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", - g_col[GPT_LOG_COL_BLUE], + g_col[COMMON_LOG_COL_BLUE], (int) (timestamp / 1000000 / 60), (int) (timestamp / 1000000 % 60), (int) (timestamp / 1000 % 1000), (int) (timestamp % 1000), - g_col[GPT_LOG_COL_DEFAULT]); + g_col[COMMON_LOG_COL_DEFAULT]); } switch (level) { - case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; - case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; - case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; - case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break; default: break; } @@ -107,18 +107,18 @@ struct gpt_log_entry { fprintf(fcur, "%s", msg.data()); if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { - fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]); } fflush(fcur); } }; -struct gpt_log { +struct common_log { // default capacity - will be expanded if needed - gpt_log() : gpt_log(256) {} + common_log() : common_log(256) {} - gpt_log(size_t capacity) { + common_log(size_t capacity) { file = nullptr; prefix = false; timestamps = false; @@ -137,7 +137,7 @@ struct gpt_log { resume(); } - ~gpt_log() { + ~common_log() { pause(); if (file) { fclose(file); @@ -158,12 +158,12 @@ private: int64_t t_start; // ring buffer of entries - std::vector entries; + std::vector entries; size_t head; size_t tail; // worker thread copies into this - gpt_log_entry cur; + common_log_entry cur; public: void add(enum ggml_log_level level, const char * fmt, va_list args) { @@ -219,7 +219,7 @@ public: tail = (tail + 1) % entries.size(); if (tail == head) { // expand the buffer - std::vector new_entries(2*entries.size()); + std::vector new_entries(2*entries.size()); size_t new_tail = 0; @@ -320,15 +320,15 @@ public: pause(); if (colors) { - g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; - g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; - g_col[GPT_LOG_COL_RED] = LOG_COL_RED; - g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; - g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; - g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; - g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; - g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; - g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[COMMON_LOG_COL_RED] = LOG_COL_RED; + g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE; } else { for (size_t i = 0; i < g_col.size(); i++) { g_col[i] = ""; @@ -355,47 +355,47 @@ public: // public API // -struct gpt_log * gpt_log_init() { - return new gpt_log; +struct common_log * common_log_init() { + return new common_log; } -struct gpt_log * gpt_log_main() { - static struct gpt_log log; +struct common_log * common_log_main() { + static struct common_log log; return &log; } -void gpt_log_pause(struct gpt_log * log) { +void common_log_pause(struct common_log * log) { log->pause(); } -void gpt_log_resume(struct gpt_log * log) { +void common_log_resume(struct common_log * log) { log->resume(); } -void gpt_log_free(struct gpt_log * log) { +void common_log_free(struct common_log * log) { delete log; } -void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { +void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) { va_list args; va_start(args, fmt); log->add(level, fmt, args); va_end(args); } -void gpt_log_set_file(struct gpt_log * log, const char * file) { +void common_log_set_file(struct common_log * log, const char * file) { log->set_file(file); } -void gpt_log_set_colors(struct gpt_log * log, bool colors) { +void common_log_set_colors(struct common_log * log, bool colors) { log->set_colors(colors); } -void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { +void common_log_set_prefix(struct common_log * log, bool prefix) { log->set_prefix(prefix); } -void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { +void common_log_set_timestamps(struct common_log * log, bool timestamps) { log->set_timestamps(timestamps); } diff --git a/common/log.h b/common/log.h index 84f9b3ed7..66605cc69 100644 --- a/common/log.h +++ b/common/log.h @@ -14,23 +14,23 @@ #define LOG_DEFAULT_LLAMA 0 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower -// set via gpt_log_set_verbosity() -extern int gpt_log_verbosity_thold; +// set via common_log_set_verbosity() +extern int common_log_verbosity_thold; -void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe +void common_log_set_verbosity_thold(int verbosity); // not thread-safe -// the gpt_log uses an internal worker thread to print/write log messages +// the common_log uses an internal worker thread to print/write log messages // when the worker thread is paused, incoming log messages are discarded -struct gpt_log; +struct common_log; -struct gpt_log * gpt_log_init(); -struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit -void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe -void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe -void gpt_log_free (struct gpt_log * log); +struct common_log * common_log_init(); +struct common_log * common_log_main(); // singleton, automatically destroys itself on exit +void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe +void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe +void common_log_free (struct common_log * log); LOG_ATTRIBUTE_FORMAT(3, 4) -void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); +void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...); // defaults: file = NULL, colors = false, prefix = false, timestamps = false // @@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f // D - debug (stderr, V = LOG_DEFAULT_DEBUG) // -void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe -void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe -void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log -void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix +void common_log_set_file (struct common_log * log, const char * file); // not thread-safe +void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe +void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log +void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix // helper macros for logging // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold @@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w // // LOG_DBG("this is a debug message: %d\n", expensive_function()); // -// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold // #define LOG_TMPL(level, verbosity, ...) \ do { \ - if ((verbosity) <= gpt_log_verbosity_thold) { \ - gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + if ((verbosity) <= common_log_verbosity_thold) { \ + common_log_add(common_log_main(), (level), __VA_ARGS__); \ } \ } while (0) diff --git a/common/sampling.cpp b/common/sampling.cpp index ef4d24844..cd49ade69 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -98,8 +98,8 @@ struct ring_buffer { std::vector data; }; -struct gpt_sampler { - gpt_sampler_params params; +struct common_sampler { + common_sampler_params params; struct llama_sampler * grmr; struct llama_sampler * chain; @@ -125,7 +125,7 @@ struct gpt_sampler { } }; -std::string gpt_sampler_params::print() const { +std::string common_sampler_params::print() const { char result[1024]; snprintf(result, sizeof(result), @@ -139,12 +139,12 @@ std::string gpt_sampler_params::print() const { return std::string(result); } -struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; - auto * result = new gpt_sampler { + auto * result = new common_sampler { /* .params = */ params, /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .chain = */ llama_sampler_chain_init(lparams), @@ -175,22 +175,22 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: + case COMMON_SAMPLER_TYPE_TOP_K: llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); break; - case GPT_SAMPLER_TYPE_TOP_P: + case COMMON_SAMPLER_TYPE_TOP_P: llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); break; - case GPT_SAMPLER_TYPE_MIN_P: + case COMMON_SAMPLER_TYPE_MIN_P: llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; - case GPT_SAMPLER_TYPE_TFS_Z: + case COMMON_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; - case GPT_SAMPLER_TYPE_TYPICAL_P: + case COMMON_SAMPLER_TYPE_TYPICAL_P: llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); break; - case GPT_SAMPLER_TYPE_TEMPERATURE: + case COMMON_SAMPLER_TYPE_TEMPERATURE: llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; default: @@ -224,7 +224,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st return result; } -void gpt_sampler_free(struct gpt_sampler * gsmpl) { +void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { llama_sampler_free(gsmpl->grmr); @@ -234,7 +234,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) { } } -void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) { +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { if (accept_grammar) { llama_sampler_accept(gsmpl->grmr, token); } @@ -244,14 +244,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce gsmpl->prev.push_back(token); } -void gpt_sampler_reset(struct gpt_sampler * gsmpl) { +void common_sampler_reset(struct common_sampler * gsmpl) { llama_sampler_reset(gsmpl->grmr); llama_sampler_reset(gsmpl->chain); } -struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { - return new gpt_sampler { +struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { + return new common_sampler { /* .params = */ gsmpl->params, /* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .chain = */ llama_sampler_clone(gsmpl->chain), @@ -261,7 +261,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { }; } -void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) { +void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { // TODO: measure grammar performance if (gsmpl) { @@ -272,7 +272,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * } } -llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { gsmpl->set_logits(ctx, idx); auto & grmr = gsmpl->grmr; @@ -318,21 +318,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } -uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { +uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } // helpers -llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { +llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { return &gsmpl->cur_p; } -llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { +llama_token common_sampler_last(const struct common_sampler * gsmpl) { return gsmpl->prev.rat(0); } -std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { +std::string common_sampler_print(const struct common_sampler * gsmpl) { std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { @@ -343,7 +343,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { return result; } -std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) { +std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { n = std::min(n, (int) gsmpl->prev.size()); if (n <= 0) { @@ -364,57 +364,57 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, return result; } -char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { +char common_sampler_type_to_chr(enum common_sampler_type cnstr) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: return 'k'; - case GPT_SAMPLER_TYPE_TFS_Z: return 'f'; - case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y'; - case GPT_SAMPLER_TYPE_TOP_P: return 'p'; - case GPT_SAMPLER_TYPE_MIN_P: return 'm'; - case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; + case COMMON_SAMPLER_TYPE_TFS_Z: return 'f'; + case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; + case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; + case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; + case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; default : return '?'; } } -std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { +std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: return "top_k"; - case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z"; - case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; - case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; - case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; - case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; + case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z"; + case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; + case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; + case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; + case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; default : return ""; } } -std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - { "top_k", GPT_SAMPLER_TYPE_TOP_K }, - { "top_p", GPT_SAMPLER_TYPE_TOP_P }, - { "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "min_p", GPT_SAMPLER_TYPE_MIN_P }, - { "tfs_z", GPT_SAMPLER_TYPE_TFS_Z }, - { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE }, +std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, + { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, + { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, + { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, + { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, }; // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - { "top-k", GPT_SAMPLER_TYPE_TOP_K }, - { "top-p", GPT_SAMPLER_TYPE_TOP_P }, - { "nucleus", GPT_SAMPLER_TYPE_TOP_P }, - { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typical", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typ", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "min-p", GPT_SAMPLER_TYPE_MIN_P }, - { "tfs-z", GPT_SAMPLER_TYPE_TFS_Z }, - { "tfs", GPT_SAMPLER_TYPE_TFS_Z }, - { "temp", GPT_SAMPLER_TYPE_TEMPERATURE }, + std::unordered_map sampler_alt_name_map { + { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, + { "top-p", COMMON_SAMPLER_TYPE_TOP_P }, + { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, + { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, + { "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z }, + { "tfs", COMMON_SAMPLER_TYPE_TFS_Z }, + { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, }; - std::vector samplers; + std::vector samplers; samplers.reserve(names.size()); for (const auto & name : names) { @@ -434,17 +434,17 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map = { - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } +std::vector common_sampler_types_from_chars(const std::string & chars) { + std::unordered_map sampler_name_map = { + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE } }; - std::vector samplers; + std::vector samplers; samplers.reserve(chars.size()); for (const auto & c : chars) { diff --git a/common/sampling.h b/common/sampling.h index d0e1a9203..d37f25ad3 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -7,7 +7,7 @@ #include #include -// gpt_sampler extends llama_sampler with additional functionality: +// common_sampler extends llama_sampler with additional functionality: // // - grammar support // - custom sampler logic based on the parameters @@ -23,30 +23,30 @@ // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the // grammar constraints are applied to the full vocabulary and the token is resampled. // -// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can +// The common_sampler also maintains a container with the last accepted tokens. In the future, this can // be moved into the core llama library. // -// For convenience, the gpt_sampler also maintains a container with the current candidate tokens. +// For convenience, the common_sampler also maintains a container with the current candidate tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens. // // TODO: measure grammar performance // -struct gpt_sampler; +struct common_sampler; // llama_sampler API overloads -struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params); +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); -void gpt_sampler_free(struct gpt_sampler * gsmpl); +void common_sampler_free(struct common_sampler * gsmpl); // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar); -void gpt_sampler_reset (struct gpt_sampler * gsmpl); -struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl); +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +void common_sampler_reset (struct common_sampler * gsmpl); +struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing -void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl); +void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); // extended sampling implementation: // @@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // if grammar_first is true, the grammar is applied before the samplers (slower) // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // -llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); -uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); +uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers // access the internal list of current candidate tokens -llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl); +llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); // get the last accepted token -llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl); +llama_token common_sampler_last(const struct common_sampler * gsmpl); // print the sampler chain into a string -std::string gpt_sampler_print(const struct gpt_sampler * gsmpl); +std::string common_sampler_print(const struct common_sampler * gsmpl); // get a string representation of the last accepted tokens -std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n); +std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); -char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr); -std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr); +char common_sampler_type_to_chr(enum common_sampler_type cnstr); +std::string common_sampler_type_to_str(enum common_sampler_type cnstr); -std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector gpt_sampler_types_from_chars(const std::string & chars); +std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector common_sampler_types_from_chars(const std::string & chars); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index ec58295d5..f62bd5687 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } - gpt_init(); + common_init(); int is_pp_shared = params.is_pp_shared; @@ -36,7 +36,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_from_common_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -45,7 +45,7 @@ int main(int argc, char ** argv) { return 1; } - llama_context_params ctx_params = common_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_from_common_params(params); // ensure enough sequences are available ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index fe12312e2..abab45f99 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } - gpt_init(); + common_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -39,7 +39,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_from_common_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -57,7 +57,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = common_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_from_common_params(params); ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index c140daed3..988a584c9 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -872,7 +872,7 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { - gpt_init(); + common_init(); struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 724ec9eaf..d8ca3f1ad 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -370,7 +370,7 @@ static void export_gguf(const std::vector & v_ctrl, const * Load prompt files and completion file. * Then format each pair of prompt + completion to make an entry. */ -static int prepare_entries(gpt_params & params, train_context & ctx_train) { +static int prepare_entries(common_params & params, train_context & ctx_train) { // load prompts std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); @@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } @@ -413,7 +413,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 2ce870613..96fd4c664 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -79,13 +79,13 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { return 1; } - gpt_init(); + common_init(); params.embedding = true; // For non-causal models, batch size must be equal to ubatch size @@ -95,7 +95,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -122,7 +122,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } // split the prompt into lines diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 8a03d3bd6..c24cea67c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -126,7 +126,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool run(llama_context * ctx, const gpt_params & params) { +static bool run(llama_context * ctx, const common_params & params) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); @@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { callback_data cb_data; - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -160,7 +160,7 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index f97a3c67b..67662313d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index eaadee9c5..77c59a836 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -50,8 +50,8 @@ static void write_table(std::ofstream & file, std::vector & opts) static void export_md(std::string fname, llama_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); - gpt_params params; - auto ctx_arg = gpt_params_parser_init(params, ex); + common_params params; + auto ctx_arg = common_params_parser_init(params, ex); std::vector common_options; std::vector sparam_options; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 4304019c0..96f31436c 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) { } int main(int argc, char * argv[]) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); - llama_model_params mparams = common_model_params_from_gpt_params(params); - llama_context_params cparams = common_context_params_from_gpt_params(params); + llama_model_params mparams = common_model_params_from_common_params(params); + llama_context_params cparams = common_context_params_from_common_params(params); llama_backend_init(); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 041734774..659fdded8 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -37,13 +37,13 @@ struct Stats { class IMatrixCollector { public: IMatrixCollector() = default; - void set_params(gpt_params params) { m_params = std::move(params); } + void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; - gpt_params m_params; + common_params m_params; std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; @@ -428,7 +428,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { +static bool compute_imatrix(llama_context * ctx, const common_params & params) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); const int n_ctx = llama_n_ctx(ctx); @@ -568,17 +568,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_ctx = 512; params.logits_all = true; params.escape = false; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } - gpt_init(); + common_init(); params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -607,7 +607,7 @@ int main(int argc, char ** argv) { params.warmup = false; // init - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -625,7 +625,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } if (!compute_imatrix(ctx, params)) { diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 143deed3b..a245be88f 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -35,8 +35,8 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static gpt_sampler ** g_smpl; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; @@ -44,7 +44,7 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens ) { @@ -95,12 +95,12 @@ static void sigint_handler(int signo) { } else { console::cleanup(); LOG("\n"); - gpt_perf_print(*g_ctx, *g_smpl); + common_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); - gpt_log_pause(gpt_log_main()); + common_log_pause(common_log_main()); _exit(130); } @@ -109,14 +109,14 @@ static void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { return 1; } - gpt_init(); + common_init(); auto & sparams = params.sparams; @@ -166,7 +166,7 @@ int main(int argc, char ** argv) { llama_model * model = nullptr; llama_context * ctx = nullptr; - gpt_sampler * smpl = nullptr; + common_sampler * smpl = nullptr; g_model = &model; g_ctx = &ctx; @@ -174,7 +174,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); model = llama_init.model; ctx = llama_init.context; @@ -195,7 +195,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } const bool add_bos = llama_add_bos_token(model); GGML_ASSERT(!llama_add_eos_token(model)); @@ -298,11 +298,11 @@ int main(int argc, char ** argv) { LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - smpl = gpt_sampler_init(model, sparams); + smpl = common_sampler_init(model, sparams); - LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); @@ -411,9 +411,9 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = gpt_sampler_sample(smpl, ctx, -1); + const llama_token id = common_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); @@ -434,7 +434,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], false); + common_sampler_accept(smpl, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -465,7 +465,7 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); @@ -529,7 +529,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { + else if (llama_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found EOS token\n"); if (params.interactive) { @@ -601,7 +601,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - gpt_sampler_reset(smpl); + common_sampler_reset(smpl); } is_interacting = false; } @@ -624,13 +624,13 @@ int main(int argc, char ** argv) { } LOG("\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); llama_free(ctx); llama_free_model(model); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_backend_free(); return 0; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 16947a904..c3b35bc7f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -42,11 +42,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n return true; } -static const char * sample(struct gpt_sampler * smpl, +static const char * sample(struct common_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); - gpt_sampler_accept(smpl, id, true); + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; @@ -120,7 +120,7 @@ static void print_usage(int, char ** argv) { LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { +static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { // load and preprocess the image llava_image_embed * embed = NULL; @@ -146,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para return embed; } -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) { +static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { int n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; @@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG("\n"); - struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); @@ -211,15 +211,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); LOG("\n"); } -static struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(common_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = common_model_params_from_gpt_params(*params); + llama_model_params model_params = common_model_params_from_common_params(*params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { @@ -229,7 +229,7 @@ static struct llama_model * llava_init(gpt_params * params) { return model; } -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -240,7 +240,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_context_params ctx_params = common_context_params_from_gpt_params(*params); + llama_context_params ctx_params = common_context_params_from_common_params(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); @@ -272,13 +272,13 @@ static void llava_free(struct llava_context * ctx_llava) { int main(int argc, char ** argv) { ggml_time_init(); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } - gpt_init(); + common_init(); if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 5c960be7e..05df2c359 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -25,11 +25,11 @@ static void show_additional_info(int /*argc*/, char ** argv) { LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(common_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = common_model_params_from_gpt_params(*params); + llama_model_params model_params = common_model_params_from_common_params(*params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { @@ -39,13 +39,13 @@ static struct llama_model * llava_init(gpt_params * params) { return model; } -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { auto prompt = params->prompt; if (prompt.empty()) { prompt = "describe the image in detail."; } - llama_context_params ctx_params = common_context_params_from_gpt_params(*params); + llama_context_params ctx_params = common_context_params_from_common_params(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); @@ -79,7 +79,7 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } -static struct clip_ctx * clip_init_context(gpt_params * params) { +static struct clip_ctx * clip_init_context(common_params * params) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -129,7 +129,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str llava_image_embed_free(slice_embed); } -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) { std::string system_prompt; int idx = 0; int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); @@ -162,11 +162,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e LOG_INF("%s: image token past: %d\n", __func__, n_past); } -static const char * sample(struct gpt_sampler * smpl, +static const char * sample(struct common_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); - gpt_sampler_accept(smpl, id, true); + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; @@ -177,7 +177,7 @@ static const char * sample(struct gpt_sampler * smpl, return ret.c_str(); } -static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ +static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){ auto * ctx_clip = clip_init_context(params); auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { @@ -213,7 +213,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri return ctx_llava; } -static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ +static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -237,11 +237,11 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par LOG_INF("\n"); - struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); return smpl; } -static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){ +static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){ const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); return tmp; @@ -250,13 +250,13 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl int main(int argc, char ** argv) { ggml_time_init(); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } - gpt_init(); + common_init(); if (params.mmproj.empty() || (params.image.empty())) { show_additional_info(argc, argv); @@ -290,7 +290,7 @@ int main(int argc, char ** argv) { fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); }else { while (true) { LOG(""); @@ -309,7 +309,7 @@ int main(int argc, char ** argv) { if (strstr(response.c_str(), "")) break; // minicpm-v fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); } } printf("\n"); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 59aaeccf8..c238bcf4b 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,13 +37,13 @@ struct ngram_container { }; int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); const int W = 15; // lookahead window const int N = 5; // n-gram size @@ -56,7 +56,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -115,7 +115,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sparams); // verification n-grams std::vector ngrams_cur(G); @@ -156,9 +156,9 @@ int main(int argc, char ** argv) { // sample first token { - id = gpt_sampler_sample(smpl, ctx, 0); + id = common_sampler_sample(smpl, ctx, 0); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); { const std::string token_str = common_token_to_piece(ctx, id); @@ -281,9 +281,9 @@ int main(int argc, char ** argv) { } // sample the next token - id = gpt_sampler_sample(smpl, ctx, i_batch); + id = common_sampler_sample(smpl, ctx, i_batch); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); // print { @@ -358,7 +358,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); } } else { for (int i = 0; i < W; i++) { @@ -466,9 +466,9 @@ int main(int argc, char ** argv) { LOG_INF("n_accept = %d\n", n_accept); LOG_INF("\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_kv_cache_view_free(&kvc_view); diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 28cedec61..2706f6200 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -12,9 +12,9 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -23,7 +23,7 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index d73290d66..0f4de5510 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -13,13 +13,13 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } - gpt_init(); + common_init(); const int n_draft = params.n_draft; @@ -28,7 +28,7 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 32549904e..7950e38be 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -13,13 +13,13 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } - gpt_init(); + common_init(); // max. number of additional tokens to draft if match is found const int n_draft = params.n_draft; @@ -31,7 +31,7 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -102,7 +102,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sparams); std::vector draft; @@ -126,9 +126,9 @@ int main(int argc, char ** argv){ int i_dft = 0; while (true) { // sample from the target model - llama_token id = gpt_sampler_sample(smpl, ctx, i_dft); + llama_token id = common_sampler_sample(smpl, ctx, i_dft); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); const std::string token_str = common_token_to_piece(ctx, id); @@ -237,9 +237,9 @@ int main(int argc, char ** argv){ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("\ntarget:\n\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_batch_free(batch_tgt); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 34781582f..7ba7ed879 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -33,8 +33,8 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static gpt_sampler ** g_smpl; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; @@ -63,7 +63,7 @@ static bool file_is_empty(const std::string & path) { } static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens ) { @@ -114,12 +114,12 @@ static void sigint_handler(int signo) { } else { console::cleanup(); LOG("\n"); - gpt_perf_print(*g_ctx, *g_smpl); + common_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); - gpt_log_pause(gpt_log_main()); + common_log_pause(common_log_main()); _exit(130); } @@ -136,13 +136,13 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector chat_msgs; @@ -197,7 +197,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); model = llama_init.model; ctx = llama_init.context; @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); } @@ -448,15 +448,15 @@ int main(int argc, char ** argv) { } } - smpl = gpt_sampler_init(model, sparams); + smpl = common_sampler_init(model, sparams); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); return 1; } - LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); @@ -679,9 +679,9 @@ int main(int argc, char ** argv) { LOG_DBG("saved session to %s\n", path_session.c_str()); } - const llama_token id = gpt_sampler_sample(smpl, ctx, -1); + const llama_token id = common_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); + common_sampler_accept(smpl, id, /* accept_grammar= */ true); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); @@ -702,7 +702,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -743,7 +743,7 @@ int main(int argc, char ** argv) { // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { const int n_prev = 32; - const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev); + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -765,7 +765,7 @@ int main(int argc, char ** argv) { } // check for reverse prompt using special tokens - llama_token last_token = gpt_sampler_last(smpl); + llama_token last_token = common_sampler_last(smpl); for (std::vector ids : antiprompt_ids) { if (ids.size() == 1 && last_token == ids[0]) { if (params.interactive) { @@ -782,7 +782,7 @@ int main(int argc, char ** argv) { } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { + if (llama_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found an EOG token\n"); if (params.interactive) { @@ -803,7 +803,7 @@ int main(int argc, char ** argv) { // if current token is not EOG, we add it to current assistant message if (params.conversation) { - const auto id = gpt_sampler_last(smpl); + const auto id = common_sampler_last(smpl); assistant_ss << common_token_to_piece(ctx, id, false); } @@ -899,7 +899,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - gpt_sampler_reset(smpl); + common_sampler_reset(smpl); } is_interacting = false; } @@ -925,10 +925,10 @@ int main(int argc, char ** argv) { } LOG("\n\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_free(ctx); llama_free_model(model); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 72cca11cd..31fbf0945 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -54,7 +54,7 @@ static std::vector k_prompts = { struct client { ~client() { if (smpl) { - gpt_sampler_free(smpl); + common_sampler_free(smpl); } } @@ -75,7 +75,7 @@ struct client { std::string prompt; std::string response; - struct gpt_sampler * smpl = nullptr; + struct common_sampler * smpl = nullptr; }; static void print_date_time() { @@ -103,13 +103,13 @@ static std::vector split_string(const std::string& input, char deli int main(int argc, char ** argv) { srand(1234); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } - gpt_init(); + common_init(); // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -130,7 +130,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -160,7 +160,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.smpl = gpt_sampler_init(model, params.sparams); + client.smpl = common_sampler_init(model, params.sparams); } std::vector tokens_system; @@ -252,7 +252,7 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - gpt_sampler_reset(client.smpl); + common_sampler_reset(client.smpl); // do not prepend BOS because we have a system prompt! std::vector tokens_prompt; @@ -340,9 +340,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i); + const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i); - gpt_sampler_accept(client.smpl, id, true); + common_sampler_accept(client.smpl, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index ae6e40bf4..4c919bf29 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_junk = 250; params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } - gpt_init(); + common_init(); int n_junk = params.n_junk; int n_keep = params.n_keep; @@ -61,7 +61,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_from_common_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -72,7 +72,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = common_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_from_common_params(params); ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ade85430e..1fccf1d8f 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -35,7 +35,7 @@ struct results_log_softmax { }; static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const struct results_perplexity & results ) { if (params.logdir.empty()) { @@ -337,7 +337,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens } } -static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { +static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) { // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` @@ -472,7 +472,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & return {tokens, std::exp(nll / count), logit_history, prob_history}; } -static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { +static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) { if (params.ppl_stride > 0) { return perplexity_v2(ctx, params); } @@ -763,7 +763,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto } } -static void hellaswag_score(llama_context * ctx, const gpt_params & params) { +static void hellaswag_score(llama_context * ctx, const common_params & params) { // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -1102,7 +1102,7 @@ static std::vector load_winogrande_from_csv(const std::string * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 * */ -static void winogrande_score(llama_context * ctx, const gpt_params & params) { +static void winogrande_score(llama_context * ctx, const common_params & params) { constexpr int k_min_trailing_ctx = 3; @@ -1403,7 +1403,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic // git@hf.co:datasets/Stevross/mmlu // https://huggingface.co/datasets/truthful_qa // -static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { +static void multiple_choice_score(llama_context * ctx, const common_params & params) { std::istringstream strstream(params.prompt); uint32_t n_task; @@ -1683,7 +1683,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params LOG_INF("\n"); } -static void kl_divergence(llama_context * ctx, const gpt_params & params) { +static void kl_divergence(llama_context * ctx, const common_params & params) { if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1955,17 +1955,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_ctx = 512; params.logits_all = true; params.escape = false; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; } - gpt_init(); + common_init(); const int32_t n_ctx = params.n_ctx; @@ -2004,7 +2004,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model and apply lora adapter, if any - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -2023,7 +2023,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } struct results_perplexity results; diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index b58543491..24d05885b 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -112,13 +112,13 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } - gpt_init(); + common_init(); // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; @@ -149,7 +149,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -176,7 +176,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } // max batch size diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 2fdb79742..fd9182451 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -6,12 +6,12 @@ #include int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "The quick brown fox"; params.sparams.seed = 1234; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -28,7 +28,7 @@ int main(int argc, char ** argv) { std::string result2; // init - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { llama_free(ctx); // make new context - auto * ctx2 = llama_new_context_with_model(model, common_context_params_from_gpt_params(params)); + auto * ctx2 = llama_new_context_with_model(model, common_context_params_from_common_params(params)); llama_sampler * smpl2 = llama_sampler_chain_init(sparams); @@ -152,7 +152,7 @@ int main(int argc, char ** argv) { } // make new context - auto * ctx3 = llama_new_context_with_model(model, common_context_params_from_gpt_params(params)); + auto * ctx3 = llama_new_context_with_model(model, common_context_params_from_common_params(params)); llama_sampler * smpl3 = llama_sampler_chain_init(sparams); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index bf29d2bcf..97b04a2cd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -188,8 +188,8 @@ struct server_slot { // sampling json json_schema; - struct gpt_sampler_params sparams; - struct gpt_sampler * smpl = nullptr; + struct common_sampler_params sparams; + struct common_sampler * smpl = nullptr; llama_token sampled; @@ -231,7 +231,7 @@ struct server_slot { generated_token_probs.clear(); } - bool has_budget(gpt_params &global_params) { + bool has_budget(common_params &global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -613,7 +613,7 @@ struct server_context { llama_context * ctx = nullptr; std::vector loras; - gpt_params params; + common_params params; llama_batch batch = {}; @@ -655,20 +655,20 @@ struct server_context { // Clear any sampling context for (server_slot & slot : slots) { if (slot.smpl != nullptr) { - gpt_sampler_free(slot.smpl); + common_sampler_free(slot.smpl); } } llama_batch_free(batch); } - bool load_model(const gpt_params & params_) { + bool load_model(const common_params & params_) { params = params_; // dedicate one sequence to the system prompt params.n_parallel += 1; - common_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_common_params(params); model = llama_init.model; ctx = llama_init.context; @@ -1031,7 +1031,7 @@ struct server_context { sampler_names.emplace_back(name); } } - slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); + slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false); } else { slot.sparams.samplers = default_sparams.samplers; } @@ -1039,10 +1039,10 @@ struct server_context { { if (slot.smpl != nullptr) { - gpt_sampler_free(slot.smpl); + common_sampler_free(slot.smpl); } - slot.smpl = gpt_sampler_init(model, slot.sparams); + slot.smpl = common_sampler_init(model, slot.sparams); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); @@ -1224,7 +1224,7 @@ struct server_context { std::vector samplers; samplers.reserve(slot.sparams.samplers.size()); for (const auto & sampler : slot.sparams.samplers) { - samplers.emplace_back(gpt_sampler_type_to_str(sampler)); + samplers.emplace_back(common_sampler_type_to_str(sampler)); } return json { @@ -1232,7 +1232,7 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, - {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0}, + {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, @@ -2092,7 +2092,7 @@ struct server_context { GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - gpt_sampler_reset(slot.smpl); + common_sampler_reset(slot.smpl); if (!slot.params.cache_prompt) { slot.n_past_se = 0; @@ -2105,7 +2105,7 @@ struct server_context { // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { - gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false); + common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); } } } @@ -2159,7 +2159,7 @@ struct server_context { slot.n_past_se = 0; slot.ga_i = 0; // TODO: is the system prompt ever in the sampling context? - gpt_sampler_reset(slot.smpl); + common_sampler_reset(slot.smpl); } // remove the non-common part from the cache @@ -2322,9 +2322,9 @@ struct server_context { } completion_token_output result; - const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); - gpt_sampler_accept(slot.smpl, id, true); + common_sampler_accept(slot.smpl, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) { @@ -2335,7 +2335,7 @@ struct server_context { result.tok = id; - const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); + const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { result.probs.push_back({ @@ -2399,13 +2399,13 @@ inline void signal_handler(int signal) { int main(int argc, char ** argv) { // own arguments required by this example - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } - gpt_init(); + common_init(); // enabling this will output extra debug information in the HTTP responses from the server // see format_final_response_oaicompat() @@ -2427,7 +2427,7 @@ int main(int argc, char ** argv) { LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); std::unique_ptr svr; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index eb4498e7c..0867bf3c6 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -12,16 +12,16 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } - gpt_init(); + common_init(); // total length of the sequence including the prompt const int n_predict = params.n_predict; @@ -33,7 +33,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = common_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_from_common_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -44,7 +44,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = common_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_from_common_params(params); llama_context * ctx = llama_new_context_with_model(model, ctx_params); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b6ab1b08a..55d13fc14 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -26,20 +26,20 @@ struct seq_draft { std::vector tokens; std::vector> dists; - struct gpt_sampler * smpl = nullptr; + struct common_sampler * smpl = nullptr; }; int main(int argc, char ** argv) { - gpt_params params; + common_params params; // needed to get candidate probs even for temp <= 0.0 params.sparams.n_probs = 128; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; } - gpt_init(); + common_init(); if (params.model_draft.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); @@ -66,7 +66,7 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - common_init_result llama_init_tgt = llama_init_from_gpt_params(params); + common_init_result llama_init_tgt = common_init_from_common_params(params); model_tgt = llama_init_tgt.model; ctx_tgt = llama_init_tgt.context; @@ -78,7 +78,7 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; - common_init_result llama_init_dft = llama_init_from_gpt_params(params); + common_init_result llama_init_dft = common_init_from_common_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; @@ -178,7 +178,7 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context (reuse the llama_context's sampling instance) - struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams); + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams); struct llama_sampler * softmax = llama_sampler_init_softmax(); @@ -186,8 +186,8 @@ int main(int argc, char ** argv) { std::vector drafts(n_seq_dft); for (int s = 0; s < n_seq_dft; ++s) { - // allocate gpt_sampler for each draft sequence - drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams); + // allocate llama_sampler for each draft sequence + drafts[s].smpl = common_sampler_init(model_dft, params.sparams); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -229,9 +229,9 @@ int main(int argc, char ** argv) { bool accept = false; if (params.sparams.temp > 0) { // stochastic verification - gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); + common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); - auto & dist_tgt = *gpt_sampler_get_candidates(smpl); + auto & dist_tgt = *common_sampler_get_candidates(smpl); float p_tgt = 0.0f; float p_dft = 0.0f; @@ -278,7 +278,7 @@ int main(int argc, char ** argv) { accept = true; token_id = drafts[s].tokens[i_dft]; token_str = common_token_to_piece(ctx_tgt, token_id); - gpt_sampler_accept(smpl, token_id, true); + common_sampler_accept(smpl, token_id, true); LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; @@ -349,7 +349,7 @@ int main(int argc, char ** argv) { const int idx = dist(rng); token_id = dist_tgt.data[idx].id; - gpt_sampler_accept(smpl, token_id, true); + common_sampler_accept(smpl, token_id, true); token_str = common_token_to_piece(ctx_tgt, token_id); } } else { @@ -357,9 +357,9 @@ int main(int argc, char ** argv) { // sample from the target model LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); + token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); - gpt_sampler_accept(smpl, token_id, true); + common_sampler_accept(smpl, token_id, true); token_str = common_token_to_piece(ctx_tgt, token_id); @@ -446,9 +446,9 @@ int main(int argc, char ** argv) { } if (drafts[0].smpl) { - gpt_sampler_free(drafts[0].smpl); + common_sampler_free(drafts[0].smpl); } - drafts[0].smpl = gpt_sampler_clone(smpl); + drafts[0].smpl = common_sampler_clone(smpl); int n_seq_cur = 1; int n_past_cur = n_past_dft; @@ -477,9 +477,9 @@ int main(int argc, char ** argv) { continue; } - gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); + common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); - const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); + const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", @@ -518,9 +518,9 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; if (drafts[n_seq_cur].smpl) { - gpt_sampler_free(drafts[n_seq_cur].smpl); + common_sampler_free(drafts[n_seq_cur].smpl); } - drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); + drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl); sa.push_back(n_seq_cur); @@ -536,7 +536,7 @@ int main(int argc, char ** argv) { const int s = sa[is]; - gpt_sampler_accept(drafts[s].smpl, id, true); + common_sampler_accept(drafts[s].smpl, id, true); drafts[s].tokens.push_back(id); // save cur_p.data into drafts[s].dists @@ -617,11 +617,11 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("target:\n\n"); - gpt_perf_print(ctx_tgt, smpl); + common_perf_print(ctx_tgt, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); for (int s = 0; s < n_seq_dft; ++s) { - gpt_sampler_free(drafts[s].smpl); + common_sampler_free(drafts[s].smpl); } llama_sampler_free(softmax); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index e07d09733..3665238b5 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -10,12 +10,12 @@ #include int main(void) { - gpt_params params; + common_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { try { - auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex); + auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex); std::unordered_set seen_args; std::unordered_set seen_env_vars; for (const auto & opt : ctx_arg.options) { @@ -58,44 +58,44 @@ int main(void) { // missing value argv = {"binary_name", "-m"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); // wrong value (int) argv = {"binary_name", "-ngl", "hello"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); // wrong value (enum) argv = {"binary_name", "-sm", "hello"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); // non-existence arg in specific example (--draft cannot be used outside llama-speculative) argv = {"binary_name", "--draft", "123"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); printf("test-arg-parser: test valid usage\n\n"); argv = {"binary_name", "-m", "model_file.gguf"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "model_file.gguf"); argv = {"binary_name", "-t", "1234"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.cpuparams.n_threads == 1234); argv = {"binary_name", "--verbose"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.verbosity > 1); argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "abc.gguf"); assert(params.n_predict == 6789); assert(params.n_batch == 9090); // --draft cannot be used outside llama-speculative argv = {"binary_name", "--draft", "123"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); assert(params.n_draft == 123); // skip this part on windows, because setenv is not supported @@ -106,12 +106,12 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "blah", true); argv = {"binary_name"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); @@ -121,7 +121,7 @@ int main(void) { setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name", "-m", "overwritten.gguf"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 diff --git a/tests/test-log.cpp b/tests/test-log.cpp index 211222369..306f28c61 100644 --- a/tests/test-log.cpp +++ b/tests/test-log.cpp @@ -24,8 +24,8 @@ int main() { } if (rand () % 10 < 5) { - gpt_log_set_timestamps(gpt_log_main(), rand() % 2); - gpt_log_set_prefix (gpt_log_main(), rand() % 2); + common_log_set_timestamps(common_log_main(), rand() % 2); + common_log_set_prefix (common_log_main(), rand() % 2); } } });