diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 56fefd93d..9044cd78b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -37,9 +37,9 @@ jobs: - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete. + # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete + #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } diff --git a/.gitignore b/.gitignore index 9986ac6b1..1092d097a 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ llama-batched-swift /rpc-server out/ tmp/ +autogen-*.md # Deprecated diff --git a/Makefile b/Makefile index 332496cfc..c12bc61f4 100644 --- a/Makefile +++ b/Makefile @@ -39,10 +39,12 @@ BUILD_TARGETS = \ llama-tokenize \ llama-vdot \ llama-cvector-generator \ + llama-gen-docs \ tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ + tests/test-arg-parser \ tests/test-autorelease \ tests/test-backend-ops \ tests/test-chat-template \ @@ -923,11 +925,11 @@ OBJ_LLAMA = \ OBJ_COMMON = \ common/common.o \ + common/arg.o \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ common/train.o \ - common/grammar-parser.o \ common/build-info.o \ common/json-schema-to-grammar.o @@ -1156,6 +1158,11 @@ common/common.o: \ include/llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ +common/arg.o: \ + common/arg.cpp \ + common/arg.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ @@ -1167,11 +1174,6 @@ common/console.o: \ common/console.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common/grammar-parser.o: \ - common/grammar-parser.cpp \ - common/grammar-parser.h - $(CXX) $(CXXFLAGS) -c $< -o $@ - common/json-schema-to-grammar.o: \ common/json-schema-to-grammar.cpp \ common/json-schema-to-grammar.h @@ -1448,6 +1450,11 @@ examples/server/%.hpp: examples/server/public/% Makefile echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ ) > $@ +llama-gen-docs: examples/gen-docs/gen-docs.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ examples/llava/clip.cpp \ @@ -1505,6 +1512,11 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift +tests/test-arg-parser: tests/test-arg-parser.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c 
$< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/README.md b/README.md index e30ab0c8c..c945e125c 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics -- *add hot topics here* +- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- @@ -163,6 +163,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) - [AIKit](https://github.com/sozercan/aikit) (MIT) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) +- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 761971d68..22fd99689 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -54,12 +54,12 @@ add_library(${TARGET} STATIC base64.hpp common.h common.cpp + arg.h + arg.cpp sampling.h sampling.cpp console.h console.cpp - grammar-parser.h - grammar-parser.cpp json.hpp json-schema-to-grammar.cpp train.h diff --git a/common/arg.cpp b/common/arg.cpp new file mode 100644 index 000000000..c5134be51 --- /dev/null +++ b/common/arg.cpp @@ -0,0 +1,1994 @@ +#include "arg.h" + +#include "sampling.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json-schema-to-grammar.h" + +using json = nlohmann::ordered_json; + +llama_arg & llama_arg::set_examples(std::initializer_list examples) { + this->examples = std::move(examples); + return *this; +} + +llama_arg & llama_arg::set_env(const char * env) { + help = help + "\n(env: " + env + ")"; + this->env = env; + return *this; +} + +llama_arg & llama_arg::set_sparam() { + is_sparam = true; + return *this; +} + +bool llama_arg::in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); +} + +bool llama_arg::get_value_from_env(std::string & output) { + if (env == nullptr) return false; + char * value = std::getenv(env); + if (value) { + output = value; + return true; + } + return false; +} + +bool llama_arg::has_value_from_env() { + return env != nullptr && std::getenv(env); +} + +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? 
" " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +std::string llama_arg::to_string() { + // params for printing to console + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + for (const auto arg : args) { + if (arg == args.front()) { + if (args.size() == 1) { + ss << arg; + } else { + // first arg is usually abbreviation, we need padding to make it more beautiful + auto tmp = std::string(arg) + ", "; + auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' '); + ss << tmp << spaces; + } + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + } + return ss.str(); +} + +// +// utils +// + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +static void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + params.model = fs_get_cache_file(string_split(f, '/').back()); + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + +// +// CLI argument parsing functions +// + +static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { + std::string arg; + const std::string arg_prefix = "--"; + gpt_params & params = ctx_arg.params; + gpt_sampler_params & sparams = params.sparams; + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // handle environment variables + for (auto & opt : ctx_arg.options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(params); + } + if (opt.handler_int) { + opt.handler_int(params, std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(params, value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); + } + } + } + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), 
e.what(), arg_to_options[arg]->to_string().c_str())); + } + } + + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + if (sparams.seed == LLAMA_DEFAULT_SEED) { + sparams.seed = time(NULL); + } + + return true; +} + +static void gpt_params_print_usage(gpt_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string().c_str()); + } + }; + + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + for (auto & opt : ctx_arg.options) { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + printf("----- common params -----\n\n"); + print_options(common_options); + printf("\n\n----- sampling params -----\n\n"); + print_options(sparam_options); + // TODO: maybe convert enum llama_example to string + printf("\n\n----- example-specific params -----\n\n"); + print_options(specific_options); +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); + const gpt_params params_org = ctx_arg.params; // the example can modify the default params + + try { + if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + ctx_arg.params = params_org; + return false; + } + if (ctx_arg.params.usage) { + gpt_params_print_usage(ctx_arg); + if (ctx_arg.print_usage) { + ctx_arg.print_usage(argc, argv); + } + exit(0); + } + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + ctx_arg.params = params_org; + return false; + } + + return true; +} + +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + gpt_params_context ctx_arg(params); + ctx_arg.print_usage = print_usage; + ctx_arg.ex = ex; + + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto & sampler : params.sparams.samplers) { + sampler_type_chars += gpt_sampler_type_to_chr(sampler); + sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + } + sampler_type_names.pop_back(); + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + auto add_opt = 
[&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + ctx_arg.options.push_back(std::move(arg)); + } + }; + + + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [](gpt_params & params) { + params.usage = true; + } + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + [](gpt_params &) { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + )); + add_opt(llama_arg( + {"-v", "--verbose"}, + "print verbose information", + [](gpt_params & params) { + params.verbosity = 1; + } + )); + add_opt(llama_arg( + {"--verbosity"}, "N", + format("set specific verbosity level (default: %d)", params.verbosity), + [](gpt_params & params, int value) { + params.verbosity = value; + } + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](gpt_params & params) { + params.verbose_prompt = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](gpt_params & params) { + params.display_prompt = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](gpt_params & params) { + params.use_color = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](gpt_params & params, int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_env("LLAMA_ARG_THREADS")); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [](gpt_params & params, int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](gpt_params & params, int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range (default: \"\")", + [](gpt_params & params, const std::string & mask) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [](gpt_params & params, const std::string & range) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](gpt_params & params, const std::string & value) { + params.cpuparams.strict_cpu = std::stoul(value); + } + )); + add_opt(llama_arg( + {"--prio"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](gpt_params & params, const std::string & value) { + params.cpuparams.poll = std::stoul(value); + } + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch", + [](gpt_params & params, const std::string & range) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--prio-batch"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll)", + [](gpt_params & params, int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.draft_cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](gpt_params & params, int value) { + params.draft_cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-batch-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [](gpt_params & params, int value) { + params.n_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-ps", "--p-split"}, "N", + format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [](gpt_params & params, const std::string & value) { + params.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_static = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_dynamic = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-c", "--ctx-size"}, "N", + format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [](gpt_params & params, int value) { + params.n_ctx = value; + } + ).set_env("LLAMA_ARG_CTX_SIZE")); + add_opt(llama_arg( + {"-n", "--predict", "--n-predict"}, "N", + format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [](gpt_params & params, int value) { + params.n_predict = value; + } + ).set_env("LLAMA_ARG_N_PREDICT")); + add_opt(llama_arg( + {"-b", "--batch-size"}, "N", + format("logical maximum batch size (default: %d)", params.n_batch), + [](gpt_params & params, int value) { + params.n_batch = value; + } + ).set_env("LLAMA_ARG_BATCH")); + add_opt(llama_arg( + {"-ub", "--ubatch-size"}, "N", + format("physical maximum batch size (default: %d)", params.n_ubatch), + [](gpt_params & params, int value) { + params.n_ubatch = value; + } + ).set_env("LLAMA_ARG_UBATCH")); + add_opt(llama_arg( + {"--keep"}, "N", + format("number of tokens to keep from the initial prompt (default: %d, -1 
= all)", params.n_keep), + [](gpt_params & params, int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](gpt_params & params, int value) { + params.n_chunks = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](gpt_params & params) { + params.flash_attn = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", + [](gpt_params & params, const std::string & value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), + [](gpt_params & params) { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [](gpt_params & params) { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [](gpt_params & params, int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [](gpt_params & params, const std::string & value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [](gpt_params & params) { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [](gpt_params & params) { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [](gpt_params & params, const std::string & value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](gpt_params & params) { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? "true" : "false" + ), + [](gpt_params & params) { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](gpt_params & params) { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), + [](gpt_params & params) { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [](gpt_params & params) { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [](gpt_params & params) { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [](gpt_params & params) { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [](gpt_params & params) { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](gpt_params & params, const std::string & value) { + const auto sampler_names = string_split(value, ';'); + params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + } + ).set_sparam()); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), + [](gpt_params & params, const std::string & value) { + params.sparams.seed = std::stoul(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.samplers = gpt_sampler_types_from_chars(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [](gpt_params & params) { + params.sparams.ignore_eos = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? 
"true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typ_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use 
Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + [](gpt_params & params, int value) { + params.sparams.mirostat = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat-lr"}, "N", + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_eta = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat-ent"}, "N", + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_tau = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [](gpt_params & params, const std::string & value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sparams.logit_bias.push_back({key, bias}); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + ).set_sparam()); + add_opt(llama_arg( + {"--grammar"}, "GRAMMAR", + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(params.sparams.grammar) + ); + } + ).set_sparam()); + add_opt(llama_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = json_schema_to_grammar(json::parse(value)); + } + ).set_sparam()); + add_opt(llama_arg( + {"--pooling"}, "{none,mean,cls,last}", + "pooling type for embeddings, use model default if unspecified", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--attention"}, "{causal,non,causal}", + "attention type for embeddings, use model default if unspecified", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--rope-scaling"}, "{none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"--rope-scale"}, "N", + "RoPE context scaling factor, expands context by a factor of N", + [](gpt_params & params, const std::string & value) { + params.rope_freq_scale = 1.0f / std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-base"}, "N", + "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", + [](gpt_params & params, const std::string & value) { + params.rope_freq_base = std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-scale"}, "N", + "RoPE frequency scaling factor, expands context by a factor of 1/N", + [](gpt_params & params, const std::string & value) { + params.rope_freq_scale = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-orig-ctx"}, "N", + format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [](gpt_params & params, int value) { + params.yarn_orig_ctx = value; + } + )); + add_opt(llama_arg( + {"--yarn-ext-factor"}, "N", + format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [](gpt_params & params, const std::string & value) { + params.yarn_ext_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-attn-factor"}, "N", + format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [](gpt_params & params, const std::string & value) { + params.yarn_attn_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-slow"}, "N", + format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [](gpt_params & 
params, const std::string & value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_fast = std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [](gpt_params & params, int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [](gpt_params & params, int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [](gpt_params & params) { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [](gpt_params & params) { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--perplexity", "--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](gpt_params & params) { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](gpt_params & params, int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](gpt_params & params, int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](gpt_params & params, int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via 
--kl-divergence-base", + [](gpt_params & params) { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--save-all-logits", "--kl-divergence-base"}, "FNAME", + "set logits file", + [](gpt_params & params, const std::string & value) { + params.logits_file = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](gpt_params & params, int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](gpt_params & params, int value) { + params.ppl_output_type = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](gpt_params & params, const std::string & value) { + params.defrag_thold = std::stof(value); + } + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](gpt_params & params, int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [](gpt_params & params, int value) { + params.n_sequences = value; + } + ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](gpt_params & params) { + params.cont_batching = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [](gpt_params & params) { + params.cont_batching = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](gpt_params & params, const std::string & value) { + params.mmproj = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. 
Specify multiple times for batching", + [](gpt_params & params, const std::string & value) { + params.image.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); +#ifdef GGML_USE_RPC + add_opt(llama_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](gpt_params & params, const std::string & value) { + params.rpc_servers = value; + } + )); +#endif + add_opt(llama_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [](gpt_params & params) { + params.use_mlock = true; + } + )); + add_opt(llama_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [](gpt_params & params) { + params.use_mmap = false; + } + )); + add_opt(llama_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", + "number of layers to store in VRAM", + [](gpt_params & params, int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + add_opt(llama_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](gpt_params & params, int value) { + params.n_gpu_layers_draft = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-sm", "--split-mode"}, "{none,layer,row}", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + throw std::invalid_argument("invalid value"); + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"-mg", "--main-gpu"}, "INDEX", + format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](gpt_params & params, int value) { + params.main_gpu = value; +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"--check-tensors"}, + format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](gpt_params & params) { + params.check_tensors = true; + } + )); + add_opt(llama_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [](gpt_params & params, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [](gpt_params & params, const std::string & value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [](gpt_params & params, const std::string & start, const std::string & end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [](gpt_params & params, const std::string & value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? 
std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [](gpt_params & params, const std::string & value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_url = value; + } + ).set_env("LLAMA_ARG_MODEL_URL")); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO")); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [](gpt_params & params, const std::string & value) { + params.hf_token = value; + } + ).set_env("HF_TOKEN")); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](gpt_params & params, int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](gpt_params & params, const std::string & value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](gpt_params & params, int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](gpt_params & params, int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output", "--output-file"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? 
params.cvector_outfile.c_str() + : params.out_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.out_file = value; + params.cvector_outfile = value; + params.lora_outfile = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-ofreq", "--output-frequency"}, "N", + format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](gpt_params & params, int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--save-frequency"}, "N", + format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](gpt_params & params, int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--process-output"}, + format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](gpt_params & params) { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--no-ppl"}, + format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](gpt_params & params) { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--chunk", "--from-chunk"}, "N", + format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](gpt_params & params, int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-pps"}, + format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + [](gpt_params & params) { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--embd-normalize"}, "N", + format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](gpt_params & params, int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [](gpt_params & params, const std::string & value) { + params.embd_out = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-separator"}, "STRING", + "separator of embendings (default \\n) for example \"<#sep#>\"", + [](gpt_params & params, const std::string & value) { + params.embd_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + 
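The registrations above all follow the same llama_arg builder pattern. A minimal sketch of that pattern, assuming it sits inside gpt_params_parser_init() in common/arg.cpp where add_opt() is in scope; the option --my-count, its help text, and the LLAMA_ARG_MY_COUNT variable are hypothetical names used only for illustration:

    add_opt(llama_arg(
        {"-mc", "--my-count"},                // one or more aliases for the flag
        "N",                                  // value hint shown in the help output
        "hypothetical example option",
        [](gpt_params & params, int value) {  // non-capturing lambda converts to the
            params.n_predict = value;         // raw handler_int function pointer
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER})    // list it only under llama-server
     .set_env("LLAMA_ARG_MY_COUNT"));         // optional environment-variable override

Flag-only options use the (args, help, handler) constructor, and two-value options such as --lora-scaled use the (args, value_hint, value_hint_2, help, handler) overload, as declared in common/arg.h below.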
add_opt(llama_arg( + {"--host"}, "HOST", + format("ip address to listen (default: %s)", params.hostname.c_str()), + [](gpt_params & params, const std::string & value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + add_opt(llama_arg( + {"--port"}, "PORT", + format("port to listen (default: %d)", params.port), + [](gpt_params & params, int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + add_opt(llama_arg( + {"--path"}, "PATH", + format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](gpt_params & params, const std::string & value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--embedding", "--embeddings"}, + format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](gpt_params & params) { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + add_opt(llama_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [](gpt_params & params, const std::string & value) { + params.api_keys.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + add_opt(llama_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-key-file"}, "FNAME", + "path to file a PEM-encoded SSL private key", + [](gpt_params & params, const std::string & value) { + params.ssl_file_key = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-cert-file"}, "FNAME", + "path to file a PEM-encoded SSL certificate", + [](gpt_params & params, const std::string & value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-to", "--timeout"}, "N", + format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](gpt_params & params, int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--threads-http"}, "N", + format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](gpt_params & params, int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + add_opt(llama_arg( + {"-spf", "--system-prompt-file"}, "FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + 
{"--log-format"}, "{text, json}", + "log output format: json or text (default: json)", + [](gpt_params & params, const std::string & value) { + if (value == "json") { + params.log_json = true; + } else if (value == "text") { + params.log_json = false; + } else { + throw std::invalid_argument("invalid value"); + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [](gpt_params & params, const std::string & value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [](gpt_params & params, const std::string & value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](gpt_params & params, const std::string & value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), + [](gpt_params & params) { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [](gpt_params & params) { + params.simple_io = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [](gpt_params & params, const std::string & value) { + params.logdir = value; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + } + )); + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_positive_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](gpt_params & params, int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](gpt_params & params, int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--output-format"}, "{md,jsonl}", + "output format for batched-bench results (default: md)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } + else if (value == "md") { params.batched_bench_output_jsonl = false; } + else { std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); +#ifndef LOG_DISABLE_LOGS + // TODO: make this looks less weird + add_opt(llama_arg( + {"--log-test"}, + "Log test", + [](gpt_params &) { log_param_single_parse("--log-test"); } + )); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + [](gpt_params &) { log_param_single_parse("--log-disable"); } + )); + add_opt(llama_arg( + {"--log-enable"}, + "Log enable", + [](gpt_params &) { log_param_single_parse("--log-enable"); } + )); + add_opt(llama_arg( + {"--log-new"}, + "Log new", + [](gpt_params &) { log_param_single_parse("--log-new"); } + )); + add_opt(llama_arg( + {"--log-append"}, + "Log append", + [](gpt_params &) { log_param_single_parse("--log-append"); } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log file", + [](gpt_params &, const std::string & value) { log_param_pair_parse(false, 
"--log-file", value); } + )); +#endif // LOG_DISABLE_LOGS + + return ctx_arg; +} + diff --git a/common/arg.h b/common/arg.h new file mode 100644 index 000000000..413de2c88 --- /dev/null +++ b/common/arg.h @@ -0,0 +1,77 @@ +#pragma once + +#include "common.h" + +#include +#include +#include + +// +// CLI argument parsing +// + +struct llama_arg { + std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; + std::string help; + bool is_sparam = false; // is current arg a sampling param? + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(gpt_params & params) + ) : args(args), help(help), handler_void(handler) {} + + // support 2 values for arg + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + + llama_arg & set_examples(std::initializer_list examples); + llama_arg & set_env(const char * env); + llama_arg & set_sparam(); + bool in_example(enum llama_example ex); + bool get_value_from_env(std::string & output); + bool has_value_from_env(); + std::string to_string(); +}; + +struct gpt_params_context { + enum llama_example ex = LLAMA_EXAMPLE_COMMON; + gpt_params & params; + std::vector options; + void(*print_usage)(int, char **) = nullptr; + gpt_params_context(gpt_params & params) : params(params) {} +}; + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); + +// function to be used by test-arg-parser +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index de2a177c1..5395eaa0e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -77,41 +77,6 @@ using json = nlohmann::ordered_json; -// -// Environment variable utils -// - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? 
std::string(value) : target; -} - -template -static typename std::enable_if::value && std::is_integral::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::stoi(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::stof(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - if (value) { - std::string val(value); - target = val == "1" || val == "true"; - } -} - // // CPU utils // @@ -306,27 +271,6 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // -void gpt_params_handle_model_default(gpt_params & params) { - if (!params.hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); - } - params.hf_file = params.model; - } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); - } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); - } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; - } -} void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { int32_t n_set = 0; @@ -352,102 +296,6 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) } } -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { - bool invalid_param = false; - std::string arg; - const std::string arg_prefix = "--"; - llama_sampling_params & sparams = params.sparams; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { - throw std::invalid_argument("error: unknown argument: " + arg); - } - if (invalid_param) { - throw std::invalid_argument("error: invalid parameter for argument: " + arg); - } - } - - postprocess_cpu_params(params.cpuparams, nullptr); - postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); - - if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - gpt_params_handle_model_default(params); - - if (params.hf_token.empty()) { - get_env("HF_TOKEN", params.hf_token); - } - - if (params.escape) { - string_process_escapes(params.prompt); - string_process_escapes(params.input_prefix); - string_process_escapes(params.input_suffix); - string_process_escapes(sparams.cfg_negative_prompt); - for (auto & antiprompt : params.antiprompt) { - string_process_escapes(antiprompt); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - return true; -} - -void gpt_params_parse_from_env(gpt_params & params) { - // we only care 
about server-related params for now - get_env("LLAMA_ARG_MODEL", params.model); - get_env("LLAMA_ARG_MODEL_URL", params.model_url); - get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); - get_env("LLAMA_ARG_HF_REPO", params.hf_repo); - get_env("LLAMA_ARG_HF_FILE", params.hf_file); - get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads); - get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); - get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); - get_env("LLAMA_ARG_BATCH", params.n_batch); - get_env("LLAMA_ARG_UBATCH", params.n_ubatch); - get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers); - get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http); - get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template); - get_env("LLAMA_ARG_N_PREDICT", params.n_predict); - get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics); - get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots); - get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); - get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); - get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); - get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); - get_env("LLAMA_ARG_HOST", params.hostname); - get_env("LLAMA_ARG_PORT", params.port); -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - const auto params_org = params; // the example can modify the default params - - try { - if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { - params = params_org; - params.usage = true; - return false; - } - } catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - params = params_org; - return false; - } - - return true; -} - bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { @@ -521,1590 +369,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } - -bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { - const char split_delim = ','; - - llama_sampling_params & sparams = params.sparams; - - if (arg == "-s" || arg == "--seed") { - CHECK_ARG - // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
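The legacy parsing path being deleted here (the get_env() helpers, gpt_params_handle_model_default(), gpt_params_parse_ex(), gpt_params_parse_from_env(), and the long gpt_params_find_arg() chain that continues below) is replaced by the single entry point declared in common/arg.h above. A minimal sketch of the new call site, assuming it lives in an example's main(); the choice of LLAMA_EXAMPLE_SERVER is illustrative, and environment overrides registered via set_env() (such as LLAMA_ARG_HOST) are intended to be applied by this same call, given that arg.h exposes get_value_from_env()/has_value_from_env() and the separate env-parsing function is removed in this patch:

    #include "arg.h"
    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        // parses CLI arguments (and registered env overrides) into params;
        // per the header comment, an invalid value prints the usage of that
        // specific argument rather than the full help text
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
            return 1;
        }
        // ... run the example with the populated params ...
        return 0;
    }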
- params.seed = std::stoul(argv[i]); - sparams.seed = std::stoul(argv[i]); - return true; - } - if (arg == "-t" || arg == "--threads") { - CHECK_ARG - params.cpuparams.n_threads = std::stoi(argv[i]); - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-C" || arg == "--cpu-mask") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); - return true; - } - if (arg == "-Cr" || arg == "--cpu-range") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); - return true; - } - if (arg == "--prio") { - CHECK_ARG - params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict") { - CHECK_ARG - params.cpuparams.strict_cpu = std::stoul(argv[i]); - return true; - } - if (arg == "--poll") { - CHECK_ARG - params.cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tb" || arg == "--threads-batch") { - CHECK_ARG - params.cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-Cb" || arg == "--cpu-mask-batch") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "-Crb" || arg == "--cpu-range_batch") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch") { - CHECK_ARG - params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch") { - params.cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch") { - CHECK_ARG - params.cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-td" || arg == "--threads-draft") { - CHECK_ARG - params.draft_cpuparams.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-Cd" || arg == "--cpu-mask-draft") { - CHECK_ARG - std::string mask = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "-Crd" || arg == "--cpu-range-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "--prio-draft") { - CHECK_ARG - params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-draft") { - params.draft_cpuparams.strict_cpu = true; - return true; - } - if (arg == "--poll-draft") { - CHECK_ARG - params.draft_cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - 
return true; - } - if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch-draft") { - params.draft_cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-p" || arg == "--prompt") { - CHECK_ARG - params.prompt = argv[i]; - return true; - } - if (arg == "-e" || arg == "--escape") { - params.escape = true; - return true; - } - if (arg == "--no-escape") { - params.escape = false; - return true; - } - if (arg == "--prompt-cache") { - CHECK_ARG - params.path_prompt_cache = argv[i]; - return true; - } - if (arg == "--prompt-cache-all") { - params.prompt_cache_all = true; - return true; - } - if (arg == "--prompt-cache-ro") { - params.prompt_cache_ro = true; - return true; - } - if (arg == "-bf" || arg == "--binary-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - return true; - } - if (arg == "-f" || arg == "--file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - return true; - } - if (arg == "--in-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - params.in_files.push_back(argv[i]); - return true; - } - if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - CHECK_ARG - params.n_predict = std::stoi(argv[i]); - return true; - } - if (arg == "--top-k") { - CHECK_ARG - sparams.top_k = std::stoi(argv[i]); - return true; - } - if (arg == "-c" || arg == "--ctx-size") { - CHECK_ARG - params.n_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-n" || arg == "-gan") { - CHECK_ARG - params.grp_attn_n = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-w" || arg == "-gaw") { - CHECK_ARG - params.grp_attn_w = std::stoi(argv[i]); - return true; - } - if (arg == "--rope-freq-base") { - CHECK_ARG - params.rope_freq_base = std::stof(argv[i]); - return true; - } - if (arg == "--rope-freq-scale") { - CHECK_ARG - params.rope_freq_scale = std::stof(argv[i]); - return true; - } - if (arg == "--rope-scaling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { 
params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; } - return true; - } - if (arg == "--rope-scale") { - CHECK_ARG - params.rope_freq_scale = 1.0f / std::stof(argv[i]); - return true; - } - if (arg == "--yarn-orig-ctx") { - CHECK_ARG - params.yarn_orig_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--yarn-ext-factor") { - CHECK_ARG - params.yarn_ext_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-attn-factor") { - CHECK_ARG - params.yarn_attn_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-fast") { - CHECK_ARG - params.yarn_beta_fast = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-slow") { - CHECK_ARG - params.yarn_beta_slow = std::stof(argv[i]); - return true; - } - if (arg == "--pooling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { invalid_param = true; } - return true; - } - if (arg == "--attention") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { invalid_param = true; } - return true; - } - if (arg == "--defrag-thold" || arg == "-dt") { - CHECK_ARG - params.defrag_thold = std::stof(argv[i]); - return true; - } - if (arg == "--samplers") { - CHECK_ARG - const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); - return true; - } - if (arg == "--sampling-seq") { - CHECK_ARG - sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); - return true; - } - if (arg == "--top-p") { - CHECK_ARG - sparams.top_p = std::stof(argv[i]); - return true; - } - if (arg == "--min-p") { - CHECK_ARG - sparams.min_p = std::stof(argv[i]); - return true; - } - if (arg == "--temp") { - CHECK_ARG - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - return true; - } - if (arg == "--tfs") { - CHECK_ARG - sparams.tfs_z = std::stof(argv[i]); - return true; - } - if (arg == "--typical") { - CHECK_ARG - sparams.typical_p = std::stof(argv[i]); - return true; - } - if (arg == "--repeat-last-n") { - CHECK_ARG - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - return true; - } - if (arg == "--repeat-penalty") { - CHECK_ARG - sparams.penalty_repeat = std::stof(argv[i]); - return true; - } - if (arg == "--frequency-penalty") { - CHECK_ARG - sparams.penalty_freq = std::stof(argv[i]); - return true; - } - if (arg == "--presence-penalty") { - CHECK_ARG - sparams.penalty_present = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-range") { - CHECK_ARG - sparams.dynatemp_range = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-exp") { - CHECK_ARG - sparams.dynatemp_exponent = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat") { - CHECK_ARG - sparams.mirostat = std::stoi(argv[i]); - return true; - } - if (arg == "--mirostat-lr") { - CHECK_ARG - sparams.mirostat_eta = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat-ent") { - CHECK_ARG - sparams.mirostat_tau = 
std::stof(argv[i]); - return true; - } - if (arg == "--cfg-negative-prompt") { - CHECK_ARG - sparams.cfg_negative_prompt = argv[i]; - return true; - } - if (arg == "--cfg-negative-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); - } - return true; - } - if (arg == "--cfg-scale") { - CHECK_ARG - sparams.cfg_scale = std::stof(argv[i]); - return true; - } - if (arg == "-b" || arg == "--batch-size") { - CHECK_ARG - params.n_batch = std::stoi(argv[i]); - return true; - } - if (arg == "-ub" || arg == "--ubatch-size") { - CHECK_ARG - params.n_ubatch = std::stoi(argv[i]); - return true; - } - if (arg == "--keep") { - CHECK_ARG - params.n_keep = std::stoi(argv[i]); - return true; - } - if (arg == "--draft") { - CHECK_ARG - params.n_draft = std::stoi(argv[i]); - return true; - } - if (arg == "--chunks") { - CHECK_ARG - params.n_chunks = std::stoi(argv[i]); - return true; - } - if (arg == "-np" || arg == "--parallel") { - CHECK_ARG - params.n_parallel = std::stoi(argv[i]); - return true; - } - if (arg == "-ns" || arg == "--sequences") { - CHECK_ARG - params.n_sequences = std::stoi(argv[i]); - return true; - } - if (arg == "--p-split" || arg == "-ps") { - CHECK_ARG - params.p_split = std::stof(argv[i]); - return true; - } - if (arg == "-m" || arg == "--model") { - CHECK_ARG - params.model = argv[i]; - return true; - } - if (arg == "-md" || arg == "--model-draft") { - CHECK_ARG - params.model_draft = argv[i]; - return true; - } - if (arg == "-a" || arg == "--alias") { - CHECK_ARG - params.model_alias = argv[i]; - return true; - } - if (arg == "-mu" || arg == "--model-url") { - CHECK_ARG - params.model_url = argv[i]; - return true; - } - if (arg == "-hft" || arg == "--hf-token") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.hf_token = argv[i]; - return true; - } - if (arg == "-hfr" || arg == "--hf-repo") { - CHECK_ARG - params.hf_repo = argv[i]; - return true; - } - if (arg == "-hff" || arg == "--hf-file") { - CHECK_ARG - params.hf_file = argv[i]; - return true; - } - if (arg == "--lora") { - CHECK_ARG - params.lora_adapters.push_back({ - std::string(argv[i]), - 1.0, - }); - return true; - } - if (arg == "--lora-scaled") { - CHECK_ARG - std::string lora_adapter = argv[i]; - CHECK_ARG - params.lora_adapters.push_back({ - lora_adapter, - std::stof(argv[i]), - }); - return true; - } - if (arg == "--lora-init-without-apply") { - params.lora_init_without_apply = true; - return true; - } - if (arg == "--control-vector") { - CHECK_ARG - params.control_vectors.push_back({ 1.0f, argv[i], }); - return true; - } - if (arg == "--control-vector-scaled") { - CHECK_ARG - const char* fname = argv[i]; - CHECK_ARG - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - return true; - } - if (arg == "--control-vector-layer-range") { - CHECK_ARG - params.control_vector_layer_start = std::stoi(argv[i]); - CHECK_ARG - params.control_vector_layer_end = std::stoi(argv[i]); - return true; - } - if (arg == "--mmproj") { - CHECK_ARG - params.mmproj = argv[i]; - return true; - } - if (arg == "--image") { - CHECK_ARG - params.image.emplace_back(argv[i]); - return true; - } - if (arg == "-i" || arg == 
"--interactive") { - params.interactive = true; - return true; - } - if (arg == "-sp" || arg == "--special") { - params.special = true; - return true; - } - if (arg == "--embedding" || arg == "--embeddings") { - params.embedding = true; - return true; - } - if (arg == "--embd-normalize") { - CHECK_ARG - params.embd_normalize = std::stoi(argv[i]); - return true; - } - if (arg == "--embd-output-format") { - CHECK_ARG - params.embd_out = argv[i]; - return true; - } - if (arg == "--embd-separator") { - CHECK_ARG - params.embd_sep = argv[i]; - return true; - } - if (arg == "-if" || arg == "--interactive-first") { - params.interactive_first = true; - return true; - } - if (arg == "-cnv" || arg == "--conversation") { - params.conversation = true; - return true; - } - if (arg == "--infill") { - params.infill = true; - return true; - } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - params.dump_kv_cache = true; - return true; - } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - return true; - } - if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - return true; - } - if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - return true; - } - if (arg == "-mli" || arg == "--multiline-input") { - params.multiline_input = true; - return true; - } - if (arg == "--simple-io") { - params.simple_io = true; - return true; - } - if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - return true; - } - if (arg == "-nocb" || arg == "--no-cont-batching") { - params.cont_batching = false; - return true; - } - if (arg == "-fa" || arg == "--flash-attn") { - params.flash_attn = true; - return true; - } - if (arg == "-co" || arg == "--color") { - params.use_color = true; - return true; - } - if (arg == "--mlock") { - params.use_mlock = true; - return true; - } - if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - CHECK_ARG - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - return true; - } - if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") { - CHECK_ARG - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - return true; - } - if (arg == "--main-gpu" || arg == "-mg") { - CHECK_ARG - params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--split-mode" || arg == "-sm") { - CHECK_ARG - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } - else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. 
It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { - invalid_param = true; - return true; - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--tensor-split" || arg == "-ts") { - CHECK_ARG - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - return true; - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } - else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } -#ifdef GGML_USE_RPC - if (arg == "--rpc") { - CHECK_ARG - params.rpc_servers = argv[i]; - return true; - } -#endif - if (arg == "--no-mmap") { - params.use_mmap = false; - return true; - } - if (arg == "--numa") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; } - return true; - } - if (arg == "-v" || arg == "--verbose") { - params.verbosity = 1; - return true; - } - if (arg == "--verbosity") { - CHECK_ARG - params.verbosity = std::stoi(argv[i]); - return true; - } - if (arg == "--verbose-prompt") { - params.verbose_prompt = true; - return true; - } - if (arg == "--no-display-prompt") { - params.display_prompt = false; - return true; - } - if (arg == "-r" || arg == "--reverse-prompt") { - CHECK_ARG - params.antiprompt.emplace_back(argv[i]); - return true; - } - if (arg == "-ld" || arg == "--logdir") { - CHECK_ARG - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - return true; - } - if (arg == "-lcs" || arg == "--lookup-cache-static") { - CHECK_ARG - params.lookup_cache_static = argv[i]; - return true; - } - if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - CHECK_ARG - params.lookup_cache_dynamic = argv[i]; - return true; - } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - CHECK_ARG - params.logits_file = argv[i]; - return true; - } - if (arg == "--perplexity" || arg == "--all-logits") { - params.logits_all = true; - return true; - } - if (arg == "--ppl-stride") { - CHECK_ARG - params.ppl_stride = std::stoi(argv[i]); - return true; - } - if (arg == "--ppl-output-type") { - CHECK_ARG - params.ppl_output_type = std::stoi(argv[i]); - return true; - } - if (arg == "-ptc" || arg == "--print-token-count") { - CHECK_ARG - params.n_print = std::stoi(argv[i]); - return true; - } - if (arg == "--check-tensors") { - params.check_tensors = true; - return true; - } - if (arg == "--hellaswag") { - params.hellaswag = true; - return true; - } - if (arg == "--hellaswag-tasks") { - CHECK_ARG - params.hellaswag_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--winogrande") { - 
params.winogrande = true; - return true; - } - if (arg == "--winogrande-tasks") { - CHECK_ARG - params.winogrande_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--multiple-choice") { - params.multiple_choice = true; - return true; - } - if (arg == "--multiple-choice-tasks") { - CHECK_ARG - params.multiple_choice_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--kl-divergence") { - params.kl_divergence = true; - return true; - } - if (arg == "--ignore-eos") { - params.ignore_eos = true; - return true; - } - if (arg == "--penalize-nl") { - sparams.penalize_nl = true; - return true; - } - if (arg == "-l" || arg == "--logit-bias") { - CHECK_ARG - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - } - else { - throw std::exception(); - } - } - catch (const std::exception&) { - invalid_param = true; - return true; - } - return true; - } - if (arg == "-h" || arg == "--help" || arg == "--usage" ) { - params.usage = true; - return true; - } - if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--in-prefix-bos") { - params.input_prefix_bos = true; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-prefix") { - CHECK_ARG - params.input_prefix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-suffix") { - CHECK_ARG - params.input_suffix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--spm-infill") { - params.spm_infill = true; - return true; - } - if (arg == "--grammar") { - CHECK_ARG - sparams.grammar = argv[i]; - return true; - } - if (arg == "--grammar-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - return true; - } - if (arg == "-j" || arg == "--json-schema") { - CHECK_ARG - sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); - return true; - } - if (arg == "--override-kv") { - CHECK_ARG - if (!string_parse_kv_override(argv[i], params.kv_overrides)) { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - return true; - } - if (arg == "--host") { - CHECK_ARG - params.hostname = argv[i]; - return true; - } - if (arg == "--port") { - CHECK_ARG - params.port = std::stoi(argv[i]); - return true; - } - if (arg == "--path") { - CHECK_ARG - params.public_path = argv[i]; - return true; - } - if (arg == "--api-key") { - CHECK_ARG - params.api_keys.push_back(argv[i]); - return true; - } - if (arg == "--api-key-file") { - CHECK_ARG - std::ifstream key_file(argv[i]); - if (!key_file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); - } - } - key_file.close(); - return true; - } - if (arg == "--ssl-key-file") { - CHECK_ARG - params.ssl_file_key = argv[i]; - return true; - } - if (arg == "--ssl-cert-file") { - 
CHECK_ARG - params.ssl_file_cert = argv[i]; - return true; - } - if (arg == "--timeout" || arg == "-to") { - CHECK_ARG - params.timeout_read = std::stoi(argv[i]); - params.timeout_write = std::stoi(argv[i]); - return true; - } - if (arg == "--threads-http") { - CHECK_ARG - params.n_threads_http = std::stoi(argv[i]); - return true; - } - if (arg == "-spf" || arg == "--system-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - return true; - } - if (arg == "--log-format") { - CHECK_ARG - if (std::strcmp(argv[i], "json") == 0) { - params.log_json = true; - } else if (std::strcmp(argv[i], "text") == 0) { - params.log_json = false; - } else { - invalid_param = true; - return true; - } - return true; - } - if (arg == "--no-slots") { - params.endpoint_slots = false; - return true; - } - if (arg == "--metrics") { - params.endpoint_metrics = true; - return true; - } - if (arg == "--slot-save-path") { - CHECK_ARG - params.slot_save_path = argv[i]; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; - } - return true; - } - if (arg == "--chat-template") { - CHECK_ARG - if (!llama_chat_verify_template(argv[i])) { - fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); - fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); - invalid_param = true; - return true; - } - params.chat_template = argv[i]; - return true; - } - if (arg == "--slot-prompt-similarity" || arg == "-sps") { - CHECK_ARG - params.slot_prompt_similarity = std::stof(argv[i]); - return true; - } - if (arg == "-pps") { - params.is_pp_shared = true; - return true; - } - if (arg == "-npp") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - return true; - } - if (arg == "-ntg") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - return true; - } - if (arg == "-npl") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - return true; - } - if (arg == "--context-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - params.context_files.push_back(argv[i]); - return true; - } - if (arg == "--chunk-size") { - CHECK_ARG - params.chunk_size = std::stoi(argv[i]); - return true; - } - if (arg == "--chunk-separator") { - CHECK_ARG - params.chunk_separator = argv[i]; - return true; - } - if (arg == "--junk") { - CHECK_ARG - params.n_junk = std::stoi(argv[i]); - return true; - } - if (arg == "--pos") { - CHECK_ARG - params.i_pos = std::stoi(argv[i]); - return true; - } - if (arg == "-o" || arg == "--output" || arg == "--output-file") { - CHECK_ARG - params.out_file = argv[i]; - params.cvector_outfile = argv[i]; - params.lora_outfile = argv[i]; - return true; - } - if (arg == "-ofreq" || arg == "--output-frequency") { - CHECK_ARG - 
params.n_out_freq = std::stoi(argv[i]); - return true; - } - if (arg == "--save-frequency") { - CHECK_ARG - params.n_save_freq = std::stoi(argv[i]); - return true; - } - if (arg == "--process-output") { - params.process_output = true; - return true; - } - if (arg == "--no-ppl") { - params.compute_ppl = false; - return true; - } - if (arg == "--chunk" || arg == "--from-chunk") { - CHECK_ARG - params.i_chunk = std::stoi(argv[i]); - return true; - } - // cvector params - if (arg == "--positive-file") { - CHECK_ARG - params.cvector_positive_file = argv[i]; - return true; - } - if (arg == "--negative-file") { - CHECK_ARG - params.cvector_negative_file = argv[i]; - return true; - } - if (arg == "--pca-batch") { - CHECK_ARG - params.n_pca_batch = std::stoi(argv[i]); - return true; - } - if (arg == "--pca-iter") { - CHECK_ARG - params.n_pca_iterations = std::stoi(argv[i]); - return true; - } - if (arg == "--method") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { invalid_param = true; } - return true; - } - if (arg == "--output-format") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { invalid_param = true; } - return true; - } - if (arg == "--no-warmup") { - params.warmup = false; - return true; - } -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - if (log_param_single_parse(argv[i])) { - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - return true; - } - if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - CHECK_ARG - if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { - invalid_param = true; - return true; - } - return true; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - - return false; -} - -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const llama_sampling_params & sparams = params.sparams; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto sampler_type : sparams.samplers_sequence) { - sampler_type_chars += static_cast(sampler_type); - sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; - } - sampler_type_names.pop_back(); - - struct option_info { - LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) - option_info(const std::string & tags, const char * args, const char * desc, ...) 
: tags(tags), args(args), desc(desc) { - va_list args_list; - va_start(args_list, desc); - char buffer[1024]; - vsnprintf(buffer, sizeof(buffer), desc, args_list); - va_end(args_list); - this->desc = buffer; - } - - option_info(const std::string & grp) : grp(grp) {} - - std::string tags; - std::string args; - std::string desc; - std::string grp; - }; - - std::vector options; - - // TODO: filter by tags - - options.push_back({ "general" }); - options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); - options.push_back({ "*", " --version", "show version and build info" }); - options.push_back({ "*", "-v, --verbose", "print verbose information" }); - options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); - options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); - options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); - options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); - options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); - options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); - options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); - options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); - -#ifndef GGML_USE_OPENMP - // these options are available only with the internal threadpool - options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); - options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); - options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); - - options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); - options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); - options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"}); - options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); - options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); - - options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. 
Complements cpu-range-draft (default: same as --cpu-mask)"}); - options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); - options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"}); - options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); - options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); - - options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); - options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); - options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); - options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"}); -#endif // GGML_USE_OPENMP - - options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); - options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); - options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)" }); - options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); - - options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); - options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); - options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); - options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); - options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); - options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); - options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? 
"enabled" : "disabled" }); - options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" - "in conversation mode, this will be used as system prompt\n" - "(default: '%s')", params.prompt.c_str() }); - options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); - options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); - options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); - options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); - options.push_back({ "*", " --no-escape", "do not process escape sequences" }); - options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); - options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); - options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" - "not supported with --interactive or other interactive options" }); - options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); - options.push_back({ "main", "-r, --reverse-prompt PROMPT", - "halt generation at PROMPT, return control in interactive mode\n" - "can be specified more than once for multiple prompts" }); - options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" - "if suffix/prefix are not specified, default chat template will be used\n" - "(default: %s)", params.conversation ? "true" : "false" }); - options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); - options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); - options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); - options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); - options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); - options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); - options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); - options.push_back({ "server infill", - " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" }); - - options.push_back({ "sampling" }); - options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" - "(default: %s)", sampler_type_names.c_str() }); - options.push_back({ "*", " --sampling-seq SEQUENCE", - "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); - options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); - options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" }); - options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); - options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); - options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); - options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); - options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); - options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); - options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); - options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); - options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); - options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); - options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); - options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); - options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" - "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); - options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); - options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); - options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" - "i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); - options.push_back({ "main", " --cfg-negative-prompt PROMPT", - "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); - options.push_back({ "main", " --cfg-negative-prompt-file FNAME", - "negative prompt file to use for guidance" }); - options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - options.push_back({ "main", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "grammar" }); - options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); - options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); - options.push_back({ "*", "-j, --json-schema SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n" - "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --pooling {none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified" }); - options.push_back({ "embedding", " --attention {causal,non-causal}", - "attention type for embeddings, use model default if unspecified" }); - - options.push_back({ "context hacking" }); - options.push_back({ "*", " --rope-scaling {none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model" }); - options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); - options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); - options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); - options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); - options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); - options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); - options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); - options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); - options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); - options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); - options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); - options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); - options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV 
cache data type for K (default: %s)", params.cache_type_k.c_str() }); - options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); - - options.push_back({ "perplexity" }); - options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); - options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); - options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); - options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --multiple-choice-tasks N", - "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); - options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); - options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); - options.push_back({ "perplexity", " --ppl-output-type {0,1}", - "output type for perplexity calculation (default: %d)", params.ppl_output_type }); - - options.push_back({ "parallel" }); - options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); - options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); - options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); - options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); - options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); - - options.push_back({ "multi-modality" }); - options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); - options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching" }); - - options.push_back({ "backend" }); -#ifdef GGML_USE_RPC - options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); -#endif - - if (llama_supports_mlock()) { - options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); - } - if (llama_supports_mmap()) { - options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); - } - options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" - " - distribute: spread execution evenly over all nodes\n" - " - isolate: only spawn threads on CPUs on the node that execution started on\n" - " - numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437" }); - - if (llama_supports_gpu_offload()) { - options.push_back({ "*", "-ngl, --gpu-layers N", - "number of layers to store in VRAM" }); - options.push_back({ "*", "-ngld, --gpu-layers-draft N", - "number of layers to store in VRAM for the draft model" }); - options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", - "how to split the model across multiple GPUs, one of:\n" - " - none: use one GPU only\n" - " - layer (default): split layers and KV across GPUs\n" - " - row: split rows across GPUs" }); - options.push_back({ "*", "-ts, --tensor-split SPLIT", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); - options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" - "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); - } - - options.push_back({ "model" }); - options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); - options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" - "note: this argument can be repeated to add multiple control vectors" }); - options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors" }); - options.push_back({ "*", " --control-vector-layer-range START END", - "layer range to apply the control vector(s) to, start and end inclusive" }); - options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" - "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); - options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); - options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); - options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); - options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); - options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); - - options.push_back({ "retrieval" }); - options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); - options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); - options.push_back({ "retrieval", " --chunk-separator STRING", - "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); - - options.push_back({ "passkey" }); - options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); - options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); - - options.push_back({ "imatrix" }); - options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); - options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); - options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); - options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); - options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); - options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); - - options.push_back({ "bench" }); - options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false" }); - options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); - options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); - options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); - options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); - options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); - - options.push_back({ "server" }); - options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); - options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); - options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); - options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); - options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); - options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); - options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); - options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); - options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); - options.push_back({ "server", " --system-prompt-file FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); - options.push_back({ "server", " --log-format {text,json}", - "log output format: json or text (default: json)" }); - options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); - options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" }); - options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); - options.push_back({ "server", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", - "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); - options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"}); - -#ifndef LOG_DISABLE_LOGS - options.push_back({ "logging" }); - options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); - options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); - options.push_back({ "logging", " --log-test", "Run simple logging test" }); - options.push_back({ "logging", " --log-disable", "Disable trace logs" }); - options.push_back({ "logging", " --log-enable", "Enable trace logs" }); - options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); - options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " - "Each log file will have unique name: \"..log\"" }); - options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); -#endif // LOG_DISABLE_LOGS - - options.push_back({ "cvector" }); - options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); - options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); - options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); - options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); - - options.push_back({ "export-lora" }); - options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); - options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); - - options.push_back({ "batched-bench" }); - options.push_back({ "batched-bench", " --output-format {md,jsonl}", "output format for batched-bench results (default: md)" }); - - printf("usage: %s [options]\n", argv[0]); - - for (const auto & o : options) { - if (!o.grp.empty()) { - printf("\n%s:\n\n", o.grp.c_str()); - continue; - } - printf(" %-32s", o.args.c_str()); - if (o.args.length() > 30) { - printf("\n%34s", ""); - } - - const auto desc = o.desc; - size_t start = 0; - size_t end = desc.find('\n'); - while (end != std::string::npos) { - printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); - start = end + 1; - end = desc.find('\n', start); - } - - printf("%s\n", desc.substr(start).c_str()); - } - printf("\n"); -} - std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; @@ -2528,8 +792,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_lora_adapters_apply(lctx, iparams.lora_adapters); } - if (params.ignore_eos) { - params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + if (params.sparams.ignore_eos && llama_token_eos(model) == -1) { + 
fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); + params.sparams.ignore_eos = false; } if (params.warmup) { @@ -2539,10 +804,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_token bos = llama_token_bos(model); llama_token eos = llama_token_eos(model); // some models (e.g. T5) don't have a BOS token - if (bos != -1) { + if (bos != LLAMA_TOKEN_NULL) { tmp.push_back(bos); } - tmp.push_back(eos); + if (eos != LLAMA_TOKEN_NULL) { + tmp.push_back(eos); + } + if (tmp.empty()) { + tmp.push_back(0); + } if (llama_model_has_encoder(model)) { llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0)); @@ -2558,7 +828,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_reset_timings(lctx); + llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT); } iparams.model = model; @@ -2637,7 +907,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_threads = params.cpuparams.n_threads; cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? params.cpuparams.n_threads : params.cpuparams_batch.n_threads; - cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; @@ -3523,7 +1792,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { - const llama_sampling_params & sparams = params.sparams; + const auto & sparams = params.sparams; fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); @@ -3574,8 +1843,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); - fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); @@ -3586,10 +1853,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); - const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; - fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); + fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false"); yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? 
"true" : "false"); @@ -3600,11 +1864,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); fprintf(stream, "logit_bias:\n"); - for (std::pair lb : sparams.logit_bias) { - if (ignore_eos && lb.first == logit_bias_eos->first) { - continue; - } - fprintf(stream, " %d: %f", lb.first, lb.second); + for (const auto & logit_bias : sparams.logit_bias) { + fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias); } fprintf(stream, "lora:\n"); @@ -3657,7 +1918,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); @@ -3671,7 +1931,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); + fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); } diff --git a/common/common.h b/common/common.h index 795ff4405..23babdd09 100644 --- a/common/common.h +++ b/common/common.h @@ -4,18 +4,11 @@ #include "llama.h" -#include "sampling.h" - #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" -#include #include #include -#include -#include -#include -#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -54,19 +47,6 @@ struct llama_control_vector_load_info; // CPU utils // -int32_t cpu_get_num_physical_cores(); -int32_t cpu_get_num_math(); - -// -// CLI argument parsing -// - -// dimensionality reduction methods, used by cvector-generator -enum dimre_method { - DIMRE_METHOD_PCA, - DIMRE_METHOD_MEAN, -}; - struct cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
@@ -76,9 +56,93 @@ struct cpu_params { uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; -struct gpt_params { - uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed +int32_t cpu_get_num_physical_cores(); +int32_t cpu_get_num_math(); +// +// Common params +// + +enum llama_example { + LLAMA_EXAMPLE_COMMON, + LLAMA_EXAMPLE_SPECULATIVE, + LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_INFILL, + LLAMA_EXAMPLE_EMBEDDING, + LLAMA_EXAMPLE_PERPLEXITY, + LLAMA_EXAMPLE_RETRIEVAL, + LLAMA_EXAMPLE_PASSKEY, + LLAMA_EXAMPLE_IMATRIX, + LLAMA_EXAMPLE_BENCH, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CVECTOR_GENERATOR, + LLAMA_EXAMPLE_EXPORT_LORA, + LLAMA_EXAMPLE_LLAVA, + LLAMA_EXAMPLE_LOOKUP, + LLAMA_EXAMPLE_PARALLEL, + + LLAMA_EXAMPLE_COUNT, +}; + +enum gpt_sampler_type { + GPT_SAMPLER_TYPE_NONE = 0, + GPT_SAMPLER_TYPE_TOP_K = 1, + GPT_SAMPLER_TYPE_TOP_P = 2, + GPT_SAMPLER_TYPE_MIN_P = 3, + GPT_SAMPLER_TYPE_TFS_Z = 4, + GPT_SAMPLER_TYPE_TYPICAL_P = 5, + GPT_SAMPLER_TYPE_TEMPERATURE = 6, +}; + +// dimensionality reduction methods, used by cvector-generator +enum dimre_method { + DIMRE_METHOD_PCA, + DIMRE_METHOD_MEAN, +}; + +// sampler parameters +struct gpt_sampler_params { + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler + + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + bool ignore_eos = false; + + std::vector samplers = { + GPT_SAMPLER_TYPE_TOP_K, + GPT_SAMPLER_TYPE_TFS_Z, + GPT_SAMPLER_TYPE_TYPICAL_P, + GPT_SAMPLER_TYPE_TOP_P, + GPT_SAMPLER_TYPE_MIN_P, + GPT_SAMPLER_TYPE_TEMPERATURE + }; + + std::string grammar; // optional BNF-like grammar to constrain sampling + + std::vector logit_bias; // logit biases to apply + + // print the parameters into a string + std::string print() const; +}; + +struct gpt_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -120,26 +184,25 @@ struct gpt_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - // // sampling parameters - struct llama_sampling_params sparams; + struct gpt_sampler_params sparams; - std::string model = ""; // model path - std::string model_draft = 
""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_token = ""; // HF token - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::string logdir = ""; // directory in which to save YAML log files - std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding - std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits - std::string rpc_servers = ""; // comma separated list of RPC servers + std::string model = ""; // model path // NOLINT + std::string model_draft = ""; // draft model for speculative decoding // NOLINT + std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_url = ""; // model url to download // NOLINT + std::string hf_token = ""; // HF token // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + std::string prompt = ""; // NOLINT + std::string prompt_file = ""; // store the external prompt file name // NOLINT + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT + std::string input_prefix = ""; // string to prefix user inputs with // NOLINT + std::string input_suffix = ""; // string to suffix user inputs with // NOLINT + std::string logdir = ""; // directory in which to save YAML log files // NOLINT + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT + std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) @@ -185,13 +248,11 @@ struct gpt_params { bool flash_attn = false; // flash attention bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool ignore_eos = false; // ignore generated EOS tokens bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation - bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run @@ -201,7 +262,7 @@ struct gpt_params { std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector + std::string mmproj = ""; // path to multimodal projector // NOLINT std::vector image; // path to image file(s) // embedding @@ -217,15 +278,15 @@ struct gpt_params { int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; - std::string public_path = ""; - std::string chat_template = ""; - std::string system_prompt = ""; + std::string public_path = ""; // NOLINT + std::string chat_template = ""; // NOLINT + std::string system_prompt = ""; // NOLINT bool enable_chat_template = true; std::vector api_keys; - std::string ssl_file_key = ""; - std::string ssl_file_cert = ""; + std::string ssl_file_key = ""; // NOLINT + std::string ssl_file_cert = ""; // NOLINT bool endpoint_slots = true; bool endpoint_metrics = false; @@ -280,14 +341,6 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; -void gpt_params_parse_from_env(gpt_params & params); -void gpt_params_handle_model_default(gpt_params & params); - -bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); -bool gpt_params_parse (int argc, char ** argv, gpt_params & params); -bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); -void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp deleted file mode 100644 index 438452eab..000000000 --- a/common/grammar-parser.cpp +++ /dev/null @@ -1,539 +0,0 @@ -#include "grammar-parser.h" -#include -#include -#include -#include -#include -#include - -namespace grammar_parser { - // NOTE: assumes valid utf8 (but checks for overrun) - // copied from llama.cpp - static std::pair decode_utf8(const char * src) { - static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; - uint8_t first_byte = static_cast(*src); - uint8_t highbits = first_byte >> 4; - int len = lookup[highbits]; - uint8_t mask = (1 << (8 - len)) - 1; - uint32_t value = first_byte & mask; - const char * end = src + len; // may overrun! 
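// note on the deleted decode_utf8 helper above: the `lookup` table maps the
// high nibble of the lead byte to the sequence length -- nibbles 0x0..0xB give
// 1 byte (ASCII, or a stray continuation byte treated as length 1), 0xC..0xD
// give 2, 0xE gives 3 and 0xF gives 4. `mask` keeps the payload bits of the
// lead byte (for multi-byte leads it keeps one extra bit that is always zero
// in well-formed UTF-8), and the loop below folds in the low 6 bits of each
// continuation byte.
// worked example: U+20AC (the Euro sign) is E2 82 AC; highbits 0xE -> len 3,
// E2 & 0x1F = 0x02, then (0x02 << 6 | 0x02) << 6 | 0x2C = 0x20AC.
// the `*pos` check in the loop stops at a NUL terminator, which is what makes
// the "may overrun" end pointer safe in practice.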
- const char * pos = src + 1; - for ( ; pos < end && *pos; pos++) { - value = (value << 6) + (static_cast(*pos) & 0x3F); - } - return std::make_pair(value, pos); - } - - static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { - uint32_t next_id = static_cast(state.symbol_ids.size()); - auto result = state.symbol_ids.emplace(std::string(src, len), next_id); - return result.first->second; - } - - static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { - uint32_t next_id = static_cast(state.symbol_ids.size()); - state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; - return next_id; - } - - static void add_rule( - parse_state & state, - uint32_t rule_id, - const std::vector & rule) { - if (state.rules.size() <= rule_id) { - state.rules.resize(rule_id + 1); - } - state.rules[rule_id] = rule; - } - - static bool is_digit_char(char c) { - return '0' <= c && c <= '9'; - } - - static bool is_word_char(char c) { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c); - } - - static std::pair parse_hex(const char * src, int size) { - const char * pos = src; - const char * end = src + size; - uint32_t value = 0; - for ( ; pos < end && *pos; pos++) { - value <<= 4; - char c = *pos; - if ('a' <= c && c <= 'f') { - value += c - 'a' + 10; - } else if ('A' <= c && c <= 'F') { - value += c - 'A' + 10; - } else if ('0' <= c && c <= '9') { - value += c - '0'; - } else { - break; - } - } - if (pos != end) { - throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); - } - return std::make_pair(value, pos); - } - - static const char * parse_space(const char * src, bool newline_ok) { - const char * pos = src; - while (*pos == ' ' || *pos == '\t' || *pos == '#' || - (newline_ok && (*pos == '\r' || *pos == '\n'))) { - if (*pos == '#') { - while (*pos && *pos != '\r' && *pos != '\n') { - pos++; - } - } else { - pos++; - } - } - return pos; - } - - static const char * parse_name(const char * src) { - const char * pos = src; - while (is_word_char(*pos)) { - pos++; - } - if (pos == src) { - throw std::runtime_error(std::string("expecting name at ") + src); - } - return pos; - } - - static const char * parse_int(const char * src) { - const char * pos = src; - while (is_digit_char(*pos)) { - pos++; - } - if (pos == src) { - throw std::runtime_error(std::string("expecting integer at ") + src); - } - return pos; - } - - static std::pair parse_char(const char * src) { - if (*src == '\\') { - switch (src[1]) { - case 'x': return parse_hex(src + 2, 2); - case 'u': return parse_hex(src + 2, 4); - case 'U': return parse_hex(src + 2, 8); - case 't': return std::make_pair('\t', src + 2); - case 'r': return std::make_pair('\r', src + 2); - case 'n': return std::make_pair('\n', src + 2); - case '\\': - case '"': - case '[': - case ']': - return std::make_pair(src[1], src + 2); - default: - throw std::runtime_error(std::string("unknown escape at ") + src); - } - } else if (*src) { - return decode_utf8(src); - } - throw std::runtime_error("unexpected end of input"); - } - - const char * parse_alternates( - parse_state & state, - const char * src, - const std::string & rule_name, - uint32_t rule_id, - bool is_nested); - - static const char * parse_sequence( - parse_state & state, - const char * src, - const std::string & rule_name, - std::vector & out_elements, - bool is_nested) { - size_t last_sym_start = out_elements.size(); - const char * pos = src; - - auto handle_repetitions = [&](int 
min_times, int max_times) { - - if (last_sym_start == out_elements.size()) { - throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); - } - - // apply transformation to previous symbol (last_sym_start to end) according to - // the following rewrite rules: - // S{m,n} --> S S S (m times) S'(n-m) - // S'(x) ::= S S'(x-1) | - // (... n-m definitions of these S' rules ...) - // S'(1) ::= S | - // S{m,} --> S S S (m times) S' - // S' ::= S S' | - // S* --> S{0,} - // --> S' ::= S S' | - // S+ --> S{1,} - // --> S S' - // S' ::= S S' | - // S? --> S{0,1} - // --> S' - // S' ::= S | - - std::vector previous_elements(out_elements.begin() + last_sym_start, out_elements.end()); - if (min_times == 0) { - out_elements.resize(last_sym_start); - } else { - // Repeat the previous elements (min_times - 1) times - for (int i = 1; i < min_times; i++) { - out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end()); - } - } - - uint32_t last_rec_rule_id = 0; - auto n_opt = max_times < 0 ? 1 : max_times - min_times; - - std::vector rec_rule(previous_elements); - for (int i = 0; i < n_opt; i++) { - rec_rule.resize(previous_elements.size()); - uint32_t rec_rule_id = generate_symbol_id(state, rule_name); - if (i > 0 || max_times < 0) { - rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); - } - rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); - rec_rule.push_back({LLAMA_GRETYPE_END, 0}); - add_rule(state, rec_rule_id, rec_rule); - last_rec_rule_id = rec_rule_id; - } - if (n_opt > 0) { - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); - } - }; - - while (*pos) { - if (*pos == '"') { // literal string - pos++; - last_sym_start = out_elements.size(); - while (*pos != '"') { - if (!*pos) { - throw std::runtime_error("unexpected end of input"); - } - auto char_pair = parse_char(pos); - pos = char_pair.second; - out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); - } - pos = parse_space(pos + 1, is_nested); - } else if (*pos == '[') { // char range(s) - pos++; - enum llama_gretype start_type = LLAMA_GRETYPE_CHAR; - if (*pos == '^') { - pos++; - start_type = LLAMA_GRETYPE_CHAR_NOT; - } - last_sym_start = out_elements.size(); - while (*pos != ']') { - if (!*pos) { - throw std::runtime_error("unexpected end of input"); - } - auto char_pair = parse_char(pos); - pos = char_pair.second; - enum llama_gretype type = last_sym_start < out_elements.size() - ? 
LLAMA_GRETYPE_CHAR_ALT - : start_type; - - out_elements.push_back({type, char_pair.first}); - if (pos[0] == '-' && pos[1] != ']') { - if (!pos[1]) { - throw std::runtime_error("unexpected end of input"); - } - auto endchar_pair = parse_char(pos + 1); - pos = endchar_pair.second; - out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); - } - } - pos = parse_space(pos + 1, is_nested); - } else if (is_word_char(*pos)) { // rule reference - const char * name_end = parse_name(pos); - uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); - pos = parse_space(name_end, is_nested); - last_sym_start = out_elements.size(); - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); - } else if (*pos == '(') { // grouping - // parse nested alternates into synthesized rule - pos = parse_space(pos + 1, true); - uint32_t sub_rule_id = generate_symbol_id(state, rule_name); - pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); - last_sym_start = out_elements.size(); - // output reference to synthesized rule - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); - if (*pos != ')') { - throw std::runtime_error(std::string("expecting ')' at ") + pos); - } - pos = parse_space(pos + 1, is_nested); - } else if (*pos == '.') { // any char - last_sym_start = out_elements.size(); - out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); - pos = parse_space(pos + 1, is_nested); - } else if (*pos == '*') { - pos = parse_space(pos + 1, is_nested); - handle_repetitions(0, -1); - } else if (*pos == '+') { - pos = parse_space(pos + 1, is_nested); - handle_repetitions(1, -1); - } else if (*pos == '?') { - pos = parse_space(pos + 1, is_nested); - handle_repetitions(0, 1); - } else if (*pos == '{') { - pos = parse_space(pos + 1, is_nested); - - if (!is_digit_char(*pos)) { - throw std::runtime_error(std::string("expecting an int at ") + pos); - } - const char * int_end = parse_int(pos); - int min_times = std::stoul(std::string(pos, int_end - pos)); - pos = parse_space(int_end, is_nested); - - int max_times = -1; - - if (*pos == '}') { - max_times = min_times; - pos = parse_space(pos + 1, is_nested); - } else if (*pos == ',') { - pos = parse_space(pos + 1, is_nested); - - if (is_digit_char(*pos)) { - const char * int_end = parse_int(pos); - max_times = std::stoul(std::string(pos, int_end - pos)); - pos = parse_space(int_end, is_nested); - } - - if (*pos != '}') { - throw std::runtime_error(std::string("expecting '}' at ") + pos); - } - pos = parse_space(pos + 1, is_nested); - } else { - throw std::runtime_error(std::string("expecting ',' at ") + pos); - } - handle_repetitions(min_times, max_times); - } else { - break; - } - } - return pos; - } - - const char * parse_alternates( - parse_state & state, - const char * src, - const std::string & rule_name, - uint32_t rule_id, - bool is_nested) { - std::vector rule; - const char * pos = parse_sequence(state, src, rule_name, rule, is_nested); - while (*pos == '|') { - rule.push_back({LLAMA_GRETYPE_ALT, 0}); - pos = parse_space(pos + 1, true); - pos = parse_sequence(state, pos, rule_name, rule, is_nested); - } - rule.push_back({LLAMA_GRETYPE_END, 0}); - add_rule(state, rule_id, rule); - return pos; - } - - static const char * parse_rule(parse_state & state, const char * src) { - const char * name_end = parse_name(src); - const char * pos = parse_space(name_end, false); - size_t name_len = name_end - src; - uint32_t rule_id = get_symbol_id(state, src, name_len); - const std::string name(src, name_len); - - if (!(pos[0] == ':' 
&& pos[1] == ':' && pos[2] == '=')) { - throw std::runtime_error(std::string("expecting ::= at ") + pos); - } - pos = parse_space(pos + 3, true); - - pos = parse_alternates(state, pos, name, rule_id, false); - - if (*pos == '\r') { - pos += pos[1] == '\n' ? 2 : 1; - } else if (*pos == '\n') { - pos++; - } else if (*pos) { - throw std::runtime_error(std::string("expecting newline or end at ") + pos); - } - return parse_space(pos, true); - } - - parse_state parse(const char * src) { - try { - parse_state state; - const char * pos = parse_space(src, true); - while (*pos) { - pos = parse_rule(state, pos); - } - // Validate the state to ensure that all rules are defined - for (const auto & rule : state.rules) { - if (rule.empty()) { - throw std::runtime_error("Undefined rule"); - } - for (const auto & elem : rule) { - if (elem.type == LLAMA_GRETYPE_RULE_REF) { - // Ensure that the rule at that location exists - if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) { - // Get the name of the rule that is missing - for (const auto & kv : state.symbol_ids) { - if (kv.second == elem.value) { - throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); - } - } - } - } - } - } - return state; - } catch (const std::exception & err) { - fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); - return parse_state(); - } - } - - static void print_grammar_char(FILE * file, uint32_t c) { - if (0x20 <= c && c <= 0x7f) { - fprintf(file, "%c", static_cast(c)); - } else { - // cop out of encoding UTF-8 - fprintf(file, "", c); - } - } - - static bool is_char_element(llama_grammar_element elem) { - switch (elem.type) { - case LLAMA_GRETYPE_CHAR: return true; - case LLAMA_GRETYPE_CHAR_NOT: return true; - case LLAMA_GRETYPE_CHAR_ALT: return true; - case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; - case LLAMA_GRETYPE_CHAR_ANY: return true; - default: return false; - } - } - - static void print_rule_binary(FILE * file, const std::vector & rule) { - for (auto elem : rule) { - switch (elem.type) { - case LLAMA_GRETYPE_END: fprintf(file, "END"); break; - case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break; - case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break; - case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break; - case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break; - case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break; - case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break; - case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break; - } - switch (elem.type) { - case LLAMA_GRETYPE_END: - case LLAMA_GRETYPE_ALT: - case LLAMA_GRETYPE_RULE_REF: - fprintf(file, "(%u) ", elem.value); - break; - case LLAMA_GRETYPE_CHAR: - case LLAMA_GRETYPE_CHAR_NOT: - case LLAMA_GRETYPE_CHAR_RNG_UPPER: - case LLAMA_GRETYPE_CHAR_ALT: - case LLAMA_GRETYPE_CHAR_ANY: - fprintf(file, "(\""); - print_grammar_char(file, elem.value); - fprintf(file, "\") "); - break; - } - } - fprintf(file, "\n"); - } - - static void print_rule( - FILE * file, - uint32_t rule_id, - const std::vector & rule, - const std::map & symbol_id_names) { - if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { - throw std::runtime_error( - "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); - } - fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); - for (size_t i = 0, end = rule.size() - 1; i < end; i++) { - llama_grammar_element elem = rule[i]; - switch (elem.type) { - case LLAMA_GRETYPE_END: - throw 
std::runtime_error( - "unexpected end of rule: " + std::to_string(rule_id) + "," + - std::to_string(i)); - case LLAMA_GRETYPE_ALT: - fprintf(file, "| "); - break; - case LLAMA_GRETYPE_RULE_REF: - fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str()); - break; - case LLAMA_GRETYPE_CHAR: - fprintf(file, "["); - print_grammar_char(file, elem.value); - break; - case LLAMA_GRETYPE_CHAR_NOT: - fprintf(file, "[^"); - print_grammar_char(file, elem.value); - break; - case LLAMA_GRETYPE_CHAR_RNG_UPPER: - if (i == 0 || !is_char_element(rule[i - 1])) { - throw std::runtime_error( - "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + - std::to_string(rule_id) + "," + std::to_string(i)); - } - fprintf(file, "-"); - print_grammar_char(file, elem.value); - break; - case LLAMA_GRETYPE_CHAR_ALT: - if (i == 0 || !is_char_element(rule[i - 1])) { - throw std::runtime_error( - "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + - std::to_string(rule_id) + "," + std::to_string(i)); - } - print_grammar_char(file, elem.value); - break; - case LLAMA_GRETYPE_CHAR_ANY: - fprintf(file, "."); - break; - } - if (is_char_element(elem)) { - switch (rule[i + 1].type) { - case LLAMA_GRETYPE_CHAR_ALT: - case LLAMA_GRETYPE_CHAR_RNG_UPPER: - case LLAMA_GRETYPE_CHAR_ANY: - break; - default: - fprintf(file, "] "); - } - } - } - fprintf(file, "\n"); - } - - void print_grammar(FILE * file, const parse_state & state) { - try { - std::map symbol_id_names; - for (const auto & kv : state.symbol_ids) { - symbol_id_names[kv.second] = kv.first; - } - for (size_t i = 0, end = state.rules.size(); i < end; i++) { - // fprintf(file, "%zu: ", i); - // print_rule_binary(file, state.rules[i]); - print_rule(file, uint32_t(i), state.rules[i], symbol_id_names); - // fprintf(file, "\n"); - } - } catch (const std::exception & err) { - fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what()); - } - } - - std::vector parse_state::c_rules() { - std::vector ret; - ret.reserve(rules.size()); - for (const auto & rule : rules) { - ret.push_back(rule.data()); - } - return ret; - } -} diff --git a/common/grammar-parser.h b/common/grammar-parser.h deleted file mode 100644 index 9037d7272..000000000 --- a/common/grammar-parser.h +++ /dev/null @@ -1,29 +0,0 @@ -// Implements a parser for an extended Backus-Naur form (BNF), producing the -// binary context-free grammar format specified by llama.h. Supports character -// ranges, grouping, and repetition operators. 
As an example, a grammar for -// arithmetic might look like: -// -// root ::= expr -// expr ::= term ([-+*/] term)* -// term ::= num | "(" space expr ")" space -// num ::= [0-9]+ space -// space ::= [ \t\n]* - -#pragma once -#include "llama.h" -#include -#include -#include -#include - -namespace grammar_parser { - struct parse_state { - std::map symbol_ids; - std::vector> rules; - - std::vector c_rules(); - }; - - parse_state parse(const char * src); - void print_grammar(FILE * file, const parse_state & state); -} diff --git a/common/sampling.cpp b/common/sampling.cpp index 079e40516..21b956462 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,460 +1,446 @@ -#define LLAMA_API_INTERNAL #include "sampling.h" -#include -struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { - struct llama_sampling_context * result = new llama_sampling_context(); +#include "common.h" - result->params = params; - result->grammar = nullptr; +#include +#include - // if there is a grammar, parse it - if (!params.grammar.empty()) { - result->parsed_grammar = grammar_parser::parse(params.grammar.c_str()); +// the ring buffer works similarly to std::deque, but with a fixed capacity +// TODO: deduplicate with llama-impl.h +template +struct ring_buffer { + ring_buffer(size_t cap) : capacity(cap), data(cap) {} - // will be empty (default) if there are parse errors - if (result->parsed_grammar.rules.empty()) { - fprintf(stderr, "%s: failed to parse grammar\n", __func__); - delete result; - return nullptr; + T & front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + const T & front() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + T & back() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[pos]; + } + + const T & back() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[pos]; + } + + void push_back(const T & value) { + if (sz == capacity) { + // advance the start when buffer is full + first = (first + 1) % capacity; + } else { + sz++; + } + data[pos] = value; + pos = (pos + 1) % capacity; + } + + T pop_front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + T value = data[first]; + first = (first + 1) % capacity; + sz--; + return value; + } + + const T & rat(size_t i) const { + if (i >= sz) { + throw std::runtime_error("ring buffer: index out of bounds"); + } + return data[(first + sz - i - 1) % capacity]; + } + + std::vector to_vector() const { + std::vector result; + result.reserve(sz); + for (size_t i = 0; i < sz; i++) { + result.push_back(data[(first + i) % capacity]); + } + return result; + } + + void clear() { + // here only reset the status of the buffer + sz = 0; + first = 0; + pos = 0; + } + + bool empty() const { + return sz == 0; + } + + size_t size() const { + return sz; + } + + size_t capacity = 0; + size_t sz = 0; + size_t first = 0; + size_t pos = 0; + std::vector data; +}; + +struct gpt_sampler { + gpt_sampler_params params; + + struct llama_sampler * grmr; + struct llama_sampler * chain; + + ring_buffer prev; + + std::vector cur; + + llama_token_data_array cur_p; + + void set_logits(struct llama_context * ctx, int idx) { + const auto * logits = llama_get_logits_ith(ctx, idx); + + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + cur.resize(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; 
token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; } - // Ensure that there is a "root" node. - if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) { - fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__); - delete result; - return nullptr; - } - - std::vector grammar_rules(result->parsed_grammar.c_rules()); - - struct llama_grammar * grammar = llama_grammar_init( - grammar_rules.data(), - grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root")); - if (grammar == nullptr) { - throw std::runtime_error("Failed to initialize llama_grammar"); - } - result->grammar = grammar; + cur_p = { cur.data(), cur.size(), -1, false }; } +}; - result->prev.resize(params.n_prev); - - result->n_valid = 0; - - llama_sampling_set_rng_seed(result, params.seed); - - return result; -} - -void llama_sampling_free(struct llama_sampling_context * ctx) { - if (ctx->grammar != NULL) { - llama_grammar_free(ctx->grammar); - } - - delete ctx; -} - -void llama_sampling_reset(llama_sampling_context * ctx) { - if (ctx->grammar != NULL) { - llama_grammar_free(ctx->grammar); - ctx->grammar = NULL; - } - - if (!ctx->parsed_grammar.rules.empty()) { - std::vector grammar_rules(ctx->parsed_grammar.c_rules()); - - struct llama_grammar * grammar = llama_grammar_init( - grammar_rules.data(), - grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root")); - if (grammar == nullptr) { - throw std::runtime_error("Failed to initialize llama_grammar"); - } - ctx->grammar = grammar; - } - - std::fill(ctx->prev.begin(), ctx->prev.end(), 0); - ctx->cur.clear(); - ctx->n_valid = 0; -} - -void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - seed = std::random_device{}(); - } - ctx->rng.seed(seed); -} - -void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { - if (dst->grammar) { - llama_grammar_free(dst->grammar); - dst->grammar = nullptr; - } - - if (src->grammar) { - dst->grammar = llama_grammar_copy(src->grammar); - } - - dst->prev = src->prev; -} - -llama_token llama_sampling_last(llama_sampling_context * ctx) { - return ctx->prev.back(); -} - -std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) { - const int size = ctx_sampling->prev.size(); - - n = std::min(n, size); - - std::string result; - - for (int i = size - n; i < size; i++) { - result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]); - } - - return result; -} - -std::string llama_sampling_print(const llama_sampling_params & params) { +std::string gpt_sampler_params::print() const { char result[1024]; snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", - params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present, - params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp, - params.mirostat, params.mirostat_eta, params.mirostat_tau); + penalty_last_n, penalty_repeat, penalty_freq, penalty_present, + top_k, tfs_z, top_p, min_p, typ_p, temp, + mirostat, mirostat_eta, mirostat_tau); return std::string(result); } -std::string llama_sampling_order_print(const llama_sampling_params & params) { - std::string result = "CFG -> 
Penalties "; - if (params.mirostat == 0) { - for (auto sampler_type : params.samplers_sequence) { - const auto sampler_type_name = llama_sampling_type_to_str(sampler_type); - if (!sampler_type_name.empty()) { - result += "-> " + sampler_type_name + " "; +struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { + llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); + + lparams.no_perf = false; // TODO: control via params + + auto * result = new gpt_sampler { + /* .params = */ params, + /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), + /* .chain = */ llama_sampler_chain_init(lparams), + /* .prev = */ ring_buffer(std::max(32, params.n_prev)), + /* .cur = */ {}, + /* .cur_p = */ {}, + }; + + llama_sampler_chain_add(result->chain, + llama_sampler_init_logit_bias( + llama_n_vocab(model), + params.logit_bias.size(), + params.logit_bias.data())); + + llama_sampler_chain_add(result->chain, + llama_sampler_init_penalties( + llama_n_vocab (model), + llama_token_eos(model), + llama_token_nl (model), + params.penalty_last_n, + params.penalty_repeat, + params.penalty_freq, + params.penalty_present, + params.penalize_nl, + params.ignore_eos)); + + if (params.temp > 0.0f) { + if (params.mirostat == 0) { + for (const auto & cnstr : params.samplers) { + switch (cnstr) { + case GPT_SAMPLER_TYPE_TOP_K: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + break; + case GPT_SAMPLER_TYPE_TOP_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + break; + case GPT_SAMPLER_TYPE_MIN_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + break; + case GPT_SAMPLER_TYPE_TFS_Z: + llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); + break; + case GPT_SAMPLER_TYPE_TYPICAL_P: + llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + break; + case GPT_SAMPLER_TYPE_TEMPERATURE: + llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + break; + default: + GGML_ASSERT(false && "unknown sampler type"); + } } + llama_sampler_chain_add(result->chain, llama_sampler_init_softmax()); + llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + } else if (params.mirostat == 1) { + llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + } else if (params.mirostat == 2) { + llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); + } else { + GGML_ASSERT(false && "unknown mirostat version"); } } else { - result += "-> mirostat "; + llama_sampler_chain_add(result->chain, llama_sampler_init_softmax()); + llama_sampler_chain_add(result->chain, llama_sampler_init_greedy()); } return result; } -std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) { - switch (sampler_type) { - case llama_sampler_type::TOP_K: return "top_k"; - case llama_sampler_type::TFS_Z: return "tfs_z"; - case llama_sampler_type::TYPICAL_P: return "typical_p"; - case llama_sampler_type::TOP_P: 
return "top_p"; - case llama_sampler_type::MIN_P: return "min_p"; - case llama_sampler_type::TEMPERATURE: return "temperature"; +void gpt_sampler_free(struct gpt_sampler * gsmpl) { + if (gsmpl) { + llama_sampler_free(gsmpl->grmr); + + llama_sampler_free(gsmpl->chain); + + delete gsmpl; + } +} + +void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) { + if (accept_grammar) { + llama_sampler_accept(gsmpl->grmr, token); + } + + llama_sampler_accept(gsmpl->chain, token); + + gsmpl->prev.push_back(token); +} + +void gpt_sampler_reset(struct gpt_sampler * gsmpl) { + llama_sampler_reset(gsmpl->grmr); + + llama_sampler_reset(gsmpl->chain); +} + +struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { + return new gpt_sampler { + /* .params = */ gsmpl->params, + /* .grmr = */ llama_sampler_clone(gsmpl->grmr), + /* .chain = */ llama_sampler_clone(gsmpl->chain), + /* .prev = */ gsmpl->prev, + /* .cur = */ gsmpl->cur, + /* .cur_p = */ gsmpl->cur_p, + }; +} + +void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) { + // TODO: measure grammar performance + + if (gsmpl) { + llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + } + if (ctx) { + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + } +} + +llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { + gsmpl->set_logits(ctx, idx); + + auto & grmr = gsmpl->grmr; + auto & chain = gsmpl->chain; + auto & cur_p = gsmpl->cur_p; // initialized by set_logits + + if (grammar_first) { + llama_sampler_apply(grmr, &cur_p); + } + + llama_sampler_apply(chain, &cur_p); + + GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); + + const llama_token id = cur_p.data[cur_p.selected].id; + + if (grammar_first) { + return id; + } + + // check if it the sampled token fits the grammar + { + llama_token_data single_token_data = { id, 1.0f, 0.0f }; + llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; + + llama_sampler_apply(grmr, &single_token_data_array); + + const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; + if (is_valid) { + return id; + } + } + + // resampling: + // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain + gsmpl->set_logits(ctx, idx); + + llama_sampler_apply(grmr, &cur_p); + llama_sampler_apply(chain, &cur_p); + + GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); + + return cur_p.data[cur_p.selected].id; +} + +// helpers + +llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { + return &gsmpl->cur_p; +} + +llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { + return gsmpl->prev.rat(0); +} + +std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { + std::string result = "\tlogits "; + + for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { + const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); + result += std::string("-> ") + llama_sampler_name(smpl) + " "; + } + + return result; +} + +std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) { + n = std::min(n, (int) gsmpl->prev.size()); + + if (n <= 0) { + return ""; + } + + std::string result; + result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab + + for (int i = n - 1; i 
>= 0; i--) { + const llama_token id = gsmpl->prev.rat(i); + + GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); + + result += llama_token_to_piece(ctx_main, id); + } + + return result; +} + +char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { + switch (cnstr) { + case GPT_SAMPLER_TYPE_TOP_K: return 'k'; + case GPT_SAMPLER_TYPE_TFS_Z: return 'f'; + case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y'; + case GPT_SAMPLER_TYPE_TOP_P: return 'p'; + case GPT_SAMPLER_TYPE_MIN_P: return 'm'; + case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; + default : return '?'; + } +} + +std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { + switch (cnstr) { + case GPT_SAMPLER_TYPE_TOP_K: return "top_k"; + case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z"; + case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; + case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; + case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; + case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; default : return ""; } } -std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - {"top_k", llama_sampler_type::TOP_K}, - {"top_p", llama_sampler_type::TOP_P}, - {"typical_p", llama_sampler_type::TYPICAL_P}, - {"min_p", llama_sampler_type::MIN_P}, - {"tfs_z", llama_sampler_type::TFS_Z}, - {"temperature", llama_sampler_type::TEMPERATURE} +std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + { "top_k", GPT_SAMPLER_TYPE_TOP_K }, + { "top_p", GPT_SAMPLER_TYPE_TOP_P }, + { "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P }, + { "min_p", GPT_SAMPLER_TYPE_MIN_P }, + { "tfs_z", GPT_SAMPLER_TYPE_TFS_Z }, + { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE }, }; // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - {"top-k", llama_sampler_type::TOP_K}, - {"top-p", llama_sampler_type::TOP_P}, - {"nucleus", llama_sampler_type::TOP_P}, - {"typical-p", llama_sampler_type::TYPICAL_P}, - {"typical", llama_sampler_type::TYPICAL_P}, - {"min-p", llama_sampler_type::MIN_P}, - {"tfs-z", llama_sampler_type::TFS_Z}, - {"tfs", llama_sampler_type::TFS_Z}, - {"temp", llama_sampler_type::TEMPERATURE} + std::unordered_map sampler_alt_name_map { + { "top-k", GPT_SAMPLER_TYPE_TOP_K }, + { "top-p", GPT_SAMPLER_TYPE_TOP_P }, + { "nucleus", GPT_SAMPLER_TYPE_TOP_P }, + { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P }, + { "typical", GPT_SAMPLER_TYPE_TYPICAL_P }, + { "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P }, + { "typ", GPT_SAMPLER_TYPE_TYPICAL_P }, + { "min-p", GPT_SAMPLER_TYPE_MIN_P }, + { "tfs-z", GPT_SAMPLER_TYPE_TFS_Z }, + { "tfs", GPT_SAMPLER_TYPE_TFS_Z }, + { "temp", GPT_SAMPLER_TYPE_TEMPERATURE }, }; - std::vector sampler_types; - sampler_types.reserve(names.size()); - for (const auto & name : names) - { - auto sampler_item = sampler_canonical_name_map.find(name); - if (sampler_item != sampler_canonical_name_map.end()) - { - sampler_types.push_back(sampler_item->second); - } - else - { - if (allow_alt_names) - { - sampler_item = sampler_alt_name_map.find(name); - if (sampler_item != sampler_alt_name_map.end()) - { - sampler_types.push_back(sampler_item->second); - } - } - } - } - return sampler_types; -} + std::vector samplers; + samplers.reserve(names.size()); -std::vector llama_sampling_types_from_chars(const std::string & names_string) { - std::unordered_map sampler_name_map { - 
{'k', llama_sampler_type::TOP_K}, - {'p', llama_sampler_type::TOP_P}, - {'y', llama_sampler_type::TYPICAL_P}, - {'m', llama_sampler_type::MIN_P}, - {'f', llama_sampler_type::TFS_Z}, - {'t', llama_sampler_type::TEMPERATURE} - }; - - std::vector sampler_types; - sampler_types.reserve(names_string.size()); - for (const auto & c : names_string) { - const auto sampler_item = sampler_name_map.find(c); - if (sampler_item != sampler_name_map.end()) { - sampler_types.push_back(sampler_item->second); - } - } - return sampler_types; -} - -// no reasons to expose this function in header -static void sampler_queue( - struct llama_context * ctx_main, - const llama_sampling_params & params, - llama_token_data_array & cur_p, - size_t min_keep) { - const float temp = params.temp; - const float dynatemp_range = params.dynatemp_range; - const float dynatemp_exponent = params.dynatemp_exponent; - const int32_t top_k = params.top_k; - const float top_p = params.top_p; - const float min_p = params.min_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const std::vector & samplers_sequence = params.samplers_sequence; - - for (auto sampler_type : samplers_sequence) { - switch (sampler_type) { - case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; - case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; - case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; - case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; - case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case llama_sampler_type::TEMPERATURE: - if (dynatemp_range > 0) { - float dynatemp_min = std::max(0.0f, temp - dynatemp_range); - float dynatemp_max = std::max(0.0f, temp + dynatemp_range); - llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); - } else { - llama_sample_temp(ctx_main, &cur_p, temp); - } - break; - default : break; - } - } -} - -static llama_token llama_sampling_sample_impl( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - struct llama_context * ctx_cfg, - const int idx, - bool is_resampling) { - const llama_sampling_params & params = ctx_sampling->params; - - const float temp = params.temp; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - - std::vector original_logits; - auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits); - if (ctx_sampling->grammar != NULL && !is_resampling) { - GGML_ASSERT(!original_logits.empty()); - } - llama_token id = 0; - - if (temp < 0.0) { - // greedy sampling, with probs - llama_sample_softmax(ctx_main, &cur_p); - id = cur_p.data[0].id; - } else if (temp == 0.0) { - // greedy sampling, no probs - id = llama_sample_token_greedy(ctx_main, &cur_p); - } else { - if (mirostat == 1) { - const int mirostat_m = 100; - llama_sample_temp(ctx_main, &cur_p, temp); - id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu); - } else if (mirostat == 2) { - llama_sample_temp(ctx_main, &cur_p, temp); - id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); + for (const auto & name : names) { + auto sampler = 
sampler_canonical_name_map.find(name); + if (sampler != sampler_canonical_name_map.end()) { + samplers.push_back(sampler->second); } else { - // temperature sampling - size_t min_keep = std::max(1, params.min_keep); - - sampler_queue(ctx_main, params, cur_p, min_keep); - - id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng); - - //{ - // const int n_top = 10; - // LOG("top %d candidates:\n", n_top); - - // for (int i = 0; i < n_top; i++) { - // const llama_token id = cur_p.data[i].id; - // (void)id; // To avoid a warning that id is unused when logging is disabled. - // LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p); - // } - //} - - //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); - } - } - - if (ctx_sampling->grammar != NULL && !is_resampling) { - // Get a pointer to the logits - float * logits = llama_get_logits_ith(ctx_main, idx); - - // Create an array with a single token data element for the sampled id - llama_token_data single_token_data = {id, logits[id], 0.0f}; - llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; - - // Apply grammar constraints to the single token - llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array); - - // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY - bool is_valid = single_token_data_array.data[0].logit != -INFINITY; - - // If the token is not valid according to the grammar, perform resampling - if (!is_valid) { - LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str()); - - // Restore logits from the copy - std::copy(original_logits.begin(), original_logits.end(), logits); - - return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true); - } - } - - ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size; - - return id; -} - -static llama_token_data_array llama_sampling_prepare_impl( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - struct llama_context * ctx_cfg, - const int idx, - bool apply_grammar, - std::vector * original_logits) { - const llama_sampling_params & params = ctx_sampling->params; - - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - - const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; - const float penalty_repeat = params.penalty_repeat; - const float penalty_freq = params.penalty_freq; - const float penalty_present = params.penalty_present; - - const bool penalize_nl = params.penalize_nl; - - auto & prev = ctx_sampling->prev; - auto & cur = ctx_sampling->cur; - - // Get a pointer to the logits - float * logits = llama_get_logits_ith(ctx_main, idx); - - if (ctx_sampling->grammar != NULL && !apply_grammar) { - GGML_ASSERT(original_logits != NULL); - // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. 
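The canonical/alias name maps above (and the single-character map that follows) are how user-facing sampler selections get translated into `gpt_sampler_type` values. A minimal sketch of how a front end might drive them, assuming only the `gpt_sampler_*` declarations added to `common/sampling.h` in this change; the helper name `parse_sampler_order` and the single-string fallback are illustrative (a real front end would split a comma-separated list first):

```cpp
// Illustrative only: turn user input such as "kfypmt" or "top_k" into a sampler order.
// Assumes the gpt_sampler_* declarations added to common/sampling.h by this change.
#include "sampling.h"

#include <cstdio>
#include <string>
#include <vector>

static std::vector<gpt_sampler_type> parse_sampler_order(const std::string & input) {
    // try the single-character shorthand first, e.g. "kfypmt"
    std::vector<gpt_sampler_type> order = gpt_sampler_types_from_chars(input);

    if (order.empty()) {
        // fall back to full names; allow_alt_names also accepts aliases such as "nucleus" or "temp"
        order = gpt_sampler_types_from_names({ input }, /* allow_alt_names */ true);
    }

    for (const auto & t : order) {
        printf("sampler: %s ('%c')\n", gpt_sampler_type_to_str(t).c_str(), gpt_sampler_type_to_chr(t));
    }

    return order;
}
```

Note that both lookup helpers silently drop unknown entries, matching the loops in this hunk: only names and characters found in the maps are pushed into the result.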
- *original_logits = {logits, logits + n_vocab}; - } - - // apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - if (ctx_cfg) { - float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); - llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); - } - - cur.resize(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - - // apply penalties - const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; - const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); - if (penalty_tokens_used_size) { - const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; - - llama_sample_repetition_penalties(ctx_main, &cur_p, - penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, - penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); - - if (!penalize_nl) { - for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { - cur_p.data[idx].logit = nl_logit; - break; + if (allow_alt_names) { + sampler = sampler_alt_name_map.find(name); + if (sampler != sampler_alt_name_map.end()) { + samplers.push_back(sampler->second); } } } } - // apply grammar checks before sampling logic - if (apply_grammar && ctx_sampling->grammar != NULL) { - llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p); + return samplers; +} + +std::vector gpt_sampler_types_from_chars(const std::string & chars) { + std::unordered_map sampler_name_map = { + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } + }; + + std::vector samplers; + samplers.reserve(chars.size()); + + for (const auto & c : chars) { + const auto sampler = sampler_name_map.find(c); + if (sampler != sampler_name_map.end()) { + samplers.push_back(sampler->second); + } } - return cur_p; -} - -llama_token llama_sampling_sample( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - struct llama_context * ctx_cfg, - const int idx) { - // Call the implementation function with is_resampling set to false by default - return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false); -} - -llama_token_data_array llama_sampling_prepare( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - struct llama_context * ctx_cfg, - const int idx, - bool apply_grammar, - std::vector * original_logits) { - return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits); -} - -void llama_sampling_accept( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - llama_token id, - bool apply_grammar) { - ctx_sampling->prev.erase(ctx_sampling->prev.begin()); - ctx_sampling->prev.push_back(id); - - if 
(ctx_sampling->grammar != NULL && apply_grammar) { - llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id); - } + return samplers; } diff --git a/common/sampling.h b/common/sampling.h index eeaa53b8b..0a4461fab 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,159 +2,80 @@ #include "llama.h" -#include "grammar-parser.h" - -#include -#include -#include -#include - -// sampler types -enum class llama_sampler_type : char { - TOP_K = 'k', - TOP_P = 'p', - MIN_P = 'm', - TFS_Z = 'f', - TYPICAL_P = 'y', - TEMPERATURE = 't' -}; - -// sampling parameters -typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context - - std::vector samplers_sequence = { - llama_sampler_type::TOP_K, - llama_sampler_type::TFS_Z, - llama_sampler_type::TYPICAL_P, - llama_sampler_type::TOP_P, - llama_sampler_type::MIN_P, - llama_sampler_type::TEMPERATURE - }; - - std::string grammar; // optional BNF-like grammar to constrain sampling - - // Classifier-Free Guidance - // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // how strong is guidance - - std::unordered_map logit_bias; // logit bias for specific tokens - - std::vector penalty_prompt_tokens; - bool use_penalty_prompt_tokens = false; -} llama_sampling_params; - -// general sampler context -// TODO: move to llama.h -struct llama_sampling_context { - // parameters that will be used for sampling - llama_sampling_params params; - - // mirostat sampler state - float mirostat_mu; - - llama_grammar * grammar; - - // internal - grammar_parser::parse_state parsed_grammar; - - // TODO: replace with ring-buffer - std::vector prev; - std::vector cur; - size_t n_valid; // Number of correct top tokens with correct probabilities. - - std::mt19937 rng; -}; - #include "common.h" -// Create a new sampling context instance. 
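With `llama_sampling_params`/`llama_sampling_context` removed, the simpler examples further down (`batched`, `batched.swift`, `gritlm`) stop building candidate arrays by hand and drive a bare `llama_sampler` chain instead. A condensed sketch of that pattern, assuming `model` and `ctx` are already created; the function name `run_sampling` and the constants (which mirror the batched example) are illustrative:

```cpp
// Illustrative sketch of the bare llama_sampler chain used by the updated examples.
// Assumes `model` and `ctx` are already set up; constants mirror the batched example.
#include "llama.h"

static void run_sampling(llama_model * model, llama_context * ctx, int n_predict) {
    auto sparams = llama_sampler_chain_default_params();

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9f, /* min_keep */ 1));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4f));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234)); // seed

    for (int i = 0; i < n_predict; i++) {
        // samples from llama_get_logits_ith(ctx, -1), i.e. the last decoded position
        const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);

        if (llama_token_is_eog(model, new_token_id)) {
            break;
        }

        // ... append new_token_id to the batch and llama_decode() as usual ...
    }

    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);

    llama_sampler_free(smpl);
}
```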
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params); +#include +#include -void llama_sampling_free(struct llama_sampling_context * ctx); - -// Reset the sampler context -// - clear prev tokens -// - reset grammar -void llama_sampling_reset(llama_sampling_context * ctx); - -// Set the sampler seed -void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed); - -// Copy the sampler context -void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); - -// Get the last sampled token -llama_token llama_sampling_last(llama_sampling_context * ctx); - -// Get a string representation of the last sampled tokens -std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n); - -// Print sampling parameters into a string -std::string llama_sampling_print(const llama_sampling_params & params); - -// Print sampling order into a string -std::string llama_sampling_order_print(const llama_sampling_params & params); - -std::string llama_sampling_type_to_str(llama_sampler_type sampler_type); - -std::vector llama_sampling_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector llama_sampling_types_from_chars(const std::string & names_string); - -// this is a common sampling function used across the examples for convenience -// it can serve as a starting point for implementing your own sampling function -// Note: When using multiple sequences, it is the caller's responsibility to call -// llama_sampling_reset when a sequence ends +// gpt_sampler extends llama_sampler with additional functionality: // -// required: -// - ctx_main: context to use for sampling -// - ctx_sampling: sampling-specific context +// - grammar support +// - custom sampler logic based on the parameters +// - history of the last accepted tokens +// - performance metrics // -// optional: -// - ctx_cfg: context to use for classifier-free guidance -// - idx: sample from llama_get_logits_ith(ctx, idx) +// This goal is to have a common implementation of the sampling logic shared across the examples. +// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more +// complex (top-k, top-p, etc). // -// returns: -// - token: sampled token -// - candidates: vector of candidate tokens +// Another example is related to the grammar. In general, the grammar constraints applied on the full +// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled +// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the +// grammar constraints are applied to the full vocabulary and the token is resampled. +// +// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can +// be moved into the core llama library. +// +// For convenience, the gpt_sampler also maintains a container with the current candidate tokens. +// This can be used to access the probabilities of the rest of the non-sampled tokens. +// +// TODO: measure grammar performance // -llama_token llama_sampling_sample( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - struct llama_context * ctx_cfg, - int idx = -1); -// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters. 
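The comment block above describes the intended flow: build a `gpt_sampler` next to the context, sample, and report accepted tokens back so that both the grammar state and the penalty history stay in sync. A minimal sketch of that loop, assuming a loaded `model`/`ctx` and a populated `gpt_params` from `common.h`; the surrounding function and the example grammar string are hypothetical:

```cpp
// Hypothetical caller showing the gpt_sampler flow described in the comment block above.
// Assumes `model`, `ctx` and a populated `gpt_params params` already exist.
#include "common.h"
#include "sampling.h"

#include <cstdio>

static void sample_with_grammar(llama_model * model, llama_context * ctx, gpt_params & params, int n_predict) {
    params.sparams.grammar = "root ::= [0-9]+"; // optional GBNF constraint (illustrative)

    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    for (int i = 0; i < n_predict; i++) {
        // fast path: sample first, then verify against the grammar and resample only on mismatch;
        // pass grammar_first=true when every candidate must satisfy the grammar up front
        const llama_token id = gpt_sampler_sample(smpl, ctx, /* idx */ -1, /* grammar_first */ false);

        // keep the grammar state and the token history (penalties, prev_str) in sync
        gpt_sampler_accept(smpl, id, /* accept_grammar */ true);

        if (llama_token_is_eog(model, id)) {
            break;
        }

        // ... decode `id` and continue ...
    }

    fprintf(stderr, "last token: %d, recent text: %s\n",
            gpt_sampler_last(smpl), gpt_sampler_prev_str(smpl, ctx, 8).c_str());

    gpt_perf_print(ctx, smpl);

    gpt_sampler_free(smpl);
}
```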
-llama_token_data_array llama_sampling_prepare( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - struct llama_context * ctx_cfg, - int idx = 0, - bool apply_grammar = true, - std::vector * original_logits = nullptr); +struct gpt_sampler; -void llama_sampling_accept( - struct llama_sampling_context * ctx_sampling, - struct llama_context * ctx_main, - llama_token id, - bool apply_grammar); +// llama_sampler API overloads + +struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params); + +void gpt_sampler_free(struct gpt_sampler * gsmpl); + +// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar +void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar); +void gpt_sampler_reset (struct gpt_sampler * gsmpl); +struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl); + +// arguments can be nullptr to skip printing +void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl); + +// extended sampling implementation: +// +// - set logits +// - apply the configured sampler chain +// - check if the token fits the grammar (if any) +// - if not: resample by first applying the grammar constraints and then sampling again (slower path) +// +// if grammar_first is true, the grammar is applied before the samplers (slower) +// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar +// +llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); + +// helpers + +// access the internal list of current candidate tokens +llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl); + +// get the last accepted token +llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl); + +// print the sampler chain into a string +std::string gpt_sampler_print(const struct gpt_sampler * gsmpl); + +// get a string representation of the last accepted tokens +std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n); + +char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr); +std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr); + +std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector gpt_sampler_types_from_chars(const std::string & chars); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0a9bbc829..ca473244e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -302,6 +302,8 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_FIRST, gguf.MODEL_TENSOR.TIME_MIX_W1, gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, ) ) or not new_name.endswith(".weight") diff --git a/docs/build.md b/docs/build.md index 152d46d6f..faa0ecfa4 100644 --- a/docs/build.md +++ b/docs/build.md @@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to ### Android To read documentation for how to build on Android, [click here](./android.md) + +### Arm CPU optimized mulmat kernels + +Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. 
Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. + +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 25a950ea5..a91e7f4bd 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -28,9 +29,7 @@ static std::vector parse_list(char * p) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); LOG_TEE("\n"); @@ -39,8 +38,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } @@ -210,7 +208,8 @@ int main(int argc, char ** argv) { } } - llama_print_timings(ctx); + LOG_TEE("\n"); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 616494d2d..9f7c49492 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo print("Failed to load model") exit(1) } - defer { llama_free_model(model) } @@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true) let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) var context_params = llama_context_default_params() -context_params.seed = 1234 context_params.n_ctx = n_kv_req context_params.n_batch = UInt32(max(n_len, n_parallel)) context_params.n_threads = 8 @@ -48,11 +46,26 @@ guard context != nil else { print("Failed to initialize context") exit(1) } - defer { llama_free(context) } +var sparams = llama_sampler_chain_default_params() + +let smpl = llama_sampler_chain_init(sparams) +guard smpl != nil else { + print("Failed to initialize sampling") + exit(1) +} +defer { + llama_sampler_free(smpl) +} + +llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40)); +llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); +llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4)); +llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234)); + let n_ctx = llama_n_ctx(context) print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") @@ -125,32 +138,7 @@ while n_cur <= n_len { continue } - var n_vocab = llama_n_vocab(model) - var logits = llama_get_logits_ith(context, i_batch[i]) - - var candidates: 
[llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab)) - - for token_id in 0 ..< n_vocab { - candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0)) - } - - var candidates_p: llama_token_data_array = .init( - data: &candidates, - size: candidates.count, - sorted: false - ) - - let top_k: Int32 = 40 - let top_p: Float = 0.9 - let temp: Float = 0.4 - - llama_sample_top_k(context, &candidates_p, top_k, 1) - llama_sample_top_p(context, &candidates_p, top_p, 1) - llama_sample_temp(context, &candidates_p, temp) - - let new_token_id = llama_sample_token(context, &candidates_p) - - // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) // is it an end of stream? -> mark the stream as finished if llama_token_is_eog(model, new_token_id) || n_cur == n_len { @@ -210,9 +198,10 @@ if n_parallel > 1 { let t_main_end = ggml_time_us() -print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n") +print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_print_timings(context) +llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) +llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 53fbfb0a8..5d32153fe 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,15 +1,13 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include -#include #include #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); LOG_TEE("\n"); @@ -21,8 +19,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } @@ -65,6 +62,15 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); + auto sparams = llama_sampler_chain_default_params(); + + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k)); + llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep)); + llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp)); + llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); + if (ctx == NULL) { fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); return 1; @@ -164,29 +170,7 @@ int main(int argc, char ** argv) { continue; } - auto n_vocab = llama_n_vocab(model); - auto * logits = llama_get_logits_ith(ctx, i_batch[i]); - - std::vector candidates; - candidates.reserve(n_vocab); - - for (llama_token token_id = 
0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - const int top_k = 40; - const float top_p = 0.9f; - const float temp = 0.4f; - - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temp (ctx, &candidates_p, temp); - - const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); - - //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); // is it an end of generation? -> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { @@ -244,12 +228,15 @@ int main(int argc, char ** argv) { LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - llama_print_timings(ctx); + LOG_TEE("\n"); + llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); fprintf(stderr, "\n"); llama_batch_free(batch); + llama_sampler_free(smpl); llama_free(ctx); llama_free_model(model); diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index a68268388..569b6c38f 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -35,9 +36,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); @@ -390,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 6ec3141af..05c66856c 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -12,12 +12,9 @@ #include #include +#include #include -#include #include -#include -#include -#include #define DEBUG_POS 5 diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index b05aa006e..da7c79253 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { return 1; } @@ -90,13 +90,7 @@ int main(int argc, char ** argv) { print_build_info(); - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, 
"%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); + LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); llama_backend_init(); llama_numa_init(params.numa); @@ -313,8 +307,10 @@ int main(int argc, char ** argv) { if (notArray) fprintf(stdout, "\n}\n"); } + LOG_TEE("\n"); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + // clean up - llama_print_timings(ctx); llama_batch_free(batch); llama_free(ctx); llama_free_model(model); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 5e89988e2..bc7203143 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -144,15 +145,12 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } print_build_info(); - std::mt19937 rng(params.seed); - llama_backend_init(); llama_numa_init(params.numa); @@ -183,7 +181,8 @@ int main(int argc, char ** argv) { return 1; } - llama_print_timings(ctx); + LOG_TEE("\n"); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); llama_free(ctx); llama_free_model(model); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 8df457e21..ff324926a 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "ggml.h" #include "ggml-alloc.h" @@ -391,9 +392,7 @@ struct lora_merge_ctx { } }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); printf("\nNOTE: output model is F16\n"); @@ -403,8 +402,7 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 48a705e15..7493af9d3 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -1,9 +1,5 @@ -#define LLAMA_API_INTERNAL - -#include "grammar-parser.h" -#include "ggml.h" -#include "llama.h" #include "unicode.h" +#include "llama-grammar.h" #include #include @@ -12,29 +8,28 @@ #include #include -static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { - auto decoded = decode_utf8(input_str, {}); - const auto & code_points = decoded.first; +static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { + const auto cpts = unicode_cpts_from_utf8(input_str); const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); + llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); size_t pos = 0; - for (auto it = code_points.begin(), end = code_points.end() - 
1; it != end; ++it) { - const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy + for (const auto & cpt : cpts) { + const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy - llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); + llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); - if (cur_stacks.empty()) { + if (stacks_cur.empty()) { error_pos = pos; - error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; - cur_stacks = prev_stacks; + error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; + stacks_cur = stacks_prev; return false; } ++pos; } - for (const auto & stack : cur_stacks) { + for (const auto & stack : stacks_cur) { if (stack.empty()) { return true; } @@ -85,27 +80,7 @@ int main(int argc, char** argv) { grammar_str = buffer.str(); } - // Parse the GBNF grammar - auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); - - // will be empty (default) if there are parse errors - if (parsed_grammar.rules.empty()) { - fprintf(stdout, "%s: failed to parse grammar\n", __func__); - return 1; - } - - // Ensure that there is a "root" node. - if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) { - fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__); - return 1; - } - - std::vector grammar_rules(parsed_grammar.c_rules()); - - // Create the LLAMA grammar - auto grammar = llama_grammar_init( - grammar_rules.data(), - grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); if (grammar == nullptr) { throw std::runtime_error("Failed to initialize llama_grammar"); } @@ -122,7 +97,7 @@ int main(int argc, char** argv) { // Validate the input string against the grammar size_t error_pos; std::string error_msg; - bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg); + bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg); if (is_valid) { fprintf(stdout, "Input string is valid according to the grammar.\n"); @@ -131,7 +106,7 @@ int main(int argc, char** argv) { } // Clean up - llama_grammar_free(grammar); + llama_grammar_free_impl(grammar); return 0; } diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt new file mode 100644 index 000000000..c94cda776 --- /dev/null +++ b/examples/gen-docs/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-gen-docs) +add_executable(${TARGET} gen-docs.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp new file mode 100644 index 000000000..b6d4725fd --- /dev/null +++ b/examples/gen-docs/gen-docs.cpp @@ -0,0 +1,52 @@ +#include "arg.h" +#include "common.h" + +#include +#include + +// Export usage message (-h) to markdown format + +static void export_md(std::string fname, llama_example ex) { + std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); + + gpt_params params; + auto ctx_arg = gpt_params_parser_init(params, ex); + + file << "| Argument | Explanation |\n"; + file << "| -------- | ----------- |\n"; + for (auto & opt : ctx_arg.options) { + file << "| `"; + // args + for (const auto & arg : opt.args) { + if (arg == opt.args.front()) { + file << arg; + if (opt.args.size() > 1) file << ", "; + } else { + file << arg << (arg 
!= opt.args.back() ? ", " : ""); + } + } + // value hint + if (opt.value_hint) { + std::string md_value_hint(opt.value_hint); + string_replace_all(md_value_hint, "|", "\\|"); + file << " " << md_value_hint; + } + if (opt.value_hint_2) { + std::string md_value_hint_2(opt.value_hint_2); + string_replace_all(md_value_hint_2, "|", "\\|"); + file << " " << md_value_hint_2; + } + // help text + std::string md_help(opt.help); + string_replace_all(md_help, "\n", "
"); + string_replace_all(md_help, "|", "\\|"); + file << "` | " << md_help << " |\n"; + } +} + +int main(int, char **) { + export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); + export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + + return 0; +} diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 2c61c2e1e..14c715202 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -9,7 +10,7 @@ static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { std::vector> result; - const llama_model * mdl = llama_get_model(ctx); + const llama_model * model = llama_get_model(ctx); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); @@ -18,16 +19,16 @@ static std::vector> encode(llama_context * ctx, const std::ve const std::string input_string = instruction + sentences[i]; - std::vector inputs = llama_tokenize(mdl, input_string, true, false); + std::vector inputs = llama_tokenize(model, input_string, true, false); const int32_t n_toks = inputs.size(); // GritLM seems to have EOS = "" // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 - // inputs.push_back(llama_token_eos(mdl)); + // inputs.push_back(llama_token_eos(model)); // we want to ignore instruction tokens for mean pooling - const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size(); + const int32_t n_inst = llama_tokenize(model, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample @@ -51,7 +52,7 @@ static std::vector> encode(llama_context * ctx, const std::ve llama_decode(ctx, batch); // get embedding dimensions - uint64_t n_embd = llama_n_embd(mdl); + uint64_t n_embd = llama_n_embd(model); // allocate embedding output std::vector emb_unorm(n_embd, 0.0f); @@ -92,11 +93,11 @@ static std::vector> encode(llama_context * ctx, const std::ve return result; } -static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) { +static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) { std::string result; - const llama_model * mdl = llama_get_model(ctx); - llama_token eos_token = llama_token_eos(mdl); + const llama_model * model = llama_get_model(ctx); + llama_token eos_token = llama_token_eos(model); llama_kv_cache_clear(ctx); llama_set_embeddings(ctx, false); @@ -104,28 +105,24 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); - std::vector inputs = llama_tokenize(mdl, prompt, false, true); + std::vector inputs = llama_tokenize(model, prompt, false, true); int32_t i_current_token = 0; while (true) { llama_batch_clear(bat); - auto n_inputs = (int32_t)inputs.size(); - for (int32_t i = 0; i < n_inputs; i++) { - llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + { + const int32_t n_inputs = inputs.size(); + + for (int32_t i = 0; i < n_inputs; i++) { + llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + } } inputs.clear(); llama_decode(ctx, bat); - auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1); - auto candidates = std::vector(llama_n_vocab(mdl)); - auto n_candidates = (int32_t)candidates.size(); - for (int32_t token = 0; token < n_candidates; token++) { - candidates[token] = llama_token_data{ 
token, logits[token], 0.0f }; - } - auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false }; + llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); - llama_token token = llama_sample_token_greedy(ctx, &candidates_p); if (token == eos_token) { break; } @@ -157,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -167,10 +163,18 @@ int main(int argc, char * argv[]) { llama_backend_init(); - llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); // create generation context - llama_context * ctx = llama_new_context_with_model(mdl, cparams); + llama_context * ctx = llama_new_context_with_model(model, cparams); + + auto sparams = llama_sampler_chain_default_params(); + + sparams.no_perf = false; + + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); // ### Embedding/Representation ### // samples taken from: https://github.com/ContextualAI/gritlm#basic @@ -191,7 +195,7 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const int n_embd = llama_n_embd(mdl); + const int n_embd = llama_n_embd(model); const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); @@ -208,11 +212,12 @@ int main(int argc, char * argv[]) { // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction { const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. 
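
Several examples in this patch (gritlm here, passkey and save-load-state further down) converge on the same pattern: build a `llama_sampler` chain once, then let `llama_sampler_sample()` pick the next token instead of filling a `llama_token_data` array by hand. A minimal sketch of that loop, assuming a model and context are already loaded; `generate_greedy` is an illustrative name, not an API:

```cpp
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

// Greedy generation with the sampler-chain API; assumes `ctx` and `model` are already set up.
static std::string generate_greedy(llama_context * ctx, const llama_model * model,
                                   std::vector<llama_token> tokens, int n_predict) {
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

    std::string out;
    int n_pos = 0;

    while (n_predict-- > 0) {
        llama_batch_clear(batch);
        for (size_t i = 0; i < tokens.size(); ++i) {
            // request logits only for the last token of the batch
            llama_batch_add(batch, tokens[i], n_pos++, { 0 }, i == tokens.size() - 1);
        }
        tokens.clear();

        if (llama_decode(ctx, batch) != 0) {
            break;
        }

        const llama_token id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
        if (llama_token_is_eog(model, id)) {
            break;
        }

        out += llama_token_to_piece(ctx, id);
        tokens.push_back(id); // feed the sampled token back in on the next iteration
    }

    llama_batch_free(batch);
    llama_sampler_free(smpl);
    return out;
}
```
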
Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n"; - std::string response = generate(ctx, prompt, true); + std::string response = generate(ctx, smpl, prompt, true); } + llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(mdl); + llama_free_model(model); llama_backend_free(); return 0; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 83b85d72b..032a90136 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -17,9 +18,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s \\\n" " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" @@ -579,8 +578,7 @@ int main(int argc, char ** argv) { params.logits_all = true; params.verbosity = 1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } @@ -638,7 +636,8 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); - llama_print_timings(ctx); + LOG_TEE("\n"); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); llama_free(ctx); llama_free_model(model); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 05700c1d5..9a527e244 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -1,8 +1,8 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "sampling.h" #include "llama.h" -#include "grammar-parser.h" #include #include @@ -34,6 +34,7 @@ static llama_context ** g_ctx; static llama_model ** g_model; +static gpt_sampler ** g_smpl; static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; @@ -81,7 +82,7 @@ static void write_logfile( yaml_dump_string_multiline(logfile, "output", output.c_str()); yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - llama_dump_timing_info_yaml(logfile, ctx); + llama_perf_dump_yaml(logfile, ctx); fclose(logfile); } @@ -93,7 +94,7 @@ static void sigint_handler(int signo) { } else { console::cleanup(); printf("\n"); - llama_print_timings(*g_ctx); + gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); } @@ -103,14 +104,14 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; - llama_sampling_params & sparams = params.sparams; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { return 1; } + auto & sparams = params.sparams; + #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("infill", "log")); LOG_TEE("Log start\n"); @@ -156,26 +157,21 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + print_build_info(); - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - LOG_TEE("%s: seed = %u\n", __func__, params.seed); - - std::mt19937 
rng(params.seed); + LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_model * model = nullptr; + llama_context * ctx = nullptr; + gpt_sampler * smpl = nullptr; g_model = &model; g_ctx = &ctx; + g_smpl = &smpl; // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); @@ -305,16 +301,11 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("sampling: \n%s\n", sparams.print().c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); LOG_TEE("\n##### Infill mode #####\n\n"); - if (params.infill) { - printf("\n************\n"); - printf("no need to specify '--infill', always running infill\n"); - printf("************\n\n"); - } if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -349,7 +340,7 @@ int main(int argc, char ** argv) { std::vector embd; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); + smpl = gpt_sampler_init(model, sparams); while (n_remain != 0 || params.interactive) { // predict @@ -421,11 +412,11 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); + const llama_token id = gpt_sampler_sample(smpl, ctx, -1); - llama_sampling_accept(ctx_sampling, ctx, id, true); + gpt_sampler_accept(smpl, id, true); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); + // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -444,7 +435,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + gpt_sampler_accept(smpl, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -476,7 +467,7 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); @@ -542,7 +533,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { + else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { LOG("found EOS token\n"); if (params.interactive) { @@ -615,7 +606,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - llama_sampling_reset(ctx_sampling); + gpt_sampler_reset(smpl); } is_interacting = false; } @@ -638,13 +629,14 @@ int main(int argc, char ** argv) { fflush(stdout); } - llama_print_timings(ctx); + LOG_TEE("\n"); + gpt_perf_print(ctx, smpl); 
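
Argument parsing now goes through `gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_*, ...)`, so each tool registers only the options that apply to it and can supply its own usage printer. A bare-bones `main()` wired up the same way (the usage string is illustrative, and model loading is left as a hedged comment):

```cpp
#include "arg.h"
#include "common.h"
#include "llama.h"

#include <cstdio>

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n  %s -m model.gguf -p \"hello\"\n\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // parse only the arguments registered for the "common" example set;
    // on failure the parser prints its own error plus the usage callback
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    llama_backend_init();

    // model/context setup would follow here (most examples use the common
    // init helper for this; omitted in this sketch)

    llama_backend_free();
    return 0;
}
```
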
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); llama_free(ctx); llama_free_model(model); - llama_sampling_free(ctx_sampling); + gpt_sampler_free(smpl); llama_backend_free(); #ifndef LOG_DISABLE_LOGS diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index fe1802b51..d7db5af72 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_print_timings(ctx); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); llama_free(ctx); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2aafe2316..f611809c6 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo LOGi("Using %d threads", n_threads); llama_context_params ctx_params = llama_context_default_params(); - ctx_params.seed = 1234; - ctx_params.n_ctx = 2048; + + ctx_params.n_ctx = 2048; ctx_params.n_threads = n_threads; ctx_params.n_threads_batch = n_threads; @@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( return env->NewStringUTF(result.str().c_str()); } -extern "C" -JNIEXPORT void JNICALL -Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { - llama_batch_free(*reinterpret_cast(batch_pointer)); -} - extern "C" JNIEXPORT jlong JNICALL Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { @@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, return reinterpret_cast(batch); } +extern "C" +JNIEXPORT void JNICALL +Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) { + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = true; + llama_sampler * smpl = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + + return reinterpret_cast(smpl); +} + +extern "C" +JNIEXPORT void JNICALL +Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) { + llama_sampler_free(reinterpret_cast(sampler_pointer)); +} + extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) { @@ -381,31 +398,21 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( jobject, jlong context_pointer, jlong batch_pointer, + jlong sampler_pointer, jint n_len, jobject intvar_ncur ) { const auto context = reinterpret_cast(context_pointer); - const auto batch = reinterpret_cast(batch_pointer); + const auto batch = reinterpret_cast(batch_pointer); + const auto sampler = reinterpret_cast(sampler_pointer); const auto model = llama_get_model(context); if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); - auto n_vocab = llama_n_vocab(model); - auto logits = llama_get_logits_ith(context, batch->n_tokens - 1); - - std::vector candidates; - 
candidates.reserve(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - // sample the most likely token - const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); + const auto new_token_id = llama_sampler_sample(sampler, context, -1); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index 6c63e54e0..cf520e459 100644 --- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -45,8 +45,10 @@ class LLamaAndroid { private external fun free_context(context: Long) private external fun backend_init(numa: Boolean) private external fun backend_free() - private external fun free_batch(batch: Long) private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long + private external fun free_batch(batch: Long) + private external fun new_sampler(): Long + private external fun free_sampler(sampler: Long) private external fun bench_model( context: Long, model: Long, @@ -69,6 +71,7 @@ class LLamaAndroid { private external fun completion_loop( context: Long, batch: Long, + sampler: Long, nLen: Int, ncur: IntVar ): String? @@ -101,8 +104,11 @@ class LLamaAndroid { val batch = new_batch(512, 0, 1) if (batch == 0L) throw IllegalStateException("new_batch() failed") + val sampler = new_sampler() + if (sampler == 0L) throw IllegalStateException("new_sampler() failed") + Log.i(tag, "Loaded model $pathToModel") - threadLocalState.set(State.Loaded(model, context, batch)) + threadLocalState.set(State.Loaded(model, context, batch, sampler)) } else -> throw IllegalStateException("Model already loaded") } @@ -114,7 +120,7 @@ class LLamaAndroid { is State.Loaded -> { val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) while (ncur.value <= nlen) { - val str = completion_loop(state.context, state.batch, nlen, ncur) + val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur) if (str == null) { break } @@ -138,6 +144,7 @@ class LLamaAndroid { free_context(state.context) free_model(state.model) free_batch(state.batch) + free_sampler(state.sampler); threadLocalState.set(State.Idle) } @@ -161,7 +168,7 @@ class LLamaAndroid { private sealed interface State { data object Idle: State - data class Loaded(val model: Long, val context: Long, val batch: Long): State + data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State } // Enforce only one instance of Llm. 
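
On both mobile bindings the sampler is created once, its pointer is threaded through the bridge, and it is freed alongside the batch and context. A small RAII guard on the C++ side (purely illustrative, not part of this patch) ties that free to scope exit:

```cpp
#include "llama.h"

#include <memory>

// Free a llama_sampler automatically when the owning unique_ptr goes out of scope.
struct llama_sampler_deleter {
    void operator()(llama_sampler * s) const { llama_sampler_free(s); }
};

using sampler_ptr = std::unique_ptr<llama_sampler, llama_sampler_deleter>;

// Build the same greedy chain the JNI binding uses, but with scoped ownership.
static sampler_ptr make_greedy_sampler() {
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = true;

    sampler_ptr smpl(llama_sampler_chain_init(sparams));
    llama_sampler_chain_add(smpl.get(), llama_sampler_init_greedy());
    return smpl;
}
```

The raw-pointer version used by the bindings is equivalent; the guard only removes the need to remember the matching `llama_sampler_free` call.
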
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 48b7840ae..dcd9803a2 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama actor LlamaContext { private var model: OpaquePointer private var context: OpaquePointer + private var sampling: UnsafeMutablePointer private var batch: llama_batch private var tokens_list: [llama_token] var is_done: Bool = false @@ -42,9 +43,15 @@ actor LlamaContext { self.tokens_list = [] self.batch = llama_batch_init(512, 0, 1) self.temporary_invalid_cchars = [] + let sparams = llama_sampler_chain_default_params() + self.sampling = llama_sampler_chain_init(sparams) + llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4)) + llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax()) + llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234)) } deinit { + llama_sampler_free(sampling) llama_batch_free(batch) llama_free(context) llama_free_model(model) @@ -69,7 +76,6 @@ actor LlamaContext { print("Using \(n_threads) threads") var ctx_params = llama_context_default_params() - ctx_params.seed = 1234 ctx_params.n_ctx = 2048 ctx_params.n_threads = Int32(n_threads) ctx_params.n_threads_batch = Int32(n_threads) @@ -144,20 +150,7 @@ actor LlamaContext { func completion_loop() -> String { var new_token_id: llama_token = 0 - let n_vocab = llama_n_vocab(model) - let logits = llama_get_logits_ith(context, batch.n_tokens - 1) - - var candidates = Array() - candidates.reserveCapacity(Int(n_vocab)) - - for token_id in 0.. #include @@ -40,11 +41,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n return true; } -static const char * sample(struct llama_sampling_context * ctx_sampling, +static const char * sample(struct gpt_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); - llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); + gpt_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; @@ -112,9 +113,7 @@ struct llava_context { struct llama_model * model = NULL; }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\n example usage:\n"); LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); @@ -191,15 +190,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG_TEE("\n"); - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); - if (!ctx_sampling) { + struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + if (!smpl) { fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); exit(1); } std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, 
"###")) break; // Yi-VL behavior @@ -211,7 +210,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fflush(stdout); } - llama_sampling_free(ctx_sampling); + gpt_sampler_free(smpl); printf("\n"); } @@ -280,8 +279,7 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } @@ -293,7 +291,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv, {}); + print_usage(argc, argv); return 1; } auto model = llava_init(¶ms); @@ -310,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_print_timings(ctx_llava->ctx_llama); + llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -327,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_print_timings(ctx_llava->ctx_llama); + llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f500ea5b9..3475bbce5 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -1,9 +1,11 @@ -#include "ggml.h" +#include "arg.h" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" +#include "ggml.h" #include #include @@ -163,11 +165,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e LOG_TEE("%s: image token past: %d\n", __func__, n_past); } -static const char * sample(struct llama_sampling_context * ctx_sampling, +static const char * sample(struct gpt_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); - llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); + gpt_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; @@ -214,7 +216,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri return ctx_llava; } -static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ +static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -238,13 +240,13 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla LOG_TEE("\n"); - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); - return ctx_sampling; + struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + return smpl; } -static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ +static const char * llama_loop(struct 
llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){ - const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); return tmp; } @@ -253,8 +255,7 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { return 1; } @@ -266,7 +267,6 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty())) { - gpt_params_print_usage(argc, argv, params); show_additional_info(argc, argv); return 1; } @@ -278,12 +278,12 @@ int main(int argc, char ** argv) { if (!params.prompt.empty()) { LOG_TEE("%s\n", params.prompt.c_str()); LOG_TEE(""); - auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); + auto smpl = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; std::string response = ""; bool have_tmp = false; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + auto tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0){ if(!have_tmp)continue; @@ -296,18 +296,18 @@ int main(int argc, char ** argv) { fflush(stdout); } - llama_sampling_free(ctx_sampling); + gpt_sampler_free(smpl); }else { while (true) { LOG_TEE(""); std::string prompt; std::getline(std::cin, prompt); LOG_TEE(""); - auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true); + auto smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + auto tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior @@ -315,11 +315,11 @@ int main(int argc, char ** argv) { if (strstr(response.c_str(), "")) break; // minicpm-v fflush(stdout); } - llama_sampling_free(ctx_sampling); + gpt_sampler_free(smpl); } } printf("\n"); - llama_print_timings(ctx_llava->ctx_llama); + llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 81cf1629c..de8b792f2 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,7 +1,8 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" #include "llama.h" -#include #include #include #include @@ -37,8 +38,7 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -118,7 +118,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); // verification n-grams std::vector ngrams_cur(G); @@ -159,9 +159,9 @@ int main(int argc, char ** argv) { // sample first token { - id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0); + id = 
gpt_sampler_sample(smpl, ctx, 0); - llama_sampling_accept(ctx_sampling, ctx, id, true); + gpt_sampler_accept(smpl, id, true); { const std::string token_str = llama_token_to_piece(ctx, id); @@ -284,9 +284,9 @@ int main(int argc, char ** argv) { } // sample the next token - id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch); + id = gpt_sampler_sample(smpl, ctx, i_batch); - llama_sampling_accept(ctx_sampling, ctx, id, true); + gpt_sampler_accept(smpl, id, true); // print { @@ -361,7 +361,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); } } else { for (int i = 0; i < W; i++) { @@ -468,10 +468,12 @@ int main(int argc, char ** argv) { LOG_TEE("n_predict = %d\n", n_predict); LOG_TEE("n_accept = %d\n", n_accept); - llama_print_timings(ctx); + LOG_TEE("\n"); + gpt_perf_print(ctx, smpl); + + gpt_sampler_free(smpl); llama_kv_cache_view_free(&kvc_view); - llama_sampling_free(ctx_sampling); llama_batch_free(batch); diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 5f04709f5..33287c02c 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -1,7 +1,8 @@ -#include "ggml.h" -#include "llama.h" +#include "arg.h" #include "common.h" #include "ngram-cache.h" +#include "ggml.h" +#include "llama.h" #include #include @@ -13,8 +14,7 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -40,4 +40,6 @@ int main(int argc, char ** argv){ fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); + + return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 400f3e0b0..f299d68a9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -1,8 +1,9 @@ -#include "ggml.h" +#include "arg.h" #include "common.h" -#include "llama.h" #include "log.h" #include "ngram-cache.h" +#include "llama.h" +#include "ggml.h" #include #include @@ -15,8 +16,7 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index d53a9828c..fff44a499 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,21 +1,20 @@ +#include "arg.h" #include "ggml.h" -#include "llama.h" #include "common.h" #include "ngram-cache.h" +#include "sampling.h" +#include "llama.h" -#include #include #include #include #include #include -#include int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -106,7 +105,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); std::vector 
draft; @@ -130,9 +129,9 @@ int main(int argc, char ** argv){ int i_dft = 0; while (true) { // sample from the target model - llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft); + llama_token id = gpt_sampler_sample(smpl, ctx, i_dft); - llama_sampling_accept(ctx_sampling, ctx, id, true); + gpt_sampler_accept(smpl, id, true); const std::string token_str = llama_token_to_piece(ctx, id); @@ -240,10 +239,12 @@ int main(int argc, char ** argv){ LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ntarget:\n"); - llama_print_timings(ctx); + LOG_TEE("\ntarget:\n\n"); + llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + + gpt_sampler_free(smpl); - llama_sampling_free(ctx_sampling); llama_batch_free(batch_tgt); llama_free(ctx); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c55efbb66..b986a865a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,6 +1,7 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "sampling.h" #include "llama.h" #include @@ -33,6 +34,7 @@ static llama_context ** g_ctx; static llama_model ** g_model; +static gpt_sampler ** g_smpl; static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; @@ -40,6 +42,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + printf("\n"); +} + static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -92,7 +101,7 @@ static void write_logfile( yaml_dump_string_multiline(logfile, "output", output.c_str()); yaml_dump_vector_int(logfile, "output_tokens", output_tokens); - llama_dump_timing_info_yaml(logfile, ctx); + llama_perf_dump_yaml(logfile, ctx); fclose(logfile); } @@ -105,7 +114,7 @@ static void sigint_handler(int signo) { } else { console::cleanup(); printf("\n"); - llama_print_timings(*g_ctx); + gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); } @@ -121,8 +130,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { llama_chat_msg new_msg{role, content}; - auto formatted = llama_chat_format_single( - model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); LOG("formatted: %s\n", formatted.c_str()); return formatted; @@ -131,13 +139,11 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector chat_msgs; + g_model = &model; g_ctx = &ctx; + g_smpl = &smpl; // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); @@ -211,10 +213,6 @@ int main(int argc, char ** argv) { model = llama_init.model; ctx = llama_init.context; - if (sparams.cfg_scale > 1.f) { - struct llama_context_params lparams = 
llama_context_params_from_gpt_params(params); - ctx_guidance = llama_new_context_with_model(model, lparams); - } if (model == NULL) { LOG_TEE("%s: error: unable to load model\n", __func__); @@ -251,9 +249,6 @@ int main(int argc, char ** argv) { } llama_attach_threadpool(ctx, threadpool, threadpool_batch); - if (ctx_guidance) { - llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch); - } const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); @@ -337,24 +332,6 @@ int main(int argc, char ** argv) { } // Tokenize negative prompt - std::vector guidance_inp; - int guidance_offset = 0; - int original_prompt_len = 0; - if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); - - original_prompt_len = original_inp.size(); - guidance_offset = (int)guidance_inp.size() - original_prompt_len; - LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); - LOG("guidance_offset: %s", log_tostr(guidance_offset)); - } - if ((int) embd_inp.size() > n_ctx - 4) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; @@ -421,15 +398,6 @@ int main(int argc, char ** argv) { LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - if (ctx_guidance) { - LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); - LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); - for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); - } - } - if (params.n_keep > add_bos) { LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { @@ -495,8 +463,15 @@ int main(int argc, char ** argv) { } } } - LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); - LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); + + smpl = gpt_sampler_init(model, sparams); + if (!smpl) { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + + LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); + LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state @@ -543,7 +518,6 @@ int main(int argc, char ** argv) { int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; - int n_past_guidance = 0; std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; @@ -555,7 +529,6 @@ int main(int argc, char ** argv) { display = params.display_prompt; std::vector embd; - std::vector embd_guidance; // tokenized antiprompts std::vector> antiprompt_ids; @@ -565,12 +538,6 @@ int main(int argc, char ** argv) { antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); } - struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); - if 
(!ctx_sampling) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); - } - if (llama_model_has_encoder(model)) { int enc_input_size = embd_inp.size(); llama_token * enc_input_buf = embd_inp.data(); @@ -612,7 +579,7 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) >= n_ctx) { + if (n_past + (int) embd.size() >= n_ctx) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; @@ -629,11 +596,7 @@ int main(int argc, char ** argv) { n_past -= n_discard; - if (ctx_guidance) { - n_past_guidance -= n_discard; - } - - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + LOG("after swap: n_past = %d\n", n_past); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); @@ -686,46 +649,6 @@ int main(int argc, char ** argv) { } } - // evaluate tokens in batches - // embd is typically prepared beforehand to fit within a batch, but not always - if (ctx_guidance) { - int input_size = 0; - llama_token * input_buf = NULL; - - if (n_past_guidance < (int) guidance_inp.size()) { - // Guidance context should have the same data with these modifications: - // - // * Replace the initial prompt - // * Shift everything by guidance_offset - embd_guidance = guidance_inp; - if (embd.begin() + original_prompt_len < embd.end()) { - embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() - ); - } - - input_buf = embd_guidance.data(); - input_size = embd_guidance.size(); - - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); - } else { - input_buf = embd.data(); - input_size = embd.size(); - } - - for (int i = 0; i < input_size; i += params.n_batch) { - int n_eval = std::min(input_size - i, params.n_batch); - if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - return 1; - } - - n_past_guidance += n_eval; - } - } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -755,7 +678,6 @@ int main(int argc, char ** argv) { } embd.clear(); - embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // optionally save the session on first sample (for faster prompt loading next time) @@ -766,11 +688,11 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + const llama_token id = gpt_sampler_sample(smpl, ctx, -1); - llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true); + gpt_sampler_accept(smpl, id, /* apply_grammar= */ true); - LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); + // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -789,7 +711,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false); + gpt_sampler_accept(smpl, embd_inp[n_consumed], /* 
apply_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -832,7 +754,7 @@ int main(int argc, char ** argv) { // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { const int n_prev = 32; - const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); + const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -854,7 +776,7 @@ int main(int argc, char ** argv) { } // check for reverse prompt using special tokens - llama_token last_token = llama_sampling_last(ctx_sampling); + llama_token last_token = gpt_sampler_last(smpl); for (std::vector ids : antiprompt_ids) { if (ids.size() == 1 && last_token == ids[0]) { if (params.interactive) { @@ -871,7 +793,7 @@ int main(int argc, char ** argv) { } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { + if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { LOG("found an EOG token\n"); if (params.interactive) { @@ -892,7 +814,7 @@ int main(int argc, char ** argv) { // if current token is not EOG, we add it to current assistant message if (params.conversation) { - auto id = llama_sampling_last(ctx_sampling); + const auto id = gpt_sampler_last(smpl); assistant_ss << llama_token_to_piece(ctx, id, false); } @@ -988,7 +910,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - llama_sampling_reset(ctx_sampling); + gpt_sampler_reset(smpl); } is_interacting = false; } @@ -1013,14 +935,15 @@ int main(int argc, char ** argv) { llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - llama_print_timings(ctx); + LOG_TEE("\n"); + gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - if (ctx_guidance) { llama_free(ctx_guidance); } + gpt_sampler_free(smpl); + llama_free(ctx); llama_free_model(model); - llama_sampling_free(ctx_sampling); llama_backend_free(); ggml_threadpool_free(threadpool); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 621a1c959..bc6301311 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -1,7 +1,9 @@ // A basic application simulating a server with multiple clients. // The clients submit requests to the server and they are processed in parallel. 
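
infill, main, llava, lookahead, lookup and parallel all switch from `llama_sampling_*` to the `gpt_sampler` wrapper, where every generated token follows the same sample-then-accept step so repetition penalties and grammar state stay in sync. A condensed sketch of that step, assuming `smpl` came from `gpt_sampler_init(model, params.sparams)` and `ctx`/`model` are live:

```cpp
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <string>

// One generation step with the gpt_sampler API (sketch; error handling omitted).
static bool sample_one(gpt_sampler * smpl, llama_context * ctx, const llama_model * model,
                       std::string & out) {
    // -1 = sample from the logits of the last token that was decoded
    const llama_token id = gpt_sampler_sample(smpl, ctx, -1);

    // apply_grammar = true for generated tokens; prompt tokens are accepted with false
    gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);

    if (llama_token_is_eog(model, id)) {
        return false; // end of generation
    }

    out += llama_token_to_piece(ctx, id);
    return true;
}
```
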
+#include "arg.h" #include "common.h" +#include "sampling.h" #include "llama.h" #include @@ -50,8 +52,8 @@ static std::vector k_prompts = { struct client { ~client() { - if (ctx_sampling) { - llama_sampling_free(ctx_sampling); + if (smpl) { + gpt_sampler_free(smpl); } } @@ -72,7 +74,7 @@ struct client { std::string prompt; std::string response; - struct llama_sampling_context * ctx_sampling = nullptr; + struct gpt_sampler * smpl = nullptr; }; static void print_date_time() { @@ -100,8 +102,7 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } @@ -161,7 +162,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.ctx_sampling = llama_sampling_init(params.sparams); + client.smpl = gpt_sampler_init(model, params.sparams); } std::vector tokens_system; @@ -253,7 +254,7 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - llama_sampling_reset(client.ctx_sampling); + gpt_sampler_reset(client.smpl); // do not prepend BOS because we have a system prompt! std::vector tokens_prompt; @@ -341,9 +342,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i); + const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i); - llama_sampling_accept(client.ctx_sampling, ctx, id, true); + gpt_sampler_accept(client.smpl, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -371,7 +372,7 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); @@ -413,7 +414,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); - llama_print_timings(ctx); + // TODO: print sampling/grammar timings for all clients + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d03215cd1..d3d5ab46f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -6,9 +7,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); LOG_TEE("\n"); @@ -21,13 +20,10 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } - srand(params.seed == LLAMA_DEFAULT_SEED ? 
time(NULL) : params.seed); - int n_junk = params.n_junk; int n_keep = params.n_keep; int n_grp = params.grp_attn_n; @@ -80,12 +76,17 @@ int main(int argc, char ** argv) { GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); llama_context * ctx = llama_new_context_with_model(model, ctx_params); - if (ctx == NULL) { fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); return 1; } + auto sparams = llama_sampler_chain_default_params(); + + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + // tokenize the prompt std::vector tokens_list; tokens_list = ::llama_tokenize(ctx, params.prompt, true); @@ -217,20 +218,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_len) { // sample the next token { - auto n_vocab = llama_n_vocab(model); - auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); - - std::vector candidates; - candidates.reserve(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // sample the most likely token - const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { @@ -267,10 +255,13 @@ int main(int argc, char ** argv) { LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - llama_print_timings(ctx); + LOG_TEE("\n"); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); fprintf(stderr, "\n"); + llama_sampler_free(smpl); + llama_batch_free(batch); llama_free(ctx); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 484dd5891..c7d617988 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,18 +1,19 @@ +#include "arg.h" #include "common.h" #include "llama.h" +#include +#include #include #include #include #include +#include +#include +#include #include #include -#include -#include #include -#include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -76,7 +77,7 @@ static void write_logfile( fprintf(logfile, "ppl_value: %f\n", results.ppl_value); yaml_dump_vector_float(logfile, "probs", results.probs); - llama_dump_timing_info_yaml(logfile, ctx); + llama_perf_dump_yaml(logfile, ctx); fclose(logfile); } @@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; } @@ -2007,13 +2007,7 @@ int main(int argc, char ** argv) { print_build_info(); - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); + LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); llama_backend_init(); llama_numa_init(params.numa); @@ -2054,7 +2048,8 @@ int main(int argc, char ** argv) { results = perplexity(ctx, params, n_ctx); } - llama_print_timings(ctx); + LOG_TEE("\n"); + 
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 68cf8d359..498cbbe3c 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,7 +1,7 @@ -#define LLAMA_API_INTERNAL #include "common.h" #include "ggml.h" #include "llama.h" +#include "llama-impl.h" #include #include @@ -319,8 +319,7 @@ int main(int argc, char ** argv) { } auto cparams = llama_context_default_params(); - cparams.n_ctx = 256; - cparams.seed = 1; + cparams.n_ctx = 256; ctx = llama_new_context_with_model(model, cparams); diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..704f0d56b 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. +The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format. + *(outdated)* | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index aab9d8105..7a360b731 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,12 +1,11 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); LOG_TEE("\n"); @@ -113,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } @@ -293,9 +291,11 @@ int main(int argc, char ** argv) { } } + LOG_TEE("\n"); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + // clean up llama_batch_free(query_batch); - llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); llama_backend_free(); diff --git a/examples/rpc/README.md b/examples/rpc/README.md index adedc8909..312bb634d 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following ```mermaid flowchart TD - rpcb---|TCP|srva - rpcb---|TCP|srvb - rpcb-.-|TCP|srvn + rpcb<-->|TCP|srva + rpcb<-->|TCP|srvb + rpcb<-.->|TCP|srvn subgraph hostn[Host N] - srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] + srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"] end subgraph hostb[Host B] - srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] + srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"] end subgraph hosta[Host A] - srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] + 
srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"] end subgraph host[Main Host] - ggml[llama.cpp]---rpcb[RPC backend] + local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli] + ggml[llama-cli]<-->rpcb[RPC backend] end style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 ``` @@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: - -```bash -mkdir build-rpc -cd build-rpc -cmake .. -DGGML_RPC=ON -cmake --build . --config Release -``` - -Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: +On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options. +Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`: ```bash $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 ``` + +This way you can offload model layers to both local and remote devices. + diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3ea7c790d..0117d9357 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,17 +1,17 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include #include -#include int main(int argc, char ** argv) { gpt_params params; params.prompt = "The quick brown fox"; + params.sparams.seed = 1234; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -38,6 +38,13 @@ int main(int argc, char ** argv) { return 1; } + auto sparams = llama_sampler_chain_default_params(); + + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl, llama_sampler_init_softmax()); + llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); + // tokenize prompt auto tokens = llama_tokenize(ctx, params.prompt, true); @@ -64,16 +71,7 @@ int main(int argc, char ** argv) { printf("\nfirst run: %s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { - auto * logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(model); - - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - auto next_token = llama_sample_token(ctx, &candidates_p); + auto next_token = llama_sampler_sample(smpl, ctx, -1); auto next_token_str = llama_token_to_piece(ctx, next_token); printf("%s", next_token_str.c_str()); @@ -96,6 +94,11 @@ int main(int argc, char ** argv) { // make new context auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + llama_sampler * smpl2 = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl2, llama_sampler_init_softmax()); + llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed)); + printf("\nsecond run: %s", params.prompt.c_str()); // load state (rng, logits, embedding and kv_cache) from file @@ -124,15 +127,7 @@ int main(int argc, char ** argv) { // second run for (auto i = 
0; i < params.n_predict; i++) { - auto * logits = llama_get_logits(ctx2); - auto n_vocab = llama_n_vocab(model); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - auto next_token = llama_sample_token(ctx2, &candidates_p); + auto next_token = llama_sampler_sample(smpl2, ctx2, -1); auto next_token_str = llama_token_to_piece(ctx2, next_token); printf("%s", next_token_str.c_str()); @@ -157,7 +152,12 @@ int main(int argc, char ** argv) { } // make new context - auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + + llama_sampler * smpl3 = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl3, llama_sampler_init_softmax()); + llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed)); printf("\nsingle seq run: %s", params.prompt.c_str()); @@ -215,15 +215,7 @@ int main(int argc, char ** argv) { // third run with seq 1 instead of 0 for (auto i = 0; i < params.n_predict; i++) { - auto * logits = llama_get_logits(ctx3); - auto n_vocab = llama_n_vocab(model); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - auto next_token = llama_sample_token(ctx3, &candidates_p); + auto next_token = llama_sampler_sample(smpl3, ctx3, -1); auto next_token_str = llama_token_to_piece(ctx3, next_token); printf("%s", next_token_str.c_str()); @@ -240,6 +232,10 @@ int main(int argc, char ** argv) { printf("\n"); + llama_sampler_free(smpl); + llama_sampler_free(smpl2); + llama_sampler_free(smpl3); + llama_free(ctx3); llama_free_model(model); diff --git a/examples/server/README.md b/examples/server/README.md index 805e05b4a..79196e9c1 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -17,262 +17,126 @@ The project is under active development, and we are [looking for feedback and co ## Usage -``` -usage: ./llama-server [options] +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `-v, --verbose` | print verbose information | +| `--verbosity N` | set specific verbosity level (default: 0) | +| `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)
| +| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)
| +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | +| `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | +| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | +| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | +| `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) |
+| `-p, --prompt PROMPT` | prompt to start generation with |
+| `-f, --file FNAME` | a file containing the prompt (default: none) |
+| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
+| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
+| `--no-escape` | do not process escape sequences |
+| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle), as some models prefer this (default: disabled) |
+| `--samplers SAMPLERS` | samplers that will be used for generation, in order, separated by ';'<br/>
(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | +| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--penalize-nl` | penalize newline tokens (default: false) | +| `--temp N` | temperature (default: 0.8) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | +| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar-file FNAME` | file to read grammar from | +| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) | +| `-gan, --grp-attn-n N` | group-attention factor (default: 1) | +| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) | +| `-dkvc, --dump-kv-cache` | verbose print of the KV cache | +| `-nkvo, --no-kv-offload` | disable KV offload | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1) | +| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | +| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) |
+| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
+| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
+| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if the program was run previously without this option, it is recommended to drop the system page cache before using it<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | +| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | +| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | +| `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | +| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-a, --alias STRING` | set alias for model name (to be used by REST API) | +| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)
(env: LLAMA_ARG_HF_FILE) | +| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | +| `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | +| `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | +| `--path PATH` | path to serve static files from (default: ) | +| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | +| `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) |
+| `--api-key-file FNAME` | path to file containing API keys (default: none) |
+| `--ssl-key-file FNAME` | path to a file containing a PEM-encoded SSL private key |
+| `--ssl-cert-file FNAME` | path to a file containing a PEM-encoded SSL certificate |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
+| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
+| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications |
+| `--log-format {text, json}` | log output format: json or text (default: json) |
+| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
+| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/>
| +| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | +| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) | +| `--log-test` | Log test | +| `--log-disable` | Log disable | +| `--log-enable` | Log enable | +| `--log-new` | Log new | +| `--log-append` | Log append | +| `--log-file FNAME` | Log file | -general: - - -h, --help, --usage print usage and exit - --version show version and build info - -v, --verbose print verbose information - --verbosity N set specific verbosity level (default: 0) - --verbose-prompt print a verbose prompt before generation (default: false) - --no-display-prompt don't print prompt at generation (default: false) - -co, --color colorise output to distinguish prompt and user input from generations (default: false) - -s, --seed SEED RNG seed (default: -1, use random seed for < 0) - -t, --threads N number of threads to use during generation (default: 8) - -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) - -td, --threads-draft N number of threads to use during generation (default: same as --threads) - -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) - --draft N number of tokens to draft for speculative decoding (default: 5) - -ps, --p-split N speculative decoding split probability (default: 0.1) - -lcs, --lookup-cache-static FNAME - path to static lookup cache to use for lookup decoding (not updated by generation) - -lcd, --lookup-cache-dynamic FNAME - path to dynamic lookup cache to use for lookup decoding (updated by generation) - -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) - -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) - -b, --batch-size N logical maximum batch size (default: 2048) - -ub, --ubatch-size N physical maximum batch size (default: 512) - --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) - --chunks N max number of chunks to process (default: -1, -1 = all) - -fa, --flash-attn enable Flash Attention (default: disabled) - -p, --prompt PROMPT prompt to start generation with - in conversation mode, this will be used as system prompt - (default: '') - -f, --file FNAME a file containing the prompt (default: none) - --in-file FNAME an input file (repeat to specify multiple files) - -bf, --binary-file FNAME binary file containing the prompt (default: none) - -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) - --no-escape do not process escape sequences - -ptc, --print-token-count N print token count every N tokens (default: -1) - --prompt-cache FNAME file to cache prompt state for faster startup (default: none) - --prompt-cache-all if specified, saves user input and generations to cache as well - not supported with --interactive or other interactive options - --prompt-cache-ro if specified, uses the prompt cache but does not update it - -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode - can be specified more than once for multiple prompts - -sp, --special special tokens output enabled (default: false) - -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix - if suffix/prefix are not specified, default chat template will be used - (default: false) - -i, --interactive run in interactive 
mode (default: false) - -if, --interactive-first run in interactive mode and wait for input right away (default: false) - -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' - --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string - --in-prefix STRING string to prefix user inputs with (default: empty) - --in-suffix STRING string to suffix after user inputs with (default: empty) - --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) - -sampling: - - --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' - (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) - --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) - --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) - --penalize-nl penalize newline tokens (default: false) - --temp N temperature (default: 0.8) - --top-k N top-k sampling (default: 40, 0 = disabled) - --top-p N top-p sampling (default: 0.9, 1.0 = disabled) - --min-p N min-p sampling (default: 0.1, 0.0 = disabled) - --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) - --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) - --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) - --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) - --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) - --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) - --dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) - --dynatemp-exp N dynamic temperature exponent (default: 1.0) - --mirostat N use Mirostat sampling. - Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. - (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) - --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) - --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) - -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, - i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', - or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' - --cfg-negative-prompt PROMPT - negative prompt to use for guidance (default: '') - --cfg-negative-prompt-file FNAME - negative prompt file to use for guidance - --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) - --chat-template JINJA_TEMPLATE - set custom jinja chat template (default: template taken from model's metadata) - if suffix/prefix are specified, template will be disabled - only commonly used templates are accepted: - https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template - -grammar: - - --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') - --grammar-file FNAME file to read grammar from - -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object - For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead - -embedding: - - --pooling {none,mean,cls,last} - pooling type for embeddings, use model default if unspecified - --attention {causal,non-causal} - attention type for embeddings, use model default if unspecified - -context hacking: - - --rope-scaling {none,linear,yarn} - RoPE frequency scaling method, defaults to linear unless specified by the model - --rope-scale N RoPE context scaling factor, expands context by a factor of N - --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) - --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N - --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) - --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) - --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) - --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) - --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) - -gan, --grp-attn-n N group-attention factor (default: 1) - -gaw, --grp-attn-w N group-attention width (default: 512.0) - -dkvc, --dump-kv-cache verbose print of the KV cache - -nkvo, --no-kv-offload disable KV offload - -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) - -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) - -perplexity: - - --all-logits return logits for all tokens in the batch (default: false) - --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f - --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) - --winogrande compute Winogrande score over random tasks from datafile supplied with -f - --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) - --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f - --multiple-choice-tasks N - number of tasks to use when computing the multiple choice score (default: 0) - --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base - --ppl-stride N stride for perplexity calculation (default: 0) - --ppl-output-type {0,1} output type for perplexity calculation (default: 0) - -parallel: - - -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) - -np, --parallel N number of parallel sequences to decode (default: 1) - -ns, --sequences N number of sequences to decode (default: 1) - -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) - -multi-modality: - - --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md - --image FILE path to an image file. use with multimodal models. 
Specify multiple times for batching - -backend: - - --rpc SERVERS comma separated list of RPC servers - --mlock force system to keep model in RAM rather than swapping or compressing - --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) - --numa TYPE attempt optimizations that help on some NUMA systems - - distribute: spread execution evenly over all nodes - - isolate: only spawn threads on CPUs on the node that execution started on - - numactl: use the CPU map provided by numactl - if run without this previously, it is recommended to drop the system page cache before using this - see https://github.com/ggerganov/llama.cpp/issues/1437 - -model: - - --check-tensors check model tensor data for invalid values (default: false) - --override-kv KEY=TYPE:VALUE - advanced option to override model metadata by key. may be specified multiple times. - types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false - --lora FNAME apply LoRA adapter (implies --no-mmap) - --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) - --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter - --control-vector FNAME add a control vector - note: this argument can be repeated to add multiple control vectors - --control-vector-scaled FNAME SCALE - add a control vector with user defined scaling SCALE - note: this argument can be repeated to add multiple scaled control vectors - --control-vector-layer-range START END - layer range to apply the control vector(s) to, start and end inclusive - -m, --model FNAME model path (default: models/$filename with filename from --hf-file - or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) - -md, --model-draft FNAME draft model for speculative decoding (default: unused) - -mu, --model-url MODEL_URL model download url (default: unused) - -hfr, --hf-repo REPO Hugging Face model repository (default: unused) - -hff, --hf-file FILE Hugging Face model file (default: unused) - -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) - -server: - - --host HOST ip address to listen (default: 127.0.0.1) - --port PORT port to listen (default: 8080) - --path PATH path to serve static files from (default: ) - --embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled) - --api-key KEY API key to use for authentication (default: none) - --api-key-file FNAME path to file containing API keys (default: none) - --ssl-key-file FNAME path to file a PEM-encoded SSL private key - --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate - --timeout N server read/write timeout in seconds (default: 600) - --threads-http N number of threads used to process HTTP requests (default: -1) - --system-prompt-file FNAME - set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications - --log-format {text,json} - log output format: json or text (default: json) - --metrics enable prometheus compatible metrics endpoint (default: disabled) - --no-slots disables slots monitoring endpoint (default: enabled) - --slot-save-path PATH path to save slot kv cache (default: disabled) - --chat-template JINJA_TEMPLATE - set custom jinja chat template (default: template taken from model's metadata) - only commonly used templates are accepted: - 
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template - -sps, --slot-prompt-similarity SIMILARITY - how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) - --lora-init-without-apply - load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) - -logging: - - --simple-io use basic IO for better compatibility in subprocesses and limited consoles - -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) - --log-test Run simple logging test - --log-disable Disable trace logs - --log-enable Enable trace logs - --log-file FNAME Specify a log filename (without extension) - --log-new Create a separate new log file on start. Each log file will have unique name: "..log" - --log-append Don't truncate the old log file. -``` - -Available environment variables (if specified, these variables will override parameters specified in arguments): - -- `LLAMA_CACHE`: cache directory, used by `--hf-repo` -- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo` -- `LLAMA_ARG_MODEL`: equivalent to `-m` -- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu` -- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a` -- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo` -- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file` -- `LLAMA_ARG_THREADS`: equivalent to `-t` -- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c` -- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np` -- `LLAMA_ARG_BATCH`: equivalent to `-b` -- `LLAMA_ARG_UBATCH`: equivalent to `-ub` -- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl` -- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http` -- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template` -- `LLAMA_ARG_N_PREDICT`: equivalent to `-n` -- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`) -- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default. -- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`) -- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`) -- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default. -- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt` -- `LLAMA_ARG_HOST`: equivalent to `--host` -- `LLAMA_ARG_PORT`: equivalent to `--port` +Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Example usage of docker compose with environment variables: @@ -289,7 +153,7 @@ services: LLAMA_ARG_MODEL: /models/my_model.gguf LLAMA_ARG_CTX_SIZE: 4096 LLAMA_ARG_N_PARALLEL: 2 - LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0 + LLAMA_ARG_ENDPOINT_METRICS: 1 LLAMA_ARG_PORT: 8080 ``` @@ -470,8 +334,6 @@ node index.js `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. - `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`. - `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. 
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` @@ -724,7 +586,6 @@ Example: "stopping_word": "" }, "penalize_nl": true, - "penalty_prompt_tokens": [], "presence_penalty": 0.0, "prompt": "Say hello to llama.cpp", "repeat_last_n": 64, @@ -748,8 +609,7 @@ Example: "tfs_z": 1.0, "top_k": 40, "top_p": 0.949999988079071, - "typical_p": 1.0, - "use_penalty_prompt_tokens": false + "typical_p": 1.0 } ] ``` diff --git a/examples/server/server.cpp b/examples/server/server.cpp index cc65c57ab..7495821f9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,9 +1,10 @@ #include "utils.hpp" +#include "arg.h" #include "common.h" +#include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" -#include "grammar-parser.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -169,11 +170,13 @@ struct server_slot { std::string stopping_word; // sampling - llama_token sampled; - struct llama_sampling_params sparams; - llama_sampling_context * ctx_sampling = nullptr; json json_schema; + struct gpt_sampler_params sparams; + struct gpt_sampler * smpl = nullptr; + + llama_token sampled; + int32_t ga_i = 0; // group-attention state int32_t ga_n = 1; // group-attention factor int32_t ga_w = 512; // group-attention width @@ -612,7 +615,7 @@ struct server_context { gpt_params params; - llama_batch batch; + llama_batch batch = {}; bool clean_kv_cache = true; bool add_bos_token = true; @@ -651,8 +654,8 @@ struct server_context { // Clear any sampling context for (server_slot & slot : slots) { - if (slot.ctx_sampling != nullptr) { - llama_sampling_free(slot.ctx_sampling); + if (slot.smpl != nullptr) { + gpt_sampler_free(slot.smpl); } } @@ -883,8 +886,8 @@ struct server_context { bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - llama_sampling_params default_sparams = params.sparams; - auto & data = task.data; + auto default_sparams = params.sparams; + const auto & data = task.data; if (data.count("__oaicompat") != 0) { slot.oaicompat = true; @@ -901,7 +904,7 @@ struct server_context { slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); - slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); + slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range); slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent); @@ -923,7 +926,8 @@ struct server_context { if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); return false; - } else if (data.contains("json_schema") && !data.contains("grammar")) { + } + if (data.contains("json_schema") && !data.contains("grammar")) { try { auto schema = json_value(data, "json_schema", json::object()); slot.sparams.grammar = json_schema_to_grammar(schema); @@ -973,56 +977,11 @@ struct server_context { } } 
- // penalize user-provided tokens - { - slot.sparams.penalty_prompt_tokens.clear(); - slot.sparams.use_penalty_prompt_tokens = false; - - const auto & penalty_prompt = data.find("penalty_prompt"); - - if (penalty_prompt != data.end()) { - if (penalty_prompt->is_string()) { - const auto penalty_prompt_string = penalty_prompt->get(); - slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false); - - if (slot.params.n_predict > 0) { - slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict); - } - slot.sparams.use_penalty_prompt_tokens = true; - - LOG_VERBOSE("penalty_prompt_tokens", { - {"id_slot", slot.id}, - {"tokens", slot.sparams.penalty_prompt_tokens}, - }); - } - else if (penalty_prompt->is_array()) { - const auto n_tokens = penalty_prompt->size(); - slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict)); - - const int n_vocab = llama_n_vocab(model); - for (const auto & penalty_token : *penalty_prompt) { - if (penalty_token.is_number_integer()) { - const auto tok = penalty_token.get(); - if (tok >= 0 && tok < n_vocab) { - slot.sparams.penalty_prompt_tokens.push_back(tok); - } - } - } - slot.sparams.use_penalty_prompt_tokens = true; - - LOG_VERBOSE("penalty_prompt_tokens", { - {"id_slot", slot.id}, - {"tokens", slot.sparams.penalty_prompt_tokens}, - }); - } - } - } - { slot.sparams.logit_bias.clear(); if (json_value(data, "ignore_eos", false) && has_eos_token) { - slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; + slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); } const auto & logit_bias = data.find("logit_bias"); @@ -1043,12 +1002,12 @@ struct server_context { if (el[0].is_number_integer()) { llama_token tok = el[0].get(); if (tok >= 0 && tok < n_vocab) { - slot.sparams.logit_bias[tok] = bias; + slot.sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) { auto toks = llama_tokenize(model, el[0].get(), false); for (auto tok : toks) { - slot.sparams.logit_bias[tok] = bias; + slot.sparams.logit_bias.push_back({tok, bias}); } } } @@ -1070,26 +1029,27 @@ struct server_context { } { - const auto & samplers_sequence = data.find("samplers"); - if (samplers_sequence != data.end() && samplers_sequence->is_array()) { + const auto & samplers = data.find("samplers"); + if (samplers != data.end() && samplers->is_array()) { std::vector sampler_names; - for (const auto & sampler_name : *samplers_sequence) { - if (sampler_name.is_string()) { - sampler_names.emplace_back(sampler_name); + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); } } - slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false); + slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); } else { - slot.sparams.samplers_sequence = default_sparams.samplers_sequence; + slot.sparams.samplers = default_sparams.samplers; } } { - if (slot.ctx_sampling != nullptr) { - llama_sampling_free(slot.ctx_sampling); + if (slot.smpl != nullptr) { + gpt_sampler_free(slot.smpl); } - slot.ctx_sampling = llama_sampling_init(slot.sparams); - if (slot.ctx_sampling == nullptr) { + + slot.smpl = gpt_sampler_init(model, slot.sparams); + if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); return false; @@ -1178,11 +1138,6 @@ struct server_context { slot.generated_text += 
token_str; slot.has_next_token = true; - if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) { - // we can change penalty_prompt_tokens because it is always created from scratch each request - slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); - } - // check if there is incomplete UTF-8 character at the end bool incomplete = false; for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { @@ -1300,13 +1255,10 @@ struct server_context { } json get_formated_generation(const server_slot & slot) const { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); - - std::vector samplers_sequence; - samplers_sequence.reserve(slot.sparams.samplers_sequence.size()); - for (const auto & sampler_type : slot.sparams.samplers_sequence) { - samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type)); + std::vector samplers; + samplers.reserve(slot.sparams.samplers.size()); + for (const auto & sampler : slot.sparams.samplers) { + samplers.emplace_back(gpt_sampler_type_to_str(sampler)); } return json { @@ -1321,13 +1273,11 @@ struct server_context { {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, + {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1336,13 +1286,13 @@ struct server_context { {"max_tokens", slot.params.n_predict}, // User configured n_predict {"n_keep", slot.params.n_keep}, {"n_discard", slot.params.n_discard}, - {"ignore_eos", ignore_eos}, + {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, + //{"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, - {"samplers", samplers_sequence} + {"samplers", samplers}, }; } @@ -2136,7 +2086,7 @@ struct server_context { GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - llama_sampling_reset(slot.ctx_sampling); + gpt_sampler_reset(slot.smpl); if (!slot.params.cache_prompt) { slot.n_past_se = 0; @@ -2149,7 +2099,7 @@ struct server_context { // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { - llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); + gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false); } } } @@ -2202,7 +2152,7 @@ struct server_context { slot.n_past_se = 0; slot.ga_i = 0; // TODO: is the system prompt ever in the sampling context? 
- llama_sampling_reset(slot.ctx_sampling); + gpt_sampler_reset(slot.smpl); } // remove the non-common part from the cache @@ -2375,18 +2325,18 @@ struct server_context { slot.release(); slot.i_batch = -1; continue; // continue loop of slots - } else { - // prompt evaluated for next-token prediction - slot.state = SLOT_STATE_GENERATING; } + + // prompt evaluated for next-token prediction + slot.state = SLOT_STATE_GENERATING; } else if (slot.state != SLOT_STATE_GENERATING) { continue; // continue loop of slots } completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i); + const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); + gpt_sampler_accept(slot.smpl, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) { @@ -2395,34 +2345,15 @@ struct server_context { metrics.on_prompt_eval(slot); } - llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; result.tok = id; - const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs); - if (n_probs > 0) { - const size_t n_valid = slot.ctx_sampling->n_valid; + const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); - // Make sure at least n_probs top tokens are at the front of the vector: - if (slot.sparams.temp == 0.0f && n_probs > n_valid) { - llama_sample_top_k(ctx, &cur_p, n_probs, 0); - } - - if (slot.sparams.temp == 0.0f) { - // With greedy sampling the probabilities have possibly not been calculated. - for (size_t i = 0; i < n_probs; ++i) { - result.probs.push_back({ - cur_p.data[i].id, - i == 0 ? 1.0f : 0.0f - }); - } - } else { - for (size_t i = 0; i < n_probs; ++i) { - result.probs.push_back({ - cur_p.data[i].id, - i >= n_valid ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability. - }); - } - } + for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 
0.0f : cur_p->data[i].p, + }); } if (!process_token(result, slot)) { @@ -2494,14 +2425,10 @@ int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } - // parse arguments from environment variables - gpt_params_parse_from_env(params); - // TODO: not great to use extern vars server_log_json = params.log_json; server_verbose = params.verbosity > 0; diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 6f163ce04..e1eade6cd 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -9,8 +9,11 @@ Feature: llama.cpp server And a model alias bert-bge-small And 42 as server seed And 2 slots - And 1024 as batch size - And 1024 as ubatch size + # the bert-bge-small model has context size of 512 + # since the generated prompts are as big as the batch size, we need to set the batch size to 512 + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20 + And 512 as batch size + And 512 as ubatch size And 2048 KV cache size And embeddings extraction Then the server is starting diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 69a92cf7d..3fdc04394 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -6,9 +7,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); LOG_TEE("\n"); @@ -20,8 +19,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } @@ -55,6 +53,14 @@ int main(int argc, char ** argv) { return 1; } + auto sparams = llama_sampler_chain_default_params(); + + sparams.no_perf = false; + + llama_sampler * smpl = llama_sampler_chain_init(sparams); + + llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + // tokenize the prompt std::vector tokens_list; @@ -110,20 +116,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // sample the next token { - auto n_vocab = llama_n_vocab(model); - auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); - - std::vector candidates; - candidates.reserve(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // sample the most likely token - const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); // is it an end of generation? 
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { @@ -160,12 +153,14 @@ int main(int argc, char ** argv) { LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - llama_print_timings(ctx); + LOG_TEE("\n"); + llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); fprintf(stderr, "\n"); llama_batch_free(batch); - + llama_sampler_free(smpl); llama_free(ctx); llama_free_model(model); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 1616edecb..214e4932b 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,11 +1,13 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" #include "llama.h" -#include #include #include #include #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -21,14 +23,13 @@ struct seq_draft { std::vector tokens; std::vector> dists; - struct llama_sampling_context * ctx_sampling; + struct gpt_sampler * smpl = nullptr; }; int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; } @@ -43,10 +44,7 @@ int main(int argc, char ** argv) { // probability threshold for splitting a draft branch (only for n_seq_dft > 1) const float p_split = params.p_split; - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - std::default_random_engine rng(params.seed); + std::default_random_engine rng(params.sparams.seed); std::uniform_real_distribution<> u_dist; #ifndef LOG_DISABLE_LOGS @@ -179,19 +177,17 @@ int main(int argc, char ** argv) { // used to determine end of generation bool has_eos = false; - // target model sampling context - struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams); + // target model sampling context (reuse the llama_context's sampling instance) + struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams); + + struct llama_sampler * softmax = llama_sampler_init_softmax(); // draft sequence data std::vector drafts(n_seq_dft); - params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar - if (params.sparams.temp == 0) { - params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model - } - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].ctx_sampling = llama_sampling_init(params.sparams); + // allocate gpt_sampler for each draft sequence + drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -233,12 +229,12 @@ int main(int argc, char ** argv) { bool accept = false; if (params.sparams.temp > 0) { // stochastic verification + gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); - llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL); - llama_sample_softmax(ctx_tgt, &dist_tgt); - float p_tgt = 0, p_dft = 0; + auto & dist_tgt = *gpt_sampler_get_candidates(smpl); - // GGML_ASSERT(dist_tgt.size() == dist_dft.size()); + float p_tgt = 0.0f; + float p_dft = 0.0f; while (active_seqs.size() > 0) { // randomly select a sequence to verify from active sequences @@ -257,9 +253,13 @@ 
int main(int argc, char ** argv) { } continue; } + LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); float r = u_dist(rng); - llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true }; + llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true }; + + //GGML_ASSERT(dist_tgt.size <= dist_dft.size); + // acquire the token probabilities assigned by the draft and target models for (size_t i = 0; i < dist_tgt.size; i++) { if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) { @@ -278,7 +278,7 @@ int main(int argc, char ** argv) { accept = true; token_id = drafts[s].tokens[i_dft]; token_str = llama_token_to_piece(ctx_tgt, token_id); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + gpt_sampler_accept(smpl, token_id, true); LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; @@ -289,7 +289,6 @@ int main(int argc, char ** argv) { // calculate residual probability GGML_ASSERT(dist_tgt.sorted); GGML_ASSERT(dist_dft.sorted); - float sum_probs = 0.0f; // sort dist by id std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { @@ -299,10 +298,18 @@ int main(int argc, char ** argv) { return a.id < b.id; }); + float sum_probs = 0.0f; + for (size_t i = 0; i < dist_tgt.size; i++) { - dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p); + if (i < dist_dft.size) { + dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p); + } else { + dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p); + } + sum_probs += dist_tgt.data[i].p; } + for (size_t i = 0; i < dist_tgt.size; i++) { dist_tgt.data[i].p /= sum_probs; } @@ -332,21 +339,29 @@ int main(int argc, char ** argv) { // all drafted tokens were rejected // sample from the target model LOG("all drafted tokens were rejected, sampling from residual distribution\n"); - token_id = llama_sample_token(ctx_tgt, &dist_tgt); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + std::vector probs(dist_tgt.size); + for (size_t i = 0; i < dist_tgt.size; ++i) { + probs[i] = dist_tgt.data[i].p; + } + + std::discrete_distribution<> dist(probs.begin(), probs.end()); + + const int idx = dist(rng); + + token_id = dist_tgt.data[idx].id; + gpt_sampler_accept(smpl, token_id, true); token_str = llama_token_to_piece(ctx_tgt, token_id); } - } else { // greedy verification // sample from the target model LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); - llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true); + gpt_sampler_accept(smpl, token_id, true); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str()); + //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str()); token_str = llama_token_to_piece(ctx_tgt, token_id); @@ -434,7 +449,10 @@ int main(int argc, char ** argv) { break; } - llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling); + if (drafts[0].smpl) { + gpt_sampler_free(drafts[0].smpl); + } + drafts[0].smpl = gpt_sampler_clone(smpl); int n_seq_cur = 1; int n_past_cur = n_past_dft; @@ -463,20 +481,20 @@ int main(int argc, 
char ** argv) { continue; } - llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft); + gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); - const auto & cur_p = drafts[s].ctx_sampling->cur; + const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) { + for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str()); + k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } std::vector sa(1, s); // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) { + if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { LOG("splitting seq %3d into %3d\n", s, n_seq_cur); llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); @@ -503,7 +521,10 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; - llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling); + if (drafts[n_seq_cur].smpl) { + gpt_sampler_free(drafts[n_seq_cur].smpl); + } + drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); sa.push_back(n_seq_cur); @@ -515,15 +536,15 @@ int main(int argc, char ** argv) { // add drafted token for each sequence for (int is = 0; is < (int) sa.size(); ++is) { - const llama_token id = cur_p[is].id; + const llama_token id = cur_p->data[is].id; const int s = sa[is]; - llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true); + gpt_sampler_accept(drafts[s].smpl, id, true); drafts[s].tokens.push_back(id); // save cur_p.data into drafts[s].dists - drafts[s].dists.push_back(cur_p); + drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size}); // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); @@ -593,17 +614,19 @@ int main(int argc, char ** argv) { LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ndraft:\n"); - llama_print_timings(ctx_dft); + LOG_TEE("\ndraft:\n\n"); + // TODO: print sampling/grammar timings for all drafts + llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); - LOG_TEE("\ntarget:\n"); - llama_print_timings(ctx_tgt); + LOG_TEE("\ntarget:\n\n"); + gpt_perf_print(ctx_tgt, smpl); - llama_sampling_free(ctx_sampling); + gpt_sampler_free(smpl); for (int s = 0; s < n_seq_dft; ++s) { - llama_sampling_free(drafts[s].ctx_sampling); + gpt_sampler_free(drafts[s].smpl); } + llama_sampler_free(softmax); llama_batch_free(batch_dft); llama_free(ctx_tgt); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 09c72b095..536018b66 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -681,8 +681,8 @@ extern "C" { struct ggml_hash_set { size_t size; - ggml_bitset_t * used; - struct ggml_tensor ** keys; + ggml_bitset_t * used; // whether or not the keys are in use i.e. 
set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) }; // computation graph @@ -1272,7 +1272,7 @@ extern "C" { size_t nb1, size_t nb2, size_t nb3, - size_t offset); + size_t offset); // in bytes // b -> view(a,offset,nb1,nb2,3), return view(a) GGML_API struct ggml_tensor * ggml_set_inplace( @@ -1282,19 +1282,19 @@ extern "C" { size_t nb1, size_t nb2, size_t nb3, - size_t offset); + size_t offset); // in bytes GGML_API struct ggml_tensor * ggml_set_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - size_t offset); + size_t offset); // in bytes GGML_API struct ggml_tensor * ggml_set_1d_inplace( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - size_t offset); + size_t offset); // in bytes // b -> view(a,offset,nb1,nb2,3), return modified a GGML_API struct ggml_tensor * ggml_set_2d( @@ -1302,7 +1302,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, - size_t offset); + size_t offset); // in bytes // b -> view(a,offset,nb1,nb2,3), return view(a) GGML_API struct ggml_tensor * ggml_set_2d_inplace( @@ -1310,7 +1310,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, - size_t offset); + size_t offset); // in bytes // a -> b, return view(b) GGML_API struct ggml_tensor * ggml_cpy( diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 239ed4553..4524820a5 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -832,6 +832,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + case GGML_OP_ROPE_BACK: + return op->src[2] == NULL && (op->op_params[2] & 4) == 0; + case GGML_OP_IM2COL_BACK: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: return true; } diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile index 2b009e8f9..3290a4859 100644 --- a/ggml/src/ggml-cann/Doxyfile +++ b/ggml/src/ggml-cann/Doxyfile @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "llama.cpp" +PROJECT_NAME = "ggml" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -44,7 +44,7 @@ PROJECT_NUMBER = # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -PROJECT_BRIEF = "llama inference engine" +PROJECT_BRIEF = "Tensor library for machine learning" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. 
The maximum height of the logo should not exceed 55 diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index d33988d02..d53de4edd 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -27,6 +27,7 @@ #include "ggml-cuda/rope.cuh" #include "ggml-cuda/scale.cuh" #include "ggml-cuda/softmax.cuh" +#include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" @@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_dup(ctx, dst); break; case GGML_OP_ADD: + case GGML_OP_ADD1: // TODO: more efficient implementation ggml_cuda_op_add(ctx, dst); break; case GGML_OP_SUB: @@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg break; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_NEG: + ggml_cuda_op_neg(ctx, dst); + break; case GGML_UNARY_OP_GELU: ggml_cuda_op_gelu(ctx, dst); break; @@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_POOL_2D: ggml_cuda_op_pool2d(ctx, dst); break; + case GGML_OP_SUM: + ggml_cuda_op_sum(ctx, dst); + break; case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; @@ -2544,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); @@ -2748,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: @@ -2877,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_TRANSPOSE: case GGML_OP_NORM: case GGML_OP_ADD: + case GGML_OP_ADD1: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: @@ -2887,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: + return true; case GGML_OP_CONT: + return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_2D: + case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index a14043e70..5575a90f6 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -1,6 +1,6 @@ #include "common.cuh" #include "cross-entropy-loss.cuh" -#include "sumrows.cuh" +#include "sum.cuh" #include #include @@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * 
cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); // Combine results from individual blocks: - sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream); + sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream); } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index f87f33b3e..f28a19d40 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g } \ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * Q = dst->src[1]; + ggml_tensor * Q = dst->src[0]; ggml_tensor * K = dst->src[1]; ggml_tensor * V = dst->src[2]; @@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg } \ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * Q = dst->src[1]; + ggml_tensor * Q = dst->src[0]; ggml_tensor * K = dst->src[1]; ggml_tensor * V = dst->src[2]; diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu new file mode 100644 index 000000000..21da63509 --- /dev/null +++ b/ggml/src/ggml-cuda/sum.cu @@ -0,0 +1,43 @@ +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. +// For this reason CUB must be included BEFORE anything else. +#include <cub/cub.cuh> +using namespace cub; +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + +#include "sumrows.cuh" +#include "sum.cuh" + +#include <cstdint> + +void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) { +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + size_t tmp_size = 0; + DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream); + ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size); + DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream); +#else + // Use (inefficient) sum_rows implementation as a fallback. + // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
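// The two DeviceReduce::Sum calls above follow CUB's standard two-pass pattern: the first call, given a
// null temp-storage pointer, only writes the required scratch size into tmp_size; the second call performs
// the actual reduction using scratch memory taken from the ggml CUDA pool. A minimal standalone sketch of
// the same pattern (illustrative only; it uses plain cudaMalloc instead of the pool allocator):
//
//     size_t tmp_size = 0;
//     cub::DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream); // 1st pass: query scratch size
//     void * tmp = nullptr;
//     cudaMalloc(&tmp, tmp_size);
//     cub::DeviceReduce::Sum(tmp, tmp_size, x, dst, ne, stream);     // 2nd pass: run the reduction
//     cudaFree(tmp);
//
// The fallback below treats x as a single row of ne elements, so the per-row sum computed by
// sum_rows_f32_cuda is the total sum written to dst.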
+ sum_rows_f32_cuda(x, dst, ne, 1, stream); + GGML_UNUSED(pool); +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +} + +void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + + const int64_t ne = ggml_nelements(src0); + + ggml_cuda_pool & pool = ctx.pool(); + cudaStream_t stream = ctx.stream(); + + sum_f32_cuda(pool, src0_d, dst_d, ne, stream); +} diff --git a/ggml/src/ggml-cuda/sum.cuh b/ggml/src/ggml-cuda/sum.cuh new file mode 100644 index 000000000..8cadc3736 --- /dev/null +++ b/ggml/src/ggml-cuda/sum.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream); + +void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 89abfc21d..8ac669f94 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -1,5 +1,15 @@ #include "unary.cuh" +static __global__ void neg_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = -x[i]; +} + static __global__ void gelu_f32(const float * x, float * dst, const int k) { const float GELU_COEF_A = 0.044715f; const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) { dst[i] = cosf(x[i]); } +static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE; + neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k); +} + static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k); @@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k); } +void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index c610e996a..ed2ffc461 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -1,5 +1,6 @@ #include "common.cuh" +#define CUDA_NEG_BLOCK_SIZE 256 #define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256 @@ -12,6 +13,8 @@ #define CUDA_SIN_BLOCK_SIZE 256 #define CUDA_COS_BLOCK_SIZE 256 +void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff
--git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 91b5e61b2..6d8a7c898 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -17,8 +17,8 @@ #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif @@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx return ctx->support_simdgroup_reduction; case GGML_OP_NORM: case GGML_OP_ROPE: - case GGML_OP_IM2COL: return true; + case GGML_OP_IM2COL: + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_1D: case GGML_OP_POOL_2D: return false; @@ -3038,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute( if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - NSString * error_code = [command_buffer error].localizedDescription; - GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]); + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 8c31e2cca..322c85d2a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4003,42 +4003,141 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); - const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + const int vector_length = ggml_sve_cnt_b*8; - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + // VLA Implementation using switch case + switch (vector_length) { + case 128: + { + // predicate for activating higher lanes for 4 float32 elements + const svbool_t ph4 = svptrue_pat_b32(SV_VL4); - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + // 4-bit -> 8-bit + const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); + const 
svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); + const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); + const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // sub 8 + const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); + const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); + const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); + const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load y + const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); + const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); + const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // dot product + sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx0ls, qy0l), + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx1ls, qy1l), + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 256: + { + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements + const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating higher lanes for 32 int8 elements + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + + // predicate for activating higher lanes for 
16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes + const svbool_t pl16 = svnot_b_z(ph32, ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); + const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(ph32, y0->qs); + const svint8_t qy1 = svld1_s8(ph32, y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; } + #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -5488,29 +5587,124 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const int vector_length = ggml_sve_cnt_b*8; - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + //VLA Implemenation for SVE + switch (vector_length) { + case 128: + { + // predicate for activating lanes for 16 Int8 elements + const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); + const svbool_t pl16 = svptrue_pat_b32(SV_VL4); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load x + const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); + const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); + const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); + const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // load y + const svint8_t qy0_0 = 
svld1_s8(ph16, y0->qs); + const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); + const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); + const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); + + sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); + } break; + case 256: + { + //printf("sve256"); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating high 256 bit + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + // predicate for activating low 256 bit + const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); + + // predicate for activating high lanes for 8 float32 elements + const svbool_t ph8 = svptrue_pat_b32(SV_VL8); + // predicate for activating low lanes for 8 float32 elements + const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); + + svfloat32_t sumv00 = svdup_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits + // and add them to make one 64 element vector + // load x + const svint8_t qx_32 = svld1_s8(ph32, x0->qs); + svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); + + qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); + + // load y + const svint8_t qy_32 = svld1_s8(ph32, y0->qs); + svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); + + qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); + + // scale creation + const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); + + // duplicate deq1 in first half of vector and deq2 in second half of vector + const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); + + const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); + + sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); + } + + sumf = svaddv_f32(svptrue_b32(), sumv00); + break; + } + default: + assert(false && "Unsupported vector length"); + break; } #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 
8f9d0a460..9c600c7ca 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp } result->buffer = reinterpret_cast(tensor->buffer); if (result->buffer && buffers.find(result->buffer) == buffers.end()) { - return nullptr; + result->buffer = nullptr; } - // require that the tensor data does not go beyond the buffer end - uint64_t tensor_size = (uint64_t) ggml_nbytes(result); - uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); - uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); - GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow - GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } result->op = (ggml_op) tensor->op; for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { @@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vectordevice, dev_ptr, buft_ctx->stream); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } @@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, */ SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( size, *stream))); - + if (!buf) { + char err_buf[1024]; + snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size); + throw std::runtime_error(err_buf); + } // set padding to 0 to avoid possible NaN values if (size > original_size) { /* diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 0b7a5b6e2..83737c1d9 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { + if (fence) { + ctx->q->queue.submit({}, fence); + } return; } VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); @@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const }, dryrun); } -static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, - }); + }, dryrun); } -static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * 
src0, ggml_tensor * dst) { +static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, - }); + }, dryrun); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { @@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence); + +// Returns true if node has enqueued work into the queue, false otherwise +// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { - return; + return false; } VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); @@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NONE: - return; + return false; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: @@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_UNARY_OP_TANH: break; default: - return; + return false; } break; case GGML_OP_REPEAT: @@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; GGML_ABORT("fatal error"); - return; + return false; } vk_context compute_ctx; @@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; case GGML_OP_SIN: - ggml_vk_sin(ctx, compute_ctx, src0, node); + ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_COS: - ggml_vk_cos(ctx, compute_ctx, src0, node); + ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_CLAMP: @@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun); break; default: - return; + return false; } break; case GGML_OP_DIAG_MASK_INF: @@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; default: - return; + return false; } if (dryrun) { - return; + return false; } ctx->tensor_ctxs[node_idx] = compute_ctx; @@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (last_node) 
{ + if (submit || last_node) { ggml_vk_ctx_end(compute_ctx); - compute_ctx->exit_tensor_idx = node_idx; + + // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward + if (last_node) { + compute_ctx->exit_tensor_idx = node_idx_begin; + } + else { + compute_ctx->exit_tensor_idx = -1; + } + ctx->compute_ctx.reset(); + + bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false); + if (!ok) { + if (node->op == GGML_OP_UNARY) { + std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; + } + else { + std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + } + } + } + return true; } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){ ggml_tensor_extra_gpu * extra = nullptr; switch (tensor->op) { @@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); -#ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(tensor); -#endif - vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); -#ifdef GGML_VULKAN_PERF - std::chrono::steady_clock::time_point start; -#endif // GGML_VULKAN_PERF + // always wait for the GPU work to be done for the last submit + if (tensor_idx == subctx->exit_tensor_idx) { + use_fence = true; + } // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_0(tensor); + use_fence = true; +#endif + // Do staging buffer copies for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } -#ifdef GGML_VULKAN_PERF - start = std::chrono::steady_clock::now(); -#endif // GGML_VULKAN_PERF + ggml_vk_submit(subctx, use_fence ? 
ctx->fence : vk::Fence{}); - ggml_vk_submit(subctx, ctx->fence); + if (use_fence) { + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + + ctx->device->device.resetFences({ ctx->fence }); + } +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_1(tensor); +#endif } if (tensor_idx == subctx->exit_tensor_idx) { - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - -#ifdef GGML_VULKAN_PERF - auto duration = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); - ctx->device->perf_logger->log_timing(tensor, duration.count()); -#endif // GGML_VULKAN_PERF - - ctx->device->device.resetFences({ ctx->fence }); - // Do staging buffer copies for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false); } ggml_vk_preallocate_buffers(ctx); ggml_pipeline_allocate_descriptor_sets(ctx->device); @@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen // Reserve tensor context space for all nodes ctx->tensor_ctxs.resize(cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false); - } + bool first_node_in_batch = true; // true if next node will be first node in a batch + int submit_node_idx = 0; // index to first node in a batch + // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution + constexpr int submit_count = 100; + int submitted_nodes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - - if (ggml_vk_is_empty(node)) { - continue; + if (first_node_in_batch) { + submit_node_idx = i; } - bool ok = ggml_vk_compute_forward(ctx, node, i); - if (!ok) { - if (node->op == GGML_OP_UNARY) { - std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; - } else { - std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + bool submit = (submitted_nodes >= submit_count) || (i == last_node); + + + bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); + + if (enqueued) { + ++submitted_nodes; + +#ifndef GGML_VULKAN_CHECK_RESULTS + if (first_node_in_batch) { + first_node_in_batch = false; } - } -#ifdef GGML_VULKAN_CHECK_RESULTS - else { - ggml_vk_check_results_1(node); - } #endif - GGML_ASSERT(ok); + } + + if (submit) { + first_node_in_batch = true; + submitted_nodes = 0; + } } #ifdef GGML_VULKAN_PERF @@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const return false; } } break; + case GGML_OP_CONT: case GGML_OP_CPY: case GGML_OP_DUP: { @@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: - case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ARGSORT: diff --git 
a/ggml/src/ggml.c b/ggml/src/ggml.c index c98ca32bd..d7157ca6d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3847,7 +3847,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed, ctx->mem_size); + __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } @@ -5267,6 +5267,7 @@ struct ggml_tensor * ggml_concat( bool is_node = false; if (a->grad || b->grad) { + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -5388,6 +5389,7 @@ struct ggml_tensor * ggml_leaky_relu( bool is_node = false; if (!inplace && (a->grad)) { + GGML_ABORT("fatal error"); // TODO: not implemented is_node = true; } @@ -5826,6 +5828,7 @@ static struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + GGML_ASSERT(offset < (size_t)(1 << 30)); int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); @@ -6783,14 +6786,12 @@ struct ggml_tensor * ggml_rope_back( GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); - GGML_ASSERT(c == NULL && "freq factors not implemented yet"); - - GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); bool is_node = false; if (a->grad) { - is_node = false; // TODO: implement backward + GGML_ASSERT(false && "backwards pass not implemented"); + is_node = false; } struct ggml_tensor * result = ggml_dup_tensor(ctx, a); @@ -6808,6 +6809,7 @@ struct ggml_tensor * ggml_rope_back( result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; + result->src[2] = c; return result; } @@ -7361,6 +7363,11 @@ struct ggml_tensor * ggml_argsort( enum ggml_sort_order order) { bool is_node = false; + if (a->grad) { + GGML_ABORT("fatal error"); // TODO: not implemented + is_node = true; + } + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); @@ -8322,8 +8329,7 @@ static void ggml_compute_forward_dup_same_cont( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); - const size_t nb00 = src0->nb[0]; - const size_t nb0 = dst->nb[0]; + const size_t nb0 = ggml_type_size(src0->type); const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8337,8 +8343,8 @@ static void ggml_compute_forward_dup_same_cont( if (ie0 < ie1) { memcpy( ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * ggml_type_size(src0->type)); + ((char *) src0->data + ie0*nb0), + (ie1 - ie0) * nb0); } } @@ -8355,11 +8361,6 @@ static void ggml_compute_forward_dup_f16( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -8624,11 +8625,6 @@ static void ggml_compute_forward_dup_bf16( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -8980,11 +8976,6 @@ static void ggml_compute_forward_dup_f32( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -9294,13 +9285,13 @@ static void ggml_compute_forward_dup_bytes( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); + GGML_TENSOR_UNARY_OP_LOCALS; + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { ggml_compute_forward_dup_same_cont(params, dst); return; } - GGML_TENSOR_UNARY_OP_LOCALS; - const size_t type_size = ggml_type_size(src0->type); const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -10969,9 +10960,6 @@ static void ggml_compute_forward_sum_f32( return; } - assert(ggml_is_scalar(dst)); - - assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); @@ -13721,7 +13709,7 @@ static void ggml_compute_forward_get_rows_q( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), @@ -13762,7 +13750,7 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 
>= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_fp16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), @@ -13803,7 +13791,7 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_bf16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), @@ -13844,7 +13832,7 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), @@ -18372,14 +18360,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad || src1->grad) { GGML_ASSERT(src0->type == tensor->type); GGML_ASSERT(tensor->grad->type == tensor->type); - GGML_ASSERT(tensor->grad->type == src1->grad->type); + GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type); tensor_grad_view = ggml_view_4d(ctx, - tensor->grad, - src1->grad->ne[0], - src1->grad->ne[1], - src1->grad->ne[2], - src1->grad->ne[3], + tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], nb1, nb2, nb3, offset); } @@ -18448,9 +18432,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor memcpy(&offset, tensor->op_params, sizeof(offset)); - size_t nb1 = tensor->nb[1]; - size_t nb2 = tensor->nb[2]; - size_t nb3 = tensor->nb[3]; + size_t nb1 = tensor->nb[1]; + size_t nb2 = tensor->nb[2]; + size_t nb3 = tensor->nb[3]; if (src0->type != src0->grad->type) { // gradient is typically F32, but src0 could be other type @@ -19146,7 +19130,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } for (size_t i = 0; i < src->visited_hash_set.size; ++i) { - if (src->visited_hash_set.keys[i]) { + // copy all hashset keys (tensors) that are in use + if (ggml_bitset_get(src->visited_hash_set.used, i)) { ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); } } @@ -19556,7 +19541,8 @@ static bool ggml_thread_apply_priority(int32_t prio) { return true; } -#else // posix? 
+#elif defined(__gnu_linux__) +// TODO: this may not work on BSD, to be verified static bool ggml_thread_apply_affinity(const bool * mask) { cpu_set_t cpuset; @@ -19611,6 +19597,18 @@ static bool ggml_thread_apply_priority(int32_t prio) { return true; } +#else // unsupported platforms + +static bool ggml_thread_apply_affinity(const bool * mask) { + UNUSED(mask); + return true; +} + +static bool ggml_thread_apply_priority(int32_t prio) { + UNUSED(prio); + return true; +} + #endif static bool ggml_thread_cpumask_is_valid(const bool * mask) { diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index f0988ba7c..d0c2bb284 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -1006,6 +1006,10 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda assert(nth > 0); assert(ith < nth); + // only enable sgemm for prompt processing + if (n < 2) + return false; + if (Ctype != GGML_TYPE_F32) return false; diff --git a/include/llama.h b/include/llama.h index a495e866d..93b3e6e85 100644 --- a/include/llama.h +++ b/include/llama.h @@ -33,12 +33,15 @@ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF +// TODO: use everywhere in the implementation +#define LLAMA_TOKEN_NULL -1 + #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 8 +#define LLAMA_SESSION_VERSION 9 #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ #define LLAMA_STATE_SEQ_VERSION 2 @@ -53,8 +56,10 @@ extern "C" { // TODO: show sample usage // + // struct llama_vocab; // TODO: add in the future struct llama_model; struct llama_context; + struct llama_sampler; typedef int32_t llama_pos; typedef int32_t llama_token; @@ -201,6 +206,7 @@ extern "C" { LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; + // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -208,8 +214,10 @@ extern "C" { } llama_token_data; typedef struct llama_token_data_array { + // TODO: consider SoA llama_token_data * data; size_t size; + int64_t selected; // this is the index in the data array (i.e. not the token id) bool sorted; } llama_token_data_array; @@ -302,7 +310,6 @@ extern "C" { // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations // https://github.com/ggerganov/llama.cpp/pull/7544 struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size @@ -330,11 +337,13 @@ extern "C" { enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] - // Keep the booleans together to avoid misalignment during copy-by-value. + // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. 
+ // TODO: move at the end of the struct bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] + //bool no_perf; // whether to measure performance timings, TODO: implement // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -358,56 +367,14 @@ extern "C" { void * kv_overrides; // pointer to vector containing overrides } llama_model_quantize_params; - // grammar types - struct llama_grammar; + typedef struct llama_logit_bias { + llama_token token; + float bias; + } llama_logit_bias; - // grammar element type - enum llama_gretype { - // end of rule definition - LLAMA_GRETYPE_END = 0, - - // start of alternate definition for rule - LLAMA_GRETYPE_ALT = 1, - - // non-terminal element: reference to rule - LLAMA_GRETYPE_RULE_REF = 2, - - // terminal element: character (code point) - LLAMA_GRETYPE_CHAR = 3, - - // inverse char(s) ([^a], [^a-b] [^abc]) - LLAMA_GRETYPE_CHAR_NOT = 4, - - // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to - // be an inclusive range ([a-z]) - LLAMA_GRETYPE_CHAR_RNG_UPPER = 5, - - // modifies a preceding LLAMA_GRETYPE_CHAR or - // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) - LLAMA_GRETYPE_CHAR_ALT = 6, - - // any character (.) - LLAMA_GRETYPE_CHAR_ANY = 7, - }; - - typedef struct llama_grammar_element { - enum llama_gretype type; - uint32_t value; // Unicode code point or rule ID - } llama_grammar_element; - - // performance timing information - struct llama_timings { - double t_start_ms; - double t_end_ms; - double t_load_ms; - double t_sample_ms; - double t_p_eval_ms; - double t_eval_ms; - - int32_t n_sample; - int32_t n_p_eval; - int32_t n_eval; - }; + typedef struct llama_sampler_chain_params { + bool no_perf; // whether to measure performance timings + } llama_sampler_chain_params; // used in chat template typedef struct llama_chat_message { @@ -419,8 +386,10 @@ extern "C" { struct llama_lora_adapter; // Helpers for getting default parameters - LLAMA_API struct llama_model_params llama_model_default_params(void); - LLAMA_API struct llama_context_params llama_context_default_params(void); + // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172) + LLAMA_API struct llama_model_params llama_model_default_params(void); + LLAMA_API struct llama_context_params llama_context_default_params(void); + LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); // Initialize the llama + ggml backend @@ -443,10 +412,11 @@ extern "C" { LLAMA_API struct llama_model * llama_load_model_from_file( const char * path_model, - struct llama_model_params params); + struct llama_model_params params); LLAMA_API void llama_free_model(struct llama_model * model); + // TODO: rename to llama_init_from_model LLAMA_API struct llama_context * llama_new_context_with_model( struct llama_model * model, struct llama_context_params params); @@ -462,23 +432,22 @@ extern "C" { LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_gpu_offload(void); - LLAMA_API const struct llama_model * llama_get_model(const struct 
llama_context * ctx); - LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); - LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); - - LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); - LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); - LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_n_layer (const struct llama_model * model); + LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); + + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); + // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @@ -706,7 +675,7 @@ extern "C" { // // Returns the *actual* size in bytes of the state - // (rng, logits, embedding and kv_cache) + // (logits, embedding and kv_cache) // Only use when saving the state, not when restoring it, otherwise the size may be too small. LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx), @@ -1009,121 +978,110 @@ extern "C" { int32_t length); // - // Grammar + // Sampling API + // + // Sample usage: + // + // // prepare the sampling chain at the start + // auto sparams = llama_sampler_chain_default_params(); + // + // llama_sampler * smpl = llama_sampler_chain_init(sparams); + // + // llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50)); + // llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); + // llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8)); + // + // // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat" + // // this sampler will be responsible to select the actual token + // llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed)); + // + // ... + // + // // decoding loop: + // while (...) { + // ... + // + // llama_decode(ctx, batch); + // + // // sample from the logits of the last token in the batch + // const llama_token id = llama_sampler_sample(smpl, ctx, -1); + // + // // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.) + // llama_sampler_accept(smpl, id); + // ... + // } + // + // llama_sampler_free(smpl); + // + // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU). + // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab // - /// Initialize a llama_grammar. - /// - /// @param rules The rule elements of the grammar to initialize. - /// @param n_rules The number of rules. - /// @param start_rule_index The index of the root rule (the starting point of the grammar). - /// @return The initialized llama_grammar or nullptr if initialization failed. 
- LLAMA_API struct llama_grammar * llama_grammar_init( - const llama_grammar_element ** rules, - size_t n_rules, - size_t start_rule_index); + typedef void * llama_sampler_context_t; - LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); + // user code can implement the interface below in order to create custom llama_sampler + struct llama_sampler_i { + const char * (*name) (const struct llama_sampler * smpl); // can be NULL + void (*accept)( struct llama_sampler * smpl, llama_token token); // can be NULL + void (*apply) ( struct llama_sampler * smpl, llama_token_data_array * cur_p); // required + void (*reset) ( struct llama_sampler * smpl); // can be NULL + struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL + void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL - LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); + // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph + //void (*apply_ggml) (struct llama_sampler * smpl, ...); + }; - /// @details Apply constraints from grammar - LLAMA_API void llama_grammar_sample( - const struct llama_grammar * grammar, - const struct llama_context * ctx, - llama_token_data_array * candidates); - LLAMA_API DEPRECATED(void llama_sample_grammar( - struct llama_context * ctx, - llama_token_data_array * candidates, - const struct llama_grammar * grammar), - "use llama_grammar_sample instead"); + struct llama_sampler { + struct llama_sampler_i * iface; + llama_sampler_context_t ctx; + }; - /// @details Accepts the sampled token into the grammar - LLAMA_API void llama_grammar_accept_token( - struct llama_grammar * grammar, - struct llama_context * ctx, - llama_token token); + // mirror of llama_sampler_i: + LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); + LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); + LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); + LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl); + LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl); + // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add) + LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl); - // - // Sampling functions - // + // llama_sampler_chain + // a type of llama_sampler that can chain multiple samplers one after another - // Sets the current rng seed. - LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); - /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
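    //
    // A minimal sketch of a custom sampler implemented against llama_sampler_i above; the
    // filtering behavior and all `my_*` names are illustrative, not part of the library.
    // With ctx == NULL, the clone and free callbacks may be left NULL, as noted above.
    //
    //    static const char * my_name(const struct llama_sampler * /*smpl*/) {
    //        return "my-filter";
    //    }
    //
    //    static void my_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    //        // example constraint: never allow token id 0 to be sampled
    //        for (size_t i = 0; i < cur_p->size; ++i) {
    //            if (cur_p->data[i].id == 0) {
    //                cur_p->data[i].logit = -INFINITY;
    //            }
    //        }
    //    }
    //
    //    static struct llama_sampler_i my_iface = {
    //        /* .name   = */ my_name,
    //        /* .accept = */ NULL,
    //        /* .apply  = */ my_apply,
    //        /* .reset  = */ NULL,
    //        /* .clone  = */ NULL,
    //        /* .free   = */ NULL,
    //    };
    //
    //    // heap-allocate so that llama_sampler_free or an owning chain can release it
    //    llama_sampler * my_smpl = new llama_sampler { /* .iface = */ &my_iface, /* .ctx = */ NULL };
    //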
- LLAMA_API void llama_sample_repetition_penalties( - struct llama_context * ctx, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t penalty_last_n, - float penalty_repeat, - float penalty_freq, - float penalty_present); + // important: takes ownership of the sampler object and will free it when llama_sampler_free is called + LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl); + LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); + LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); - /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 - /// @param logits Logits extracted from the original generation context. - /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. - /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. - LLAMA_API void llama_sample_apply_guidance( - struct llama_context * ctx, - float * logits, - float * logits_guidance, - float scale); + // available samplers: + + LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); + LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. - LLAMA_API void llama_sample_softmax( - struct llama_context * ctx, - llama_token_data_array * candidates); + LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_k( - struct llama_context * ctx, - llama_token_data_array * candidates, - int32_t k, - size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_p( - struct llama_context * ctx, - llama_token_data_array * candidates, - float p, - size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep); /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 - LLAMA_API void llama_sample_min_p( - struct llama_context * ctx, - llama_token_data_array * candidates, - float p, - size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - LLAMA_API void llama_sample_tail_free( - struct llama_context * ctx, - llama_token_data_array * candidates, - float z, - size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep); /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
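    //
    // A short sketch of the ownership rule stated above (values illustrative): samplers added
    // to a chain are freed together with the chain, so the caller frees only the chain itself.
    //
    //    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    //
    //    llama_sampler * top_k = llama_sampler_init_top_k(40);
    //    llama_sampler_chain_add(chain, top_k);   // the chain now owns top_k
    //
    //    // ... use the chain ...
    //
    //    llama_sampler_free(chain);               // frees the chain and every sampler added to it
    //    // llama_sampler_free(top_k);            // would be a double free; do not call this
    //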
- LLAMA_API void llama_sample_typical( - struct llama_context * ctx, - llama_token_data_array * candidates, - float p, - size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); - /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. - LLAMA_API void llama_sample_entropy( - struct llama_context * ctx, - llama_token_data_array * candidates_p, - float min_temp, - float max_temp, - float exponent_val); - - LLAMA_API void llama_sample_temp( - struct llama_context * ctx, - llama_token_data_array * candidates, - float temp); + /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. + LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @@ -1131,36 +1089,58 @@ extern "C" { /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat( - struct llama_context * ctx, - llama_token_data_array * candidates, - float tau, - float eta, - int32_t m, - float * mu); + LLAMA_API struct llama_sampler * llama_sampler_init_mirostat( + int32_t n_vocab, + uint32_t seed, + float tau, + float eta, + int32_t m); /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
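    //
    // A hedged migration sketch for Mirostat: the removed llama_sample_token_mirostat() took a
    // caller-owned `mu` state pointer, while llama_sampler_init_mirostat() above takes no `mu`
    // argument; the state lives inside the sampler and is cleared via llama_sampler_reset().
    // The seed/tau/eta values and the `model`/`ctx` variables are illustrative assumptions:
    //
    //    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    //
    //    // mirostat selects the token itself, so it is used as the final sampler of the chain
    //    llama_sampler_chain_add(smpl, llama_sampler_init_mirostat(
    //        llama_n_vocab(model), /* seed */ 1234, /* tau */ 5.0f, /* eta */ 0.1f, /* m */ 100));
    //
    //    const llama_token id = llama_sampler_sample(smpl, ctx, -1);
    //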
- LLAMA_API llama_token llama_sample_token_mirostat_v2( - struct llama_context * ctx, - llama_token_data_array * candidates, - float tau, - float eta, - float * mu); + LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2( + uint32_t seed, + float tau, + float eta); - /// @details Selects the token with the highest probability. - /// Does not compute the token probabilities. Use llama_sample_softmax() instead. - LLAMA_API llama_token llama_sample_token_greedy( - struct llama_context * ctx, - llama_token_data_array * candidates); + LLAMA_API struct llama_sampler * llama_sampler_init_grammar( + const struct llama_model * model, + const char * grammar_str, + const char * grammar_root); - /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. - LLAMA_API llama_token llama_sample_token( - struct llama_context * ctx, - llama_token_data_array * candidates); + LLAMA_API struct llama_sampler * llama_sampler_init_penalties( + int32_t n_vocab, // llama_n_vocab() + llama_token special_eos_id, // llama_token_eos() + llama_token linefeed_id, // llama_token_nl() + int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat, // 1.0 = disabled + float penalty_freq, // 0.0 = disabled + float penalty_present, // 0.0 = disabled + bool penalize_nl, // consider newlines as a repeatable token + bool ignore_eos); // ignore the end-of-sequence token + + LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( + int32_t n_vocab, + int32_t n_logit_bias, + const llama_logit_bias * logit_bias); + + /// @details Sample and accept a token from the idx-th output of the last evaluation + // + // Shorthand for: + // const auto * logits = llama_get_logits_ith(ctx, idx); + // llama_token_data_array cur_p = { ... init from logits ... }; + // llama_sampler_apply(smpl, &cur_p); + // auto token = cur_p.data[cur_p.selected].id; + // llama_sampler_accept(smpl, token); + // return token; + // Returns the sampled token + LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); + + // TODO: extend in the future + //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...); // // Model split @@ -1176,12 +1156,6 @@ extern "C" { // Returns the split_prefix length. LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); - // Performance information - LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); - - LLAMA_API void llama_print_timings(struct llama_context * ctx); - LLAMA_API void llama_reset_timings(struct llama_context * ctx); - // Print system information LLAMA_API const char * llama_print_system_info(void); @@ -1189,65 +1163,24 @@ extern "C" { // If this is not called, or NULL is supplied, everything is output on stderr. LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); - LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + // + // Performance utils + // + // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. 
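    //
    // A small usage sketch for the performance helpers declared just below, assuming a
    // llama_context * ctx and a sampler chain llama_sampler * smpl as in the earlier examples;
    // per the note above, third-party code should prefer its own measurements:
    //
    //    llama_perf_print(ctx,  LLAMA_PERF_TYPE_CONTEXT);        // timings recorded by the context
    //    llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);  // timings accumulated by the sampler chain
    //
    //    llama_perf_reset(ctx,  LLAMA_PERF_TYPE_CONTEXT);
    //    llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
    //
    // Whether the chain records these timings at all is controlled by llama_sampler_chain_params.no_perf.
    //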
+ // + + enum llama_perf_type { + LLAMA_PERF_TYPE_CONTEXT = 0, + LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, + }; + + LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); + LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + + LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); #ifdef __cplusplus } #endif -// Internal API to be implemented by llama.cpp and used by tests/benchmarks only -#ifdef LLAMA_API_INTERNAL - -#include -#include -#include - -struct ggml_tensor; - -const std::vector> & llama_internal_get_tensor_map( - struct llama_context * ctx -); - -struct llama_partial_utf8 { - uint32_t value; // bit value so far (unshifted) - int n_remain; // num bytes remaining; -1 indicates invalid sequence -}; - -struct llama_grammar_candidate { - size_t index; - const uint32_t * code_points; - llama_partial_utf8 partial_utf8; -}; - -using llama_grammar_rule = std::vector< llama_grammar_element>; -using llama_grammar_stack = std::vector; - -using llama_grammar_rules = std::vector; -using llama_grammar_stacks = std::vector; -using llama_grammar_candidates = std::vector; - -const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar); - llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar); - -void llama_grammar_accept( - const llama_grammar_rules & rules, - const llama_grammar_stacks & stacks, - const uint32_t chr, - llama_grammar_stacks & new_stacks); - -std::vector llama_grammar_reject_candidates_for_stack( - const llama_grammar_rules & rules, - const llama_grammar_stack & stack, - const llama_grammar_candidates & candidates); - -std::pair, llama_partial_utf8> decode_utf8( - const std::string & src, - llama_partial_utf8 partial_start); - -// Randomly selects a token from the candidates based on their probabilities using given std::mt19937. -// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences. -llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng); - -#endif // LLAMA_API_INTERNAL - #endif // LLAMA_H diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index b29892565..f16336594 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -5,7 +5,7 @@ # Usage: # # $ cd /path/to/llama.cpp -# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... +# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... 
-C 3 # set -e @@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last) echo "Syncing ggml changes since commit $lc" to_skip="" -if [ "$1" == "-skip" ]; then - to_skip=$2 -fi + +# context for git patches in number of lines +ctx="8" + +while [ "$1" != "" ]; do + case $1 in + -skip ) + shift + to_skip=$1 + ;; + -C ) + shift + ctx=$1 + ;; + esac + shift +done cd $SRC_GGML @@ -52,7 +66,7 @@ while read c; do fi fi - git format-patch -k $c~1..$c --stdout -- \ + git format-patch -U${ctx} -k $c~1..$c --stdout -- \ CMakeLists.txt \ src/CMakeLists.txt \ cmake/FindSIMD.cmake \ @@ -191,7 +205,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then > ggml-src.patch.tmp mv ggml-src.patch.tmp ggml-src.patch - git am ggml-src.patch + git am -C${ctx} ggml-src.patch rm -v $SRC_LLAMA/ggml-src.patch fi diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 1e6db754f..3d2dfb413 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -28b7633d733bbeef0026570fbc61c79c5e9aa5ae +10e83a412717c20d57ba19f025248e18e43addf3 diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index b123d7331..74e9f64b3 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -3,11 +3,31 @@ #include "llama-vocab.h" #include "llama-sampling.h" +#include #include +#include -// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as -// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. -std::pair, llama_partial_utf8> decode_utf8( +// +// helpers +// + +// NOTE: assumes valid utf8 (but checks for overrun) +static std::pair decode_utf8(const char * src) { + static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t first_byte = static_cast(*src); + uint8_t highbits = first_byte >> 4; + int len = lookup[highbits]; + uint8_t mask = (1 << (8 - len)) - 1; + uint32_t value = first_byte & mask; + const char * end = src + len; // may overrun! 
+ const char * pos = src + 1; + for ( ; pos < end && *pos; pos++) { + value = (value << 6) + (static_cast(*pos) & 0x3F); + } + return std::make_pair(value, pos); +} + +static std::pair, llama_partial_utf8> decode_utf8( const std::string & src, llama_partial_utf8 partial_start) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; @@ -40,7 +60,7 @@ std::pair, llama_partial_utf8> decode_utf8( while (*pos != 0) { uint8_t first_byte = static_cast(*pos); uint8_t highbits = first_byte >> 4; - n_remain = lookup[highbits] - 1; + n_remain = lookup[highbits] - 1; if (n_remain < 0) { // invalid sequence, abort @@ -50,7 +70,7 @@ std::pair, llama_partial_utf8> decode_utf8( } uint8_t mask = (1 << (7 - n_remain)) - 1; - value = first_byte & mask; + value = first_byte & mask; ++pos; while (*pos != 0 && n_remain > 0) { @@ -67,12 +87,510 @@ std::pair, llama_partial_utf8> decode_utf8( return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain }); } -const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) { - return grammar->rules; +static bool is_digit_char(char c) { + return '0' <= c && c <= '9'; } -llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) { - return grammar->stacks; +static bool is_word_char(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c); +} + +static std::pair parse_hex(const char * src, int size) { + const char * pos = src; + const char * end = src + size; + uint32_t value = 0; + for ( ; pos < end && *pos; pos++) { + value <<= 4; + char c = *pos; + if ('a' <= c && c <= 'f') { + value += c - 'a' + 10; + } else if ('A' <= c && c <= 'F') { + value += c - 'A' + 10; + } else if ('0' <= c && c <= '9') { + value += c - '0'; + } else { + break; + } + } + if (pos != end) { + throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); + } + return std::make_pair(value, pos); +} + +static const char * parse_space(const char * src, bool newline_ok) { + const char * pos = src; + while (*pos == ' ' || *pos == '\t' || *pos == '#' || + (newline_ok && (*pos == '\r' || *pos == '\n'))) { + if (*pos == '#') { + while (*pos && *pos != '\r' && *pos != '\n') { + pos++; + } + } else { + pos++; + } + } + return pos; +} + +static const char * parse_name(const char * src) { + const char * pos = src; + while (is_word_char(*pos)) { + pos++; + } + if (pos == src) { + throw std::runtime_error(std::string("expecting name at ") + src); + } + return pos; +} + +static const char * parse_int(const char * src) { + const char * pos = src; + while (is_digit_char(*pos)) { + pos++; + } + if (pos == src) { + throw std::runtime_error(std::string("expecting integer at ") + src); + } + return pos; +} + +static std::pair parse_char(const char * src) { + if (*src == '\\') { + switch (src[1]) { + case 'x': return parse_hex(src + 2, 2); + case 'u': return parse_hex(src + 2, 4); + case 'U': return parse_hex(src + 2, 8); + case 't': return std::make_pair('\t', src + 2); + case 'r': return std::make_pair('\r', src + 2); + case 'n': return std::make_pair('\n', src + 2); + case '\\': + case '"': + case '[': + case ']': + return std::make_pair(src[1], src + 2); + default: + throw std::runtime_error(std::string("unknown escape at ") + src); + } + } else if (*src) { + return decode_utf8(src); + } + throw std::runtime_error("unexpected end of input"); +} + +static void print_grammar_char(FILE * file, uint32_t c) { + if (0x20 <= c && c <= 0x7f) { + 
fprintf(file, "%c", static_cast(c)); + } else { + // cop out of encoding UTF-8 + fprintf(file, "", c); + } +} + +static bool is_char_element(llama_grammar_element elem) { + switch (elem.type) { + case LLAMA_GRETYPE_CHAR: return true; + case LLAMA_GRETYPE_CHAR_NOT: return true; + case LLAMA_GRETYPE_CHAR_ALT: return true; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; + case LLAMA_GRETYPE_CHAR_ANY: return true; + default: return false; + } +} + +static void print_rule_binary(FILE * file, const llama_grammar_rule & rule) { + for (auto elem : rule) { + switch (elem.type) { + case LLAMA_GRETYPE_END: fprintf(file, "END"); break; + case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break; + case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break; + case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break; + case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break; + case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break; + case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break; + } + switch (elem.type) { + case LLAMA_GRETYPE_END: + case LLAMA_GRETYPE_ALT: + case LLAMA_GRETYPE_RULE_REF: + fprintf(file, "(%u) ", elem.value); + break; + case LLAMA_GRETYPE_CHAR: + case LLAMA_GRETYPE_CHAR_NOT: + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_ANY: + fprintf(file, "(\""); + print_grammar_char(file, elem.value); + fprintf(file, "\") "); + break; + } + } + fprintf(file, "\n"); +} + +static void print_rule( + FILE * file, + uint32_t rule_id, + const llama_grammar_rule & rule, + const std::map & symbol_id_names) { + if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { + throw std::runtime_error( + "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); + } + fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); + for (size_t i = 0, end = rule.size() - 1; i < end; i++) { + llama_grammar_element elem = rule[i]; + switch (elem.type) { + case LLAMA_GRETYPE_END: + throw std::runtime_error( + "unexpected end of rule: " + std::to_string(rule_id) + "," + + std::to_string(i)); + case LLAMA_GRETYPE_ALT: + fprintf(file, "| "); + break; + case LLAMA_GRETYPE_RULE_REF: + fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str()); + break; + case LLAMA_GRETYPE_CHAR: + fprintf(file, "["); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_NOT: + fprintf(file, "[^"); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + if (i == 0 || !is_char_element(rule[i - 1])) { + throw std::runtime_error( + "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + + std::to_string(rule_id) + "," + std::to_string(i)); + } + fprintf(file, "-"); + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_ALT: + if (i == 0 || !is_char_element(rule[i - 1])) { + throw std::runtime_error( + "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + + std::to_string(rule_id) + "," + std::to_string(i)); + } + print_grammar_char(file, elem.value); + break; + case LLAMA_GRETYPE_CHAR_ANY: + fprintf(file, "."); + break; + } + if (is_char_element(elem)) { + switch (rule[i + 1].type) { + case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ANY: + break; + default: + fprintf(file, "] "); + } + } + } + fprintf(file, "\n"); +} + +// +// implementation +// + +uint32_t llama_grammar_parser::get_symbol_id(const char * src, size_t len) { + uint32_t next_id = 
static_cast(symbol_ids.size()); + auto result = symbol_ids.emplace(std::string(src, len), next_id); + return result.first->second; +} + +uint32_t llama_grammar_parser::generate_symbol_id(const std::string & base_name) { + uint32_t next_id = static_cast(symbol_ids.size()); + symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; + return next_id; +} + +void llama_grammar_parser::add_rule(uint32_t rule_id, const llama_grammar_rule & rule) { + if (rules.size() <= rule_id) { + rules.resize(rule_id + 1); + } + rules[rule_id] = rule; +} + +const char * llama_grammar_parser::parse_alternates( + const char * src, + const std::string & rule_name, + uint32_t rule_id, + bool is_nested) { + llama_grammar_rule rule; + const char * pos = parse_sequence(src, rule_name, rule, is_nested); + while (*pos == '|') { + rule.push_back({LLAMA_GRETYPE_ALT, 0}); + pos = parse_space(pos + 1, true); + pos = parse_sequence(pos, rule_name, rule, is_nested); + } + rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule(rule_id, rule); + return pos; +} + +const char * llama_grammar_parser::parse_sequence( + const char * src, + const std::string & rule_name, + llama_grammar_rule & rule, + bool is_nested) { + size_t last_sym_start = rule.size(); + const char * pos = src; + + auto handle_repetitions = [&](int min_times, int max_times) { + + if (last_sym_start == rule.size()) { + throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); + } + + // apply transformation to previous symbol (last_sym_start to end) according to + // the following rewrite rules: + // S{m,n} --> S S S (m times) S'(n-m) + // S'(x) ::= S S'(x-1) | + // (... n-m definitions of these S' rules ...) + // S'(1) ::= S | + // S{m,} --> S S S (m times) S' + // S' ::= S S' | + // S* --> S{0,} + // --> S' ::= S S' | + // S+ --> S{1,} + // --> S S' + // S' ::= S S' | + // S? --> S{0,1} + // --> S' + // S' ::= S | + + llama_grammar_rule prev_rule(rule.begin() + last_sym_start, rule.end()); + if (min_times == 0) { + rule.resize(last_sym_start); + } else { + // Repeat the previous elements (min_times - 1) times + for (int i = 1; i < min_times; i++) { + rule.insert(rule.end(), prev_rule.begin(), prev_rule.end()); + } + } + + uint32_t last_rec_rule_id = 0; + auto n_opt = max_times < 0 ? 1 : max_times - min_times; + + llama_grammar_rule rec_rule(prev_rule); + for (int i = 0; i < n_opt; i++) { + rec_rule.resize(prev_rule.size()); + uint32_t rec_rule_id = generate_symbol_id( rule_name); + if (i > 0 || max_times < 0) { + rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? 
rec_rule_id : last_rec_rule_id}); + } + rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); + rec_rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule( rec_rule_id, rec_rule); + last_rec_rule_id = rec_rule_id; + } + if (n_opt > 0) { + rule.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); + } + }; + + while (*pos) { + if (*pos == '"') { // literal string + pos++; + last_sym_start = rule.size(); + while (*pos != '"') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } + auto char_pair = parse_char(pos); + pos = char_pair.second; + rule.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); + } + pos = parse_space(pos + 1, is_nested); + } else if (*pos == '[') { // char range(s) + pos++; + enum llama_gretype start_type = LLAMA_GRETYPE_CHAR; + if (*pos == '^') { + pos++; + start_type = LLAMA_GRETYPE_CHAR_NOT; + } + last_sym_start = rule.size(); + while (*pos != ']') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } + auto char_pair = parse_char(pos); + pos = char_pair.second; + enum llama_gretype type = last_sym_start < rule.size() + ? LLAMA_GRETYPE_CHAR_ALT + : start_type; + + rule.push_back({type, char_pair.first}); + if (pos[0] == '-' && pos[1] != ']') { + if (!pos[1]) { + throw std::runtime_error("unexpected end of input"); + } + auto endchar_pair = parse_char(pos + 1); + pos = endchar_pair.second; + rule.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); + } + } + pos = parse_space(pos + 1, is_nested); + } else if (is_word_char(*pos)) { // rule reference + const char * name_end = parse_name(pos); + uint32_t ref_rule_id = get_symbol_id(pos, name_end - pos); + pos = parse_space(name_end, is_nested); + last_sym_start = rule.size(); + rule.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); + } else if (*pos == '(') { // grouping + // parse nested alternates into synthesized rule + pos = parse_space(pos + 1, true); + uint32_t sub_rule_id = generate_symbol_id(rule_name); + pos = parse_alternates(pos, rule_name, sub_rule_id, true); + last_sym_start = rule.size(); + // output reference to synthesized rule + rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); + if (*pos != ')') { + throw std::runtime_error(std::string("expecting ')' at ") + pos); + } + pos = parse_space(pos + 1, is_nested); + } else if (*pos == '.') { // any char + last_sym_start = rule.size(); + rule.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); + pos = parse_space(pos + 1, is_nested); + } else if (*pos == '*') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(0, -1); + } else if (*pos == '+') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(1, -1); + } else if (*pos == '?') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(0, 1); + } else if (*pos == '{') { + pos = parse_space(pos + 1, is_nested); + + if (!is_digit_char(*pos)) { + throw std::runtime_error(std::string("expecting an int at ") + pos); + } + const char * int_end = parse_int(pos); + int min_times = std::stoul(std::string(pos, int_end - pos)); + pos = parse_space(int_end, is_nested); + + int max_times = -1; + + if (*pos == '}') { + max_times = min_times; + pos = parse_space(pos + 1, is_nested); + } else if (*pos == ',') { + pos = parse_space(pos + 1, is_nested); + + if (is_digit_char(*pos)) { + const char * int_end = parse_int(pos); + max_times = std::stoul(std::string(pos, int_end - pos)); + pos = parse_space(int_end, is_nested); + } + + if (*pos != '}') { + throw std::runtime_error(std::string("expecting '}' at ") + pos); + } + pos = parse_space(pos + 1, is_nested); + } else 
{ + throw std::runtime_error(std::string("expecting ',' at ") + pos); + } + handle_repetitions(min_times, max_times); + } else { + break; + } + } + return pos; + } + +const char * llama_grammar_parser::parse_rule(const char * src) { + const char * name_end = parse_name(src); + const char * pos = parse_space(name_end, false); + size_t name_len = name_end - src; + uint32_t rule_id = get_symbol_id(src, name_len); + const std::string name(src, name_len); + + if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) { + throw std::runtime_error(std::string("expecting ::= at ") + pos); + } + pos = parse_space(pos + 3, true); + + pos = parse_alternates(pos, name, rule_id, false); + + if (*pos == '\r') { + pos += pos[1] == '\n' ? 2 : 1; + } else if (*pos == '\n') { + pos++; + } else if (*pos) { + throw std::runtime_error(std::string("expecting newline or end at ") + pos); + } + return parse_space(pos, true); + } + +bool llama_grammar_parser::parse(const char * src) { + try { + const char * pos = parse_space(src, true); + while (*pos) { + pos = parse_rule(pos); + } + // Validate the state to ensure that all rules are defined + for (const auto & rule : rules) { + if (rule.empty()) { + throw std::runtime_error("Undefined rule"); + } + for (const auto & elem : rule) { + if (elem.type == LLAMA_GRETYPE_RULE_REF) { + // Ensure that the rule at that location exists + if (elem.value >= rules.size() || rules[elem.value].empty()) { + // Get the name of the rule that is missing + for (const auto & kv : symbol_ids) { + if (kv.second == elem.value) { + throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); + } + } + } + } + } + } + } catch (const std::exception & err) { + fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); + rules.clear(); + return false; + } + + return true; +} + +void llama_grammar_parser::print(FILE * file) { + try { + std::map symbol_id_names; + for (const auto & kv : symbol_ids) { + symbol_id_names[kv.second] = kv.first; + } + for (size_t i = 0, end = rules.size(); i < end; i++) { + // fprintf(file, "%zu: ", i); + // print_rule_binary(file, rules[i]); + print_rule(file, uint32_t(i), rules[i], symbol_id_names); + // fprintf(file, "\n"); + } + } catch (const std::exception & err) { + fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what()); + } +} + +llama_grammar_stack llama_grammar_parser::c_rules() const { + llama_grammar_stack ret; + ret.reserve(rules.size()); + for (const auto & rule : rules) { + ret.push_back(rule.data()); + } + return ret; } // returns true iff pos points to the end of one of the definitions of a rule @@ -89,7 +607,6 @@ static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) static std::pair llama_grammar_match_char( const llama_grammar_element * pos, const uint32_t chr) { - bool found = false; bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY; @@ -225,16 +742,93 @@ static void llama_grammar_advance_stack( } } -// takes a set of possible pushdown stacks on a grammar, which are required to -// be positioned at a character range (see `llama_grammar_advance_stack`), and -// produces the N possible stacks if the given char is accepted at those -// positions +static llama_grammar_candidates llama_grammar_reject_candidates( + const llama_grammar_rules & rules, + const llama_grammar_stacks & stacks, + const llama_grammar_candidates & candidates) { + GGML_ASSERT(!stacks.empty()); // REVIEW + + if (candidates.empty()) { + return {}; + } + + auto 
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates); + + for (size_t i = 1, size = stacks.size(); i < size; ++i) { + rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects); + } + + return rejects; +} + +static bool llama_grammar_detect_left_recursion( + const llama_grammar_rules & rules, + size_t rule_index, + std::vector * rules_visited, + std::vector * rules_in_progress, + std::vector * rules_may_be_empty) { + if ((*rules_in_progress)[rule_index]) { + return true; + } + + (*rules_in_progress)[rule_index] = true; + + const llama_grammar_rule & rule = rules[rule_index]; + + // First check if the rule might produce the empty string. This could be done combined with the second + // step but it's more readable as two steps. + bool at_rule_start = true; + for (size_t i = 0; i < rule.size(); i++) { + if (llama_grammar_is_end_of_sequence(&rule[i])) { + if (at_rule_start) { + (*rules_may_be_empty)[rule_index] = true; + break; + } + at_rule_start = true; + } else { + at_rule_start = false; + } + } + + // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may + // be empty) + bool recurse_into_nonterminal = true; + for (size_t i = 0; i < rule.size(); i++) { + if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) { + if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) { + return true; + } + if (!((*rules_may_be_empty)[(size_t)rule[i].value])) { + recurse_into_nonterminal = false; + } + } else if (llama_grammar_is_end_of_sequence(&rule[i])) { + recurse_into_nonterminal = true; + } else { + recurse_into_nonterminal = false; + } + } + + (*rules_in_progress)[rule_index] = false; + (*rules_visited)[rule_index] = true; + + return false; +} + +const llama_grammar_rules & llama_grammar_get_rules(const struct llama_grammar * grammar) { + return grammar->rules; +} + +llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) { + return grammar->stacks; +} + void llama_grammar_accept( const llama_grammar_rules & rules, const llama_grammar_stacks & stacks, const uint32_t chr, - llama_grammar_stacks & new_stacks) { - new_stacks.clear(); + llama_grammar_stacks & stacks_new) { + stacks_new.clear(); + stacks_new.reserve(stacks.size()); for (const auto & stack : stacks) { if (stack.empty()) { @@ -250,29 +844,11 @@ void llama_grammar_accept( if (!llama_grammar_is_end_of_sequence(pos)) { new_stack.push_back(pos); } - llama_grammar_advance_stack(rules, new_stack, new_stacks); + llama_grammar_advance_stack(rules, new_stack, stacks_new); } } } -static llama_grammar_candidates llama_grammar_reject_candidates( - const llama_grammar_rules & rules, - const llama_grammar_stacks & stacks, - const llama_grammar_candidates & candidates) { - GGML_ASSERT(!stacks.empty()); // REVIEW - - if (candidates.empty()) { - return {}; - } - - auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates); - - for (size_t i = 1, size = stacks.size(); i < size; ++i) { - rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects); - } - return rejects; -} - llama_grammar_candidates llama_grammar_reject_candidates_for_stack( const llama_grammar_rules & rules, const llama_grammar_stack & stack, @@ -328,66 +904,13 @@ llama_grammar_candidates llama_grammar_reject_candidates_for_stack( return rejects; } -static bool llama_grammar_detect_left_recursion( - const llama_grammar_rules & rules, 
- size_t rule_index, - std::vector * rules_visited, - std::vector * rules_in_progress, - std::vector * rules_may_be_empty) { - if ((*rules_in_progress)[rule_index]) { - return true; - } - - (*rules_in_progress)[rule_index] = true; - - const llama_grammar_rule & rule = rules[rule_index]; - - // First check if the rule might produce the empty string. This could be done combined with the second - // step but it's more readable as two steps. - bool at_rule_start = true; - for (size_t i = 0; i < rule.size(); i++) { - if (llama_grammar_is_end_of_sequence(&rule[i])) { - if (at_rule_start) { - (*rules_may_be_empty)[rule_index] = true; - break; - } - at_rule_start = true; - } else { - at_rule_start = false; - } - } - - // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may - // be empty) - bool recurse_into_nonterminal = true; - for (size_t i = 0; i < rule.size(); i++) { - if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) { - if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) { - return true; - } - if (!((*rules_may_be_empty)[(size_t)rule[i].value])) { - recurse_into_nonterminal = false; - } - } else if (llama_grammar_is_end_of_sequence(&rule[i])) { - recurse_into_nonterminal = true; - } else { - recurse_into_nonterminal = false; - } - } - - (*rules_in_progress)[rule_index] = false; - (*rules_visited)[rule_index] = true; - return false; -} - -// -// grammar - external -// +//////////////////// struct llama_grammar * llama_grammar_init_impl( - const llama_grammar_element ** rules, - size_t n_rules, - size_t start_rule_index) { + const struct llama_vocab * vocab, + const llama_grammar_element ** rules, + size_t n_rules, + size_t start_rule_index) { const llama_grammar_element * pos; // copy rule definitions into vectors @@ -438,22 +961,104 @@ struct llama_grammar * llama_grammar_init_impl( // Important: vec_rules has to be moved here, not copied, because stacks contains // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar // then the pointers would be invalidated when the local vec_rules goes out of scope. - return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} }; + return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, }; +} + +struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) { + llama_grammar_parser parser; + + // if there is a grammar, parse it + if (!parser.parse(grammar_str)) { + return nullptr; + } + + // will be empty (default) if there are parse errors + if (parser.rules.empty()) { + fprintf(stderr, "%s: failed to parse grammar\n", __func__); + return nullptr; + } + + // Ensure that there is a "root" node. 
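    // For reference, with a minimal grammar string such as the illustrative one below, the
    // parse above leaves a "root" entry in parser.symbol_ids, which is what this check verifies:
    //
    //    llama_grammar_parser parser;
    //    parser.parse("root ::= \"yes\" | \"no\"\n");       // fills parser.rules and parser.symbol_ids
    //    const uint32_t start = parser.symbol_ids.at("root");
    //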
+ if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) { + fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__); + return nullptr; + } + + std::vector grammar_rules(parser.c_rules()); + + const size_t n_rules = grammar_rules.size(); + const size_t start_rule_index = parser.symbol_ids.at(grammar_root); + + const llama_grammar_element * pos; + + // copy rule definitions into vectors + llama_grammar_rules vec_rules(n_rules); + for (size_t i = 0; i < n_rules; i++) { + for (pos = grammar_rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) { + vec_rules[i].push_back(*pos); + } + vec_rules[i].push_back({LLAMA_GRETYPE_END, 0}); + } + + // Check for left recursion + std::vector rules_visited(n_rules); + std::vector rules_in_progress(n_rules); + std::vector rules_may_be_empty(n_rules); + for (size_t i = 0; i < n_rules; i++) { + if (rules_visited[i]) { + continue; + } + if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) { + LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i); + return nullptr; + } + } + + // loop over alternates of start rule to build initial stacks + llama_grammar_stacks stacks; + pos = vec_rules[start_rule_index].data(); + do { + llama_grammar_stack stack; + if (!llama_grammar_is_end_of_sequence(pos)) { + // if alternate is nonempty, add to stack + stack.push_back(pos); + } + llama_grammar_advance_stack(vec_rules, stack, stacks); + while (!llama_grammar_is_end_of_sequence(pos)) { + // scan to end of alternate def + pos++; + } + if (pos->type == LLAMA_GRETYPE_ALT) { + // there's another alternate def of this rule to process + pos++; + } else { + break; + } + } while (true); + + // Important: vec_rules has to be moved here, not copied, because stacks contains + // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar + // then the pointers would be invalidated when the local vec_rules goes out of scope. 
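    //
    // Once constructed, the returned grammar is driven by the two functions defined further
    // below in this file (a sketch; `grammar` is the object returned here, `cur_p` and `id`
    // come from the caller's sampling loop):
    //
    //    llama_grammar_apply_impl (*grammar, &cur_p); // sets logit = -INFINITY for tokens the grammar rejects
    //    llama_grammar_accept_impl(*grammar, id);     // advances the pushdown stacks with the accepted token
    //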
+ return new llama_grammar { vocab, std::move(vec_rules), std::move(stacks), {}, }; } void llama_grammar_free_impl(struct llama_grammar * grammar) { + if (grammar == nullptr) { + return; + } + delete grammar; } -struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar) { - llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 }; +struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { + llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, }; // redirect elements in stacks to point to new rules for (size_t is = 0; is < result->stacks.size(); is++) { for (size_t ie = 0; ie < result->stacks[is].size(); ie++) { - for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) { - for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) { - if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) { + for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) { + for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) { + if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) { result->stacks[is][ie] = &result->rules[ir0][ir1]; } } @@ -464,14 +1069,11 @@ struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * gram return result; } -void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token_data_array * candidates) { - GGML_ASSERT(grammar); - GGML_ASSERT(vocab); - - int64_t t_start_sample_us = ggml_time_us(); +void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_data_array * cur_p) { + GGML_ASSERT(grammar.vocab != nullptr); bool allow_eog = false; - for (const auto & stack : grammar->stacks) { + for (const auto & stack : grammar.stacks) { if (stack.empty()) { allow_eog = true; break; @@ -479,40 +1081,38 @@ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struc } std::vector, llama_partial_utf8>> candidates_decoded; - candidates_decoded.reserve(candidates->size); + candidates_decoded.reserve(cur_p->size); llama_grammar_candidates candidates_grammar; - candidates_grammar.reserve(candidates->size); + candidates_grammar.reserve(cur_p->size); - for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - const std::string & piece = vocab->cache_token_to_piece.at(id); + for (size_t i = 0; i < cur_p->size; ++i) { + const llama_token id = cur_p->data[i].id; + const std::string & piece = grammar.vocab->cache_token_to_piece.at(id); - if (llama_token_is_eog_impl(*vocab, id)) { + if (llama_token_is_eog_impl(*grammar.vocab, id)) { if (!allow_eog) { - candidates->data[i].logit = -INFINITY; + cur_p->data[i].logit = -INFINITY; } } else if (piece.empty() || piece[0] == 0) { - candidates->data[i].logit = -INFINITY; + cur_p->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece, grammar.partial_utf8)); candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } - const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar); + const auto rejects = llama_grammar_reject_candidates(grammar.rules, grammar.stacks, candidates_grammar); for (const auto & reject : rejects) { - candidates->data[reject.index].logit = -INFINITY; + cur_p->data[reject.index].logit 
= -INFINITY; } - - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; } -void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struct llama_vocab * vocab, const struct llama_sampling * smpl, llama_token token) { - const int64_t t_start_sample_us = ggml_time_us(); +void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) { + GGML_ASSERT(grammar.vocab != nullptr); - if (llama_token_is_eog_impl(*vocab, token)) { - for (const auto & stack : grammar->stacks) { + if (llama_token_is_eog_impl(*grammar.vocab, token)) { + for (const auto & stack : grammar.stacks) { if (stack.empty()) { return; } @@ -520,20 +1120,19 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc GGML_ABORT("fatal error"); } - const std::string & piece = vocab->cache_token_to_piece.at(token); + const std::string & piece = grammar.vocab->cache_token_to_piece.at(token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(piece, grammar->partial_utf8); + const auto decoded = decode_utf8(piece, grammar.partial_utf8); const auto & code_points = decoded.first; - llama_grammar_stacks tmp_new_stacks; + llama_grammar_stacks stacks_new; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks); - grammar->stacks = tmp_new_stacks; + llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new); + grammar.stacks = std::move(stacks_new); } - grammar->partial_utf8 = decoded.second; - GGML_ASSERT(!grammar->stacks.empty()); - - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; + grammar.partial_utf8 = decoded.second; + GGML_ASSERT(!grammar.stacks.empty()); } diff --git a/src/llama-grammar.h b/src/llama-grammar.h index 695ea0632..f529ce351 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -2,11 +2,115 @@ #include "llama-impl.h" +#include + struct llama_vocab; -struct llama_sampling; + +// grammar element type +enum llama_gretype { + // end of rule definition + LLAMA_GRETYPE_END = 0, + + // start of alternate definition for rule + LLAMA_GRETYPE_ALT = 1, + + // non-terminal element: reference to rule + LLAMA_GRETYPE_RULE_REF = 2, + + // terminal element: character (code point) + LLAMA_GRETYPE_CHAR = 3, + + // inverse char(s) ([^a], [^a-b] [^abc]) + LLAMA_GRETYPE_CHAR_NOT = 4, + + // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to + // be an inclusive range ([a-z]) + LLAMA_GRETYPE_CHAR_RNG_UPPER = 5, + + // modifies a preceding LLAMA_GRETYPE_CHAR or + // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) + LLAMA_GRETYPE_CHAR_ALT = 6, + + // any character (.) 
+ LLAMA_GRETYPE_CHAR_ANY = 7, +}; + +typedef struct llama_grammar_element { + enum llama_gretype type; + uint32_t value; // Unicode code point or rule ID +} llama_grammar_element; + +struct llama_partial_utf8 { + uint32_t value; // bit value so far (unshifted) + int n_remain; // num bytes remaining; -1 indicates invalid sequence +}; + +struct llama_grammar_candidate { + size_t index; + const uint32_t * code_points; + llama_partial_utf8 partial_utf8; +}; + +using llama_grammar_rule = std::vector< llama_grammar_element>; +using llama_grammar_stack = std::vector; + +using llama_grammar_rules = std::vector; +using llama_grammar_stacks = std::vector; +using llama_grammar_candidates = std::vector; + +const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar); + llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar); + +// takes a set of possible pushdown stacks on a grammar, which are required to +// be positioned at a character range (see `llama_grammar_advance_stack`), and +// produces the N possible stacks if the given char is accepted at those +// positions +void llama_grammar_accept( + const llama_grammar_rules & rules, + const llama_grammar_stacks & stacks, + uint32_t chr, + llama_grammar_stacks & stacks_new); + +std::vector llama_grammar_reject_candidates_for_stack( + const llama_grammar_rules & rules, + const llama_grammar_stack & stack, + const llama_grammar_candidates & candidates); + +struct llama_grammar_parser { + std::map symbol_ids; + + llama_grammar_rules rules; + + llama_grammar_stack c_rules() const; + + uint32_t get_symbol_id(const char * src, size_t len); + uint32_t generate_symbol_id(const std::string & base_name); + + void add_rule(uint32_t rule_id, const llama_grammar_rule & rule); + + const char * parse_alternates( + const char * src, + const std::string & rule_name, + uint32_t rule_id, + bool is_nested); + + const char * parse_sequence( + const char * src, + const std::string & rule_name, + llama_grammar_rule & rule, + bool is_nested); + + const char * parse_rule(const char * src); + + bool parse(const char * src); + void print(FILE * file); +}; struct llama_grammar { - const llama_grammar_rules rules; + // note: allow null vocab for testing (not great) + const llama_vocab * vocab; + + const llama_grammar_rules rules; // TODO: shared ptr llama_grammar_stacks stacks; // buffer for partially generated UTF-8 sequence from accepted tokens @@ -17,23 +121,24 @@ struct llama_grammar { // internal API // +// note: needed for tests (not great) struct llama_grammar * llama_grammar_init_impl( - const llama_grammar_element ** rules, - size_t n_rules, - size_t start_rule_index); + const struct llama_vocab * vocab, + const llama_grammar_element ** rules, + size_t n_rules, + size_t start_rule_index); + +struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root); void llama_grammar_free_impl(struct llama_grammar * grammar); -struct llama_grammar * llama_grammar_copy_impl(const struct llama_grammar * grammar); +struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar); -void llama_grammar_sample_impl( - const struct llama_grammar * grammar, - const struct llama_vocab * vocab, - const struct llama_sampling * smpl, - llama_token_data_array * candidates); +// TODO: move the API below as member functions of llama_grammar +void llama_grammar_apply_impl( + const struct llama_grammar & grammar, + llama_token_data_array * cur_p); -void 
llama_grammar_accept_token_impl( - struct llama_grammar * grammar, - const struct llama_vocab * vocab, - const struct llama_sampling * smpl, +void llama_grammar_accept_impl( + struct llama_grammar & grammar, llama_token token); diff --git a/src/llama-impl.h b/src/llama-impl.h index 952774096..87012617f 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -1,8 +1,11 @@ #pragma once -#define LLAMA_API_INTERNAL #include "llama.h" +#include +#include +#include + #ifdef __GNUC__ #ifdef __MINGW32__ #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) @@ -29,6 +32,20 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void * // helpers // +struct time_meas { + time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} + + ~time_meas() { + if (t_start_us >= 0) { + t_acc += ggml_time_us() - t_start_us; + } + } + + const int64_t t_start_us; + + int64_t & t_acc; +}; + static void replace_all(std::string & s, const std::string & search, const std::string & replace) { if (search.empty()) { return; @@ -45,3 +62,117 @@ static void replace_all(std::string & s, const std::string & search, const std:: builder.append(s, last_pos, std::string::npos); s = std::move(builder); } + +const std::vector> & llama_internal_get_tensor_map( + struct llama_context * ctx +); + +// the ring buffer works similarly to std::deque, but with a fixed capacity +template +struct ring_buffer { + ring_buffer(size_t cap) : capacity(cap), data(cap) {} + + T & front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + const T & front() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[first]; + } + + T & back() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[pos]; + } + + const T & back() const { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + return data[pos]; + } + + void push_back(const T & value) { + if (capacity == 0) { + throw std::runtime_error("ring buffer: capacity is zero"); + } + + if (sz == capacity) { + // advance the start when buffer is full + first = (first + 1) % capacity; + } else { + sz++; + } + data[pos] = value; + pos = (pos + 1) % capacity; + } + + T pop_front() { + if (sz == 0) { + throw std::runtime_error("ring buffer is empty"); + } + T value = data[first]; + first = (first + 1) % capacity; + sz--; + return value; + } + + //T & operator[](size_t i) { + // if (i >= sz) { + // throw std::runtime_error("ring buffer: index out of bounds"); + // } + // return data[(first + i) % capacity]; + //} + + //const T & at(size_t i) const { + // if (i >= sz) { + // throw std::runtime_error("ring buffer: index out of bounds"); + // } + // return data[(first + i) % capacity]; + //} + + const T & rat(size_t i) const { + if (i >= sz) { + throw std::runtime_error("ring buffer: index out of bounds"); + } + return data[(first + sz - i - 1) % capacity]; + } + + std::vector to_vector() const { + std::vector result; + result.reserve(sz); + for (size_t i = 0; i < sz; i++) { + result.push_back(data[(first + i) % capacity]); + } + return result; + } + + void clear() { + // here only reset the status of the buffer + sz = 0; + first = 0; + pos = 0; + } + + bool empty() const { + return sz == 0; + } + + size_t size() const { + return sz; + } + + size_t capacity = 0; + size_t sz = 0; + size_t first = 0; + size_t pos = 0; + std::vector data; +}; diff --git 
a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8f4841d9d..6f448b80c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1,12 +1,51 @@ #include "llama-sampling.h" +#include "llama-vocab.h" +#include "llama-grammar.h" + +#include #include #include #include #include +#include #include +#include #include +static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { + // iterator for the probabilities +#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif + + struct probs_iterator { + typedef std::input_iterator_tag iterator_category; + typedef float value_type; + typedef float * pointer; + typedef float & reference; + typedef ptrdiff_t difference_type; + + const llama_token_data * data; + + bool operator==(const probs_iterator & other) const { return data == other.data; } + bool operator!=(const probs_iterator & other) const { return data != other.data; } + const float & operator*() const { return data->p; } + probs_iterator & operator++() { ++data; return *this; } + probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; } + }; + +#ifdef __GNUC__ + #pragma GCC diagnostic pop +#endif + + std::discrete_distribution dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size}); + + return dist(rng); +} + +/* static void llama_log_softmax(float * array, size_t size) { float max_l = *std::max_element(array, array + size); float sum = 0.f; @@ -20,66 +59,52 @@ static void llama_log_softmax(float * array, size_t size) { array[i] = logf(array[i] / sum); } } +*/ -void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - seed = time(NULL); - } - - smpl->rng.seed(seed); -} - -void llama_sample_softmax_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) { - GGML_ASSERT(candidates->size > 0); - - const int64_t t_start_sample_us = ggml_time_us(); +static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { + GGML_ASSERT(cur_p->size > 0); // Sort the logits in descending order - if (!candidates->sorted) { - std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + if (!cur_p->sorted) { + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }); - candidates->sorted = true; + cur_p->sorted = true; } - float max_l = candidates->data[0].logit; + float max_l = cur_p->data[0].logit; float cum_sum = 0.0f; - for (size_t i = 0; i < candidates->size; ++i) { - float p = expf(candidates->data[i].logit - max_l); - candidates->data[i].p = p; + + for (size_t i = 0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; cum_sum += p; } - for (size_t i = 0; i < candidates->size; ++i) { - candidates->data[i].p /= cum_sum; - } - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum; } } -void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep) { +static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) { // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast - // if (k >= (int32_t)candidates->size) { + // if (k >= (int32_t)cur_p->size) { // return; // } - const int64_t 
t_start_sample_us = ggml_time_us(); - if (k <= 0) { - k = candidates->size; + k = cur_p->size; } - k = std::max(k, (int) min_keep); - k = std::min(k, (int) candidates->size); + k = std::min(k, (int) cur_p->size); // Sort scores in descending order - if (!candidates->sorted) { + if (!cur_p->sorted) { auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; if (k <= 128) { - std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp); + std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp); } else { constexpr int nbuckets = 128; constexpr float bucket_low = -10.0f; @@ -87,11 +112,11 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); constexpr float bucket_inter = -bucket_low * bucket_scale; - std::vector bucket_idx(candidates->size); + std::vector bucket_idx(cur_p->size); std::vector histo(nbuckets, 0); - for (int i = 0; i < (int)candidates->size; ++i) { - const float val = candidates->data[i].logit; + for (int i = 0; i < (int)cur_p->size; ++i) { + const float val = cur_p->data[i].logit; int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); ib = std::max(0, std::min(nbuckets-1, ib)); bucket_idx[i] = ib; @@ -101,20 +126,22 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra int ib = nbuckets - 1; for ( ; ib >= 0; --ib) { nhave += histo[ib]; - if (nhave >= k) break; + if (nhave >= k) { + break; + } } std::vector tmp_tokens(nhave); - auto ptr = tmp_tokens.data(); + auto * ptr = tmp_tokens.data(); std::vector bucket_ptrs; bucket_ptrs.reserve(nbuckets - ib); for (int j = nbuckets - 1; j >= ib; --j) { bucket_ptrs.push_back(ptr); ptr += histo[j]; } - for (int i = 0; i < (int)candidates->size; ++i) { + for (int i = 0; i < (int)cur_p->size; ++i) { int j = bucket_idx[i]; if (j >= ib) { - *bucket_ptrs[nbuckets-1-j]++ = candidates->data[i]; + *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i]; } } @@ -127,125 +154,551 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra } std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp); - std::memcpy(candidates->data, tmp_tokens.data(), k*sizeof(llama_token_data)); + std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data)); } - candidates->sorted = true; + cur_p->sorted = true; } - candidates->size = k; + cur_p->size = k; +} - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; +// llama_sampler API + +const char * llama_sampler_name(const struct llama_sampler * smpl) { + if (!smpl->iface) { + return "(null)"; + } + + return smpl->iface->name(smpl); +} + +void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) { + if (smpl->iface->accept) { + smpl->iface->accept(smpl, token); } } -void llama_sample_top_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) { - if (p >= 1.0f) { +void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) { + GGML_ASSERT(smpl->iface->apply); + smpl->iface->apply(smpl, cur_p); +} + +void llama_sampler_reset(struct llama_sampler * smpl) { + if (smpl->iface->reset) { + smpl->iface->reset(smpl); + } +} + +struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { + if (smpl->iface->clone) { + return smpl->iface->clone(smpl); + } + + if (smpl->ctx == 
nullptr) { + return new llama_sampler { + /* .iface = */ smpl->iface, + /* .ctx = */ nullptr, + }; + } + + GGML_ABORT("the sampler does not support cloning"); +} + +void llama_sampler_free(struct llama_sampler * smpl) { + if (smpl == nullptr) { return; } - llama_sample_softmax_impl(smpl, candidates); + if (smpl->iface->free) { + smpl->iface->free(smpl); + } - const int64_t t_start_sample_us = ggml_time_us(); + delete smpl; +} + +llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { + const auto * logits = llama_get_logits_ith(ctx, idx); + + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + // TODO: do not allocate each time + std::vector cur(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(smpl, &cur_p); + + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; +} + +// sampler chain + +static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { + return "chain"; +} + +static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_accept(smpl, token); + } + + chain->n_sample++; +} + +static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_apply(smpl, cur_p); + } +} + +static void llama_sampler_chain_reset(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_reset(smpl); + } + + chain->t_sample_us = 0; + chain->n_sample = 0; +} + +static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { + const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; + + auto * result = llama_sampler_chain_init(chain_src->params); + + for (auto * smpl : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + } + + return result; +} + +static void llama_sampler_chain_free(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_free(smpl); + } + + delete chain; +} + +static struct llama_sampler_i llama_sampler_chain_i = { + /* .name = */ llama_sampler_chain_name, + /* .accept = */ llama_sampler_chain_accept, + /* .apply = */ llama_sampler_chain_apply, + /* .reset = */ llama_sampler_chain_reset, + /* .clone = */ llama_sampler_chain_clone, + /* .free = */ llama_sampler_chain_free, +}; + +struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { + return new llama_sampler { + /* .iface = */ &llama_sampler_chain_i, + /* .ctx = */ new llama_sampler_chain { + /* .params = */ params, + /* .samplers = */ {}, + /* .t_sample_us = */ 0, + /* .n_sample = */ 0, + }, + }; +} + +void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { + 
auto * p = (llama_sampler_chain *) chain->ctx; + p->samplers.push_back(smpl); +} + +struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { + const auto * p = (const llama_sampler_chain *) chain->ctx; + + if (i < 0 || i >= (int32_t) p->samplers.size()) { + return nullptr; + } + + return p->samplers[i]; +} + +int llama_sampler_chain_n(const struct llama_sampler * chain) { + const auto * p = (const llama_sampler_chain *) chain->ctx; + + return p->samplers.size(); +} + +// +// samplers +// + +// greedy + +static const char * llama_sampler_greedy_name(const struct llama_sampler * /*smpl*/) { + return "greedy"; +} + +static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { + cur_p->selected = 0; + for (size_t i = 1; i < cur_p->size; ++i) { + if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) { + cur_p->selected = i; + } + } +} + +static struct llama_sampler_i llama_sampler_greedy_i = { + /* .name = */ llama_sampler_greedy_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_greedy_apply, + /* .reset = */ nullptr, + /* .clone = */ nullptr, + /* .free = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_greedy() { + return new llama_sampler { + /* .iface = */ &llama_sampler_greedy_i, + /* .ctx = */ nullptr, + }; +} + +// dist + +struct llama_sampler_dist { + const uint32_t seed; + + std::mt19937 rng; +}; + +static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) { + return "dist"; +} + +static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dist *) smpl->ctx; + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); +} + +static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_dist *) smpl->ctx; + auto * result = llama_sampler_init_dist(ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_dist *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_dist_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dist *) smpl->ctx; + ctx->rng = std::mt19937(ctx->seed); +} + +static void llama_sampler_dist_free(struct llama_sampler * smpl) { + delete (llama_sampler_dist *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_dist_i = { + /* .name = */ llama_sampler_dist_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_dist_apply, + /* .reset = */ llama_sampler_dist_reset, + /* .clone = */ llama_sampler_dist_clone, + /* .free = */ llama_sampler_dist_free, +}; + +struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { + return new llama_sampler { + /* .iface = */ &llama_sampler_dist_i, + /* .ctx = */ new llama_sampler_dist { + /* .seed = */ seed, + /* .rng = */ std::mt19937(seed), + }, + }; +} + +// softmax + +static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) { + return "softmax"; +} + +static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { + llama_sampler_softmax_impl(cur_p); +} + +static struct llama_sampler_i llama_sampler_softmax_i = { + /* .name = */ llama_sampler_softmax_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_softmax_apply, + /* .reset = */ nullptr, + /* .clone = */ nullptr, + /* .free = */ nullptr, +}; + +struct llama_sampler * llama_sampler_init_softmax() { + return new 
llama_sampler { + /* .iface = */ &llama_sampler_softmax_i, + /* .ctx = */ nullptr, + }; +} + +// top-k + +struct llama_sampler_top_k { + const int32_t k; +}; + +static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) { + return "top-k"; +} + +static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_top_k *) smpl->ctx; + llama_sampler_top_k_impl(cur_p, ctx->k); +} + +static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_k *) smpl->ctx; + return llama_sampler_init_top_k(ctx->k); +} + +static void llama_sampler_top_k_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_k *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_top_k_i = { + /* .name = */ llama_sampler_top_k_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_top_k_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_top_k_clone, + /* .free = */ llama_sampler_top_k_free, +}; + +struct llama_sampler * llama_sampler_init_top_k(int32_t k) { + return new llama_sampler { + /* .iface = */ &llama_sampler_top_k_i, + /* .ctx = */ new llama_sampler_top_k { + /* .k = */ k, + }, + }; +} + +// top-p + +struct llama_sampler_top_p { + const float p; + const size_t min_keep; +}; + +static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) { + return "top-p"; +} + +static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_top_p *) smpl->ctx; + + if (ctx->p >= 1.0f) { + return; + } + + llama_sampler_softmax_impl(cur_p); // Compute the cumulative probabilities float cum_sum = 0.0f; - size_t last_idx = candidates->size; + size_t last_idx = cur_p->size; - for (size_t i = 0; i < candidates->size; ++i) { - cum_sum += candidates->data[i].p; + for (size_t i = 0; i < cur_p->size; ++i) { + cum_sum += cur_p->data[i].p; // Check if the running sum is at least p or if we have kept at least min_keep tokens // we set the last index to i+1 to indicate that the current iterate should be included in the set - if (cum_sum >= p && i + 1 >= min_keep) { + if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) { last_idx = i + 1; break; } } // Resize the output vector to keep only the top-p tokens - candidates->size = last_idx; - - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - } + cur_p->size = last_idx; } -void llama_sample_min_p_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) { - if (p <= 0.0f || !candidates->size) { +static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_p *) smpl->ctx; + return llama_sampler_init_top_p(ctx->p, ctx->min_keep); +} + +static void llama_sampler_top_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_p *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_top_p_i = { + /* .name = */ llama_sampler_top_p_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_top_p_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_top_p_clone, + /* .free = */ llama_sampler_top_p_free, +}; + +struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) { + return new llama_sampler { + /* .iface = */ &llama_sampler_top_p_i, + /* .ctx = */ new llama_sampler_top_p { + /* .p = */ p, + /* .min_keep = */ min_keep, + 
}, + }; +} + +// min-p + +struct llama_sampler_min_p { + const float p; + const size_t min_keep; +}; + +static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) { + return "min-p"; +} + +static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_min_p *) smpl->ctx; + + if (ctx->p <= 0.0f || !cur_p->size) { return; } - const int64_t t_start_sample_us = ggml_time_us(); - bool min_p_applied = false; - // if the candidates aren't sorted, try the unsorted implementation first - if (!candidates->sorted) { + // if the cur_p aren't sorted, try the unsorted implementation first + if (!cur_p->sorted) { std::vector filtered_tokens; float max_logit = -FLT_MAX; - for (size_t i = 0; i < candidates->size; ++i) { - max_logit = std::max(max_logit, candidates->data[i].logit); + for (size_t i = 0; i < cur_p->size; ++i) { + max_logit = std::max(max_logit, cur_p->data[i].logit); } - const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max + const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max - for (size_t i = 0; i < candidates->size; ++i) { - if (candidates->data[i].logit >= min_logit) { - filtered_tokens.push_back(candidates->data[i]); + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit >= min_logit) { + filtered_tokens.push_back(cur_p->data[i]); } } // if we have enough values the operation was a success - if (filtered_tokens.size() >= min_keep) { - memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); - candidates->size = filtered_tokens.size(); + if (filtered_tokens.size() >= ctx->min_keep) { + memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); + cur_p->size = filtered_tokens.size(); min_p_applied = true; } } - // if the candidates are sorted or the unsorted implementation failed, use this implementation + // if the cur_p are sorted or the unsorted implementation failed, use this implementation if (!min_p_applied) { // Sort the logits in descending order - if (!candidates->sorted) { - std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + if (!cur_p->sorted) { + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }); - candidates->sorted = true; + cur_p->sorted = true; } - const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max + const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max size_t i = 1; // first token always matches - for (; i < candidates->size; ++i) { - if (candidates->data[i].logit < min_logit && i >= min_keep) { + for (; i < cur_p->size; ++i) { + if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) { break; // prob too small } } // Resize the output vector to keep only the matching tokens - candidates->size = i; - } - - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; + cur_p->size = i; } } -void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep) { - if (z >= 1.0f || candidates->size <= 2) { +static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_min_p *) smpl->ctx; + return llama_sampler_init_min_p(ctx->p, 
ctx->min_keep); +} + +static void llama_sampler_min_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_min_p *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_min_p_i = { + /* .name = */ llama_sampler_min_p_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_min_p_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_min_p_clone, + /* .free = */ llama_sampler_min_p_free, +}; + +struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) { + return new llama_sampler { + /* .iface = */ &llama_sampler_min_p_i, + /* .ctx = */ new llama_sampler_min_p { + /* .p = */ p, + /* .min_keep = */ min_keep, + }, + }; +} + +// tail-free + +struct llama_sampler_tail_free { + const float z; + const size_t min_keep; +}; + +static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) { + return "tail-free"; +} + +static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_tail_free *) smpl->ctx; + + if (ctx->z >= 1.0f || cur_p->size <= 2) { return; } - llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates); - const int64_t t_start_sample_us = ggml_time_us(); + llama_sampler_softmax_impl(cur_p); // Compute the first and second derivatives - std::vector first_derivatives(candidates->size - 1); - std::vector second_derivatives(candidates->size - 2); + std::vector first_derivatives(cur_p->size - 1); + std::vector second_derivatives(cur_p->size - 2); for (size_t i = 0; i < first_derivatives.size(); ++i) { - first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p; + first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p; } for (size_t i = 0; i < second_derivatives.size(); ++i) { second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; @@ -272,51 +725,86 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_ } float cum_sum = 0.0f; - size_t last_idx = candidates->size; + size_t last_idx = cur_p->size; for (size_t i = 0; i < second_derivatives.size(); ++i) { cum_sum += second_derivatives[i]; // Check if the running sum is greater than z or if we have kept at least min_keep tokens - if (cum_sum > z && i >= min_keep) { + if (cum_sum > ctx->z && i >= ctx->min_keep) { last_idx = i; break; } } // Resize the output vector to keep only the tokens above the tail location - candidates->size = last_idx; - - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - } + cur_p->size = last_idx; } -void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep) { +static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx; + return llama_sampler_init_tail_free(ctx->z, ctx->min_keep); +} + +static void llama_sampler_tail_free_free(struct llama_sampler * smpl) { + delete (llama_sampler_tail_free *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_tail_free_i = { + /* .name = */ llama_sampler_tail_free_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_tail_free_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_tail_free_clone, + /* .free = */ llama_sampler_tail_free_free, +}; + +struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) { + return new llama_sampler { + /* .iface = */ &llama_sampler_tail_free_i, + /* .ctx = */ new 
llama_sampler_tail_free { + /* .z = */ z, + /*. min_keep = */ min_keep, + }, + }; +} + +// typical + +struct llama_sampler_typical { + const float p; + const size_t min_keep; +}; + +static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) { + return "typical"; +} + +static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_typical *) smpl->ctx; + // Reference implementation: // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr - if (p >= 1.0f) { + if (ctx->p >= 1.0f) { return; } // Compute the softmax of logits and calculate entropy - llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates); - - const int64_t t_start_sample_us = ggml_time_us(); + llama_sampler_softmax_impl(cur_p); float entropy = 0.0f; - for (size_t i = 0; i < candidates->size; ++i) { - entropy += -candidates->data[i].p * logf(candidates->data[i].p); + for (size_t i = 0; i < cur_p->size; ++i) { + entropy += -cur_p->data[i].p * logf(cur_p->data[i].p); } // Compute the absolute difference between negative log probability and entropy for each candidate std::vector shifted_scores; - for (size_t i = 0; i < candidates->size; ++i) { - float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy); + for (size_t i = 0; i < cur_p->size; ++i) { + float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy); shifted_scores.push_back(shifted_score); } // Sort tokens based on the shifted_scores and their corresponding indices - std::vector indices(candidates->size); + std::vector indices(cur_p->size); std::iota(indices.begin(), indices.end(), 0); std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { @@ -329,134 +817,610 @@ void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_ar for (size_t i = 0; i < indices.size(); ++i) { size_t idx = indices[i]; - cum_sum += candidates->data[idx].p; + cum_sum += cur_p->data[idx].p; // Check if the running sum is greater than typical or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep - 1) { + if (cum_sum > ctx->p && i >= ctx->min_keep - 1) { last_idx = i + 1; break; } } // Resize the output vector to keep only the locally typical tokens - std::vector new_candidates; + std::vector cur_p_new; for (size_t i = 0; i < last_idx; ++i) { size_t idx = indices[i]; - new_candidates.push_back(candidates->data[idx]); + cur_p_new.push_back(cur_p->data[idx]); } - // Replace the data in candidates with the new_candidates data - std::copy(new_candidates.begin(), new_candidates.end(), candidates->data); - candidates->size = new_candidates.size(); - candidates->sorted = false; + // Replace the data in cur_p with the cur_p_new data + std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data); + cur_p->size = cur_p_new.size(); + cur_p->sorted = false; +} - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; +static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_typical *) smpl->ctx; + return llama_sampler_init_typical(ctx->p, ctx->min_keep); +} + +static void llama_sampler_typical_free(struct llama_sampler * smpl) { + delete (llama_sampler_typical *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_typical_i = { + /* .name = */ llama_sampler_typical_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_typical_apply, + /* .reset = */ nullptr, + /* .clone = 
*/ llama_sampler_typical_clone, + /* .free = */ llama_sampler_typical_free, +}; + +struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) { + return new llama_sampler { + /* .iface = */ &llama_sampler_typical_i, + /* .ctx = */ new llama_sampler_typical { + /* .p = */ p, + /* .min_keep = */ min_keep, + }, + }; +} + +// temp + +struct llama_sampler_temp { + const float temp; +}; + +static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*/) { + return "temp"; +} + +static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_temp *) smpl->ctx; + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= ctx->temp; } } -void llama_sample_entropy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val) { - const int64_t t_start_sample_us = ggml_time_us(); +static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_temp *) smpl->ctx; + return llama_sampler_init_temp(ctx->temp); +} - // no need to do anything if there is only one (or zero) candidates - if(candidates->size <= 1) { +static void llama_sampler_temp_free(struct llama_sampler * smpl) { + delete (llama_sampler_temp *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_temp_i = { + /* .name = */ llama_sampler_temp_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_temp_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_temp_clone, + /* .free = */ llama_sampler_temp_free, +}; + +struct llama_sampler * llama_sampler_init_temp(float temp) { + return new llama_sampler { + /* .iface = */ &llama_sampler_temp_i, + /* .ctx = */ new llama_sampler_temp { + /*.temp = */ temp, + }, + }; +} + +// temp-ext + +struct llama_sampler_temp_ext { + const float temp; + const float delta; + const float exponent; +}; + +static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) { + return "temp-ext"; +} + +static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx; + if (ctx->delta > 0) { + const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); + const float max_temp = ctx->temp + ctx->delta; + float exponent_val = ctx->exponent; + + // no need to do anything if there is only one (or zero) candidates + if (cur_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / cur_p->size); + + llama_sampler_softmax_impl(cur_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float prob = cur_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + + #ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", 
exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); + #endif + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + const double max_l_double = cur_p->data[0].logit; + + double cum_sum_double = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + double p = exp(cur_p->data[i].logit - max_l_double); + cur_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + + #ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < cur_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f); + } + #endif + } else { + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= ctx->temp; + } + } +} + +static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx; + return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent); +} + +static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) { + delete (llama_sampler_temp_ext *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_temp_ext_i = { + /* .name = */ llama_sampler_temp_ext_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_temp_ext_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_temp_ext_clone, + /* .free = */ llama_sampler_temp_ext_free, +}; + +struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) { + return new llama_sampler { + /* .iface = */ &llama_sampler_temp_ext_i, + /* .ctx = */ new llama_sampler_temp_ext { + /* .temp = */ temp, + /* .delta = */ delta, + /* .exponent = */ exponent, + }, + }; +} + +// mirostat + +struct llama_sampler_mirostat { + const int32_t n_vocab; + + const uint32_t seed; + + const float tau; + const float eta; + + const int32_t m; + + float mu; + + std::mt19937 rng; +}; + +static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { + return "mirostat"; +} + +static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_mirostat *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + + // Estimate s_hat using the most probable m tokens + float s_hat = 0.0; + float sum_ti_bi = 0.0; + float sum_ti_sq = 0.0; + for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) { + float t_i = logf(float(i + 2) / float(i + 1)); + float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p); + sum_ti_bi += t_i * b_i; + sum_ti_sq += t_i * t_i; + } + s_hat = sum_ti_bi / sum_ti_sq; + + // Compute k from the estimated s_hat and target surprise value + float epsilon_hat = s_hat - 1; + float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat); + + llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); + llama_sampler_softmax_impl(cur_p); + + const int idx = llama_sample_dist(cur_p, ctx->rng); + + cur_p->selected = idx; + + float observed_surprise = -log2f(cur_p->data[idx].p); + float e = observed_surprise - ctx->tau; + + // Update mu using the 
learning rate and error + ctx->mu = ctx->mu - ctx->eta * e; +} + +static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx; + auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m); + + // copy the state + { + auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx; + + result_ctx->mu = ctx->mu; + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_mirostat *) smpl->ctx; + ctx->mu = 2.0f*ctx->tau; + ctx->rng = std::mt19937(ctx->seed); +} + +static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { + delete (llama_sampler_mirostat *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_mirostat_i = { + /* .name = */ llama_sampler_mirostat_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_mirostat_apply, + /* .reset = */ llama_sampler_mirostat_reset, + /* .clone = */ llama_sampler_mirostat_clone, + /* .free = */ llama_sampler_mirostat_free, +}; + +struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { + return new llama_sampler { + /* .iface = */ &llama_sampler_mirostat_i, + /* .ctx = */ new llama_sampler_mirostat { + /* .n_vocab = */ n_vocab, + /* .seed = */ seed, + /* .tau = */ tau, + /* .eta = */ eta, + /* .m = */ m, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed), + }, + }; +} + +// mirostat v2 + +struct llama_sampler_mirostat_v2 { + const uint32_t seed; + + const float tau; + const float eta; + + float mu; + + std::mt19937 rng; +}; + +static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { + return "mirostat-v2"; +} + +static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + + // Truncate the words with surprise values greater than mu + cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) { + return -log2f(candidate.p) > ctx->mu; + })); + + if (cur_p->size == 0) { + cur_p->size = 1; + } + + // Normalize the probabilities of the remaining words + llama_sampler_softmax_impl(cur_p); + + const int idx = llama_sample_dist(cur_p, ctx->rng); + + cur_p->selected = idx; + + float observed_surprise = -log2f(cur_p->data[idx].p); + float e = observed_surprise - ctx->tau; + + // Update mu using the learning rate and error + ctx->mu = ctx->mu - ctx->eta * e; +} + +static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; + ctx->mu = 2.0f*ctx->tau; + ctx->rng = std::mt19937(ctx->seed); +} + +static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx; + + auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta); + + // copy the state + { + auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx; + + result_ctx->mu = ctx->mu; + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) { + delete (llama_sampler_mirostat_v2 *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_mirostat_v2_i = { + /* .name = */ 
llama_sampler_mirostat_v2_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_mirostat_v2_apply, + /* .reset = */ llama_sampler_mirostat_v2_reset, + /* .clone = */ llama_sampler_mirostat_v2_clone, + /* .free = */ llama_sampler_mirostat_v2_free, +}; + +struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { + return new llama_sampler { + /* .iface = */ &llama_sampler_mirostat_v2_i, + /* .ctx = */ new llama_sampler_mirostat_v2 { + /* .seed = */ seed, + /* .tau = */ tau, + /* .eta = */ eta, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed), + }, + }; +} + +// grammar + +struct llama_sampler_grammar { + const struct llama_vocab * vocab; + + std::string grammar_str; + std::string grammar_root; + + struct llama_grammar * grammar; +}; + +static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) { + return "grammar"; +} + +static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (ctx->grammar) { + llama_grammar_accept_impl(*ctx->grammar, token); + } +} + +static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (ctx->grammar) { + llama_grammar_apply_impl(*ctx->grammar, cur_p); + } +} + +static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (!ctx->grammar) { return; } - // Calculate maximum possible entropy - float max_entropy = -logf(1.0f / candidates->size); + auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str()); - llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates); + llama_grammar_free_impl(ctx->grammar); + ctx->grammar = grammar_new; +} - // Calculate entropy of the softmax probabilities - float entropy = 0.0f; - for (size_t i = 0; i < candidates->size; ++i) { - float prob = candidates->data[i].p; - if (prob > 0.0f) { // Ensure no log(0) - entropy -= prob * logf(prob); +static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; + + auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr); + + // copy the state + { + auto * result_ctx = (llama_sampler_grammar *) result->ctx; + + if (ctx->grammar) { + result_ctx->grammar_str = ctx->grammar_str; + result_ctx->grammar_root = ctx->grammar_root; + + result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar); } } - // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates->size != 1 above) - float normalized_entropy = entropy / max_entropy; - - // Map the normalized entropy to the desired temperature range using the power function - float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); - -#ifdef DEBUG - LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); - LLAMA_LOG_INFO("Entropy: %f\n", entropy); - LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); - LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); - LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); - LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); -#endif - - // Apply the dynamically calculated temperature scaling - for (size_t i = 0; i < candidates->size; ++i) { - candidates->data[i].logit /= dyn_temp; - } - - // Re-compute 
softmax probabilities after scaling logits with dynamic temperature - double max_l_double = candidates->data[0].logit; - double cum_sum_double = 0.0; - for (size_t i = 0; i < candidates->size; ++i) { - double p = exp(candidates->data[i].logit - max_l_double); - candidates->data[i].p = p; // Store the scaled probability - cum_sum_double += p; - } - for (size_t i = 0; i < candidates->size; ++i) { - candidates->data[i].p /= cum_sum_double; // Re-normalize the probabilities - } - -#ifdef DEBUG - // Print the updated top 25 probabilities after temperature scaling - LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); - for (size_t i = 0; i < 25 && i < candidates->size; ++i) { - LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates->data[i].p * 100.0f); - } -#endif - - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - } + return result; } -void llama_sample_temp_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float temp) { - const int64_t t_start_sample_us = ggml_time_us(); +static void llama_sampler_grammar_free(struct llama_sampler * smpl) { + const auto * ctx = (llama_sampler_grammar *) smpl->ctx; - for (size_t i = 0; i < candidates->size; ++i) { - candidates->data[i].logit /= temp; + if (ctx->grammar) { + llama_grammar_free_impl(ctx->grammar); } - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - } + delete ctx; } -void llama_sample_repetition_penalties_impl( - struct llama_sampling * smpl, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t penalty_last_n, - float penalty_repeat, - float penalty_freq, - float penalty_present) { - if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) { +static struct llama_sampler_i llama_sampler_grammar_i = { + /* .name = */ llama_sampler_grammar_name, + /* .accept = */ llama_sampler_grammar_accept_impl, + /* .apply = */ llama_sampler_grammar_apply, + /* .reset = */ llama_sampler_grammar_reset, + /* .clone = */ llama_sampler_grammar_clone, + /* .free = */ llama_sampler_grammar_free, +}; + +struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) { + auto * ctx = new llama_sampler_grammar; + + if (grammar_str != nullptr && grammar_str[0] != '\0') { + *ctx = { + /* .vocab = */ &vocab, + /* .grammar_str = */ grammar_str, + /* .grammar_root = */ grammar_root, + /* .grammar = */ llama_grammar_init_impl(&vocab, grammar_str, grammar_root), + }; + } else { + *ctx = { + /* .vocab = */ &vocab, + /* .grammar_str = */ {}, + /* .grammar_root = */ {}, + /* .grammar = */ nullptr, + }; + } + + return new llama_sampler { + /* .iface = */ &llama_sampler_grammar_i, + /* .ctx = */ ctx, + }; +} + +// penalties + +struct llama_sampler_penalties { + const int32_t n_vocab; + const llama_token special_eos_id; + const llama_token linefeed_id; + + const int32_t penalty_last_n; + const float penalty_repeat; + const float penalty_freq; + const float penalty_present; + + const bool penalize_nl; + const bool ignore_eos; + + ring_buffer prev; +}; + +static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) { + return "penalties"; +} + +static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + if (ctx->penalty_last_n == 0) { return; } - const int64_t t_start_sample_us = ggml_time_us(); + 
ctx->prev.push_back(token); +} - // Create a frequency map to count occurrences of each token in last_tokens - std::unordered_map token_count; - for (size_t i = 0; i < penalty_last_n; ++i) { - token_count[last_tokens[i]]++; +static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + + if (ctx->ignore_eos) { + assert(ctx->special_eos_id >= 0); + + // optimistically check if the candidates are not yet sorted/shuffled/truncated + if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) { + cur_p->data[ctx->special_eos_id].logit = -INFINITY; + } else { + // else, search for the special EOS token + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].id == ctx->special_eos_id) { + cur_p->data[i].logit = -INFINITY; + break; + } + } + } } - // Apply frequency and presence penalties to the candidates - for (size_t i = 0; i < candidates->size; ++i) { - const auto token_iter = token_count.find(candidates->data[i].id); + if ((ctx->penalty_last_n == 0) || + (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) { + return; + } + + bool nl_found = false; + size_t nl_idx = 0; + float nl_logit = -INFINITY; + if (!ctx->penalize_nl) { + assert(ctx->linefeed_id >= 0); + + // optimistically check if the candidates are not yet sorted/shuffled/truncated + if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) { + nl_found = true; + nl_idx = ctx->linefeed_id; + nl_logit = cur_p->data[ctx->linefeed_id].logit; + } else { + // else, search for the linefeed token + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].id == ctx->linefeed_id) { + nl_found = true; + nl_idx = i; + nl_logit = cur_p->data[i].logit; + break; + } + } + } + } + + // Create a frequency map to count occurrences of each token in last_tokens + // TODO: optimize this by maintaining the token count in the sampler context + using llama_token_cnt = std::unordered_map; + llama_token_cnt token_count; + + for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { + token_count[ctx->prev.rat(i)]++; + } + + // Apply frequency and presence penalties to the cur_p + for (size_t i = 0; i < cur_p->size; ++i) { + const auto token_iter = token_count.find(cur_p->data[i].id); if (token_iter == token_count.end()) { continue; } @@ -465,171 +1429,173 @@ void llama_sample_repetition_penalties_impl( // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. // This is common fix for this problem, which is to multiply by the penalty instead of dividing. 
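To make the asymmetry described in the comment above concrete, here is a tiny standalone C++ check with made-up numbers (penalty_repeat = 1.5 is an assumption, not a value taken from this patch): dividing a negative logit would raise it, so the two signs have to be handled differently for a repeated token to always become less likely.

// Illustrative sketch of the sign-dependent repeat penalty; not part of the patch.
#include <cstdio>

static float penalize(float logit, float penalty_repeat) {
    return logit <= 0.0f ? logit * penalty_repeat   // -2.0 -> -3.0 : less likely
                         : logit / penalty_repeat;  //  2.0 ->  1.33 : less likely
}

int main() {
    std::printf("%.2f %.2f\n", penalize(-2.0f, 1.5f), penalize(2.0f, 1.5f)); // -3.00 1.33
}
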
- if (candidates->data[i].logit <= 0) { - candidates->data[i].logit *= penalty_repeat; + if (cur_p->data[i].logit <= 0) { + cur_p->data[i].logit *= ctx->penalty_repeat; } else { - candidates->data[i].logit /= penalty_repeat; + cur_p->data[i].logit /= ctx->penalty_repeat; } - candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; + cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present; } - candidates->sorted = false; + cur_p->sorted = false; - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; + if (!ctx->penalize_nl && nl_found) { + // restore the logit of the newline token if it was penalized + cur_p->data[nl_idx].logit = nl_logit; } } -void llama_sample_apply_guidance_impl( - struct llama_sampling * smpl, - float * logits, - float * logits_guidance, - float scale) { - GGML_ASSERT(smpl); - - const auto t_start_sample_us = ggml_time_us(); - const auto n_vocab = smpl->n_vocab; - - llama_log_softmax(logits, n_vocab); - llama_log_softmax(logits_guidance, n_vocab); - - for (int i = 0; i < n_vocab; ++i) { - auto & l = logits[i]; - const auto & g = logits_guidance[i]; - - l = scale * (l - g) + g; - } - - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; +static void llama_sampler_penalties_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + ctx->prev.clear(); } -llama_token llama_sample_token_mirostat_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { - GGML_ASSERT(smpl); +static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_penalties *) smpl->ctx; + auto * result = llama_sampler_init_penalties( + ctx->n_vocab, + ctx->special_eos_id, + ctx->linefeed_id, + ctx->penalty_last_n, + ctx->penalty_repeat, + ctx->penalty_freq, + ctx->penalty_present, + ctx->penalize_nl, + ctx->ignore_eos); - const int32_t n_vocab = float(smpl->n_vocab); + // copy the state + { + auto * result_ctx = (llama_sampler_penalties *) result->ctx; - int64_t t_start_sample_us = ggml_time_us(); - - llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates); - - // Estimate s_hat using the most probable m tokens - float s_hat = 0.0; - float sum_ti_bi = 0.0; - float sum_ti_sq = 0.0; - for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) { - float t_i = logf(float(i + 2) / float(i + 1)); - float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p); - sum_ti_bi += t_i * b_i; - sum_ti_sq += t_i * t_i; + result_ctx->prev = ctx->prev; } - s_hat = sum_ti_bi / sum_ti_sq; - - // Compute k from the estimated s_hat and target surprise value - float epsilon_hat = s_hat - 1; - float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(n_vocab, -epsilon_hat)), 1 / s_hat); - - // Sample the next word X using top-k sampling - llama_sample_top_k_impl((struct llama_sampling *) nullptr, candidates, int(k), 1); - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - llama_token X = llama_sample_token_impl(smpl, candidates); - t_start_sample_us = ggml_time_us(); - - // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { - return candidate.id == X; - })); - float observed_surprise = -log2f(candidates->data[X_idx].p); - 
float e = observed_surprise - tau; - - // Update mu using the learning rate and error - *mu = *mu - eta * e; - - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - return X; -} - -llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu) { - int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); - - llama_sample_softmax_impl(smpl, candidates); - - // Truncate the words with surprise values greater than mu - candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { - return -log2f(candidate.p) > *mu; - })); - - if (candidates->size == 0) { - candidates->size = 1; - } - - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // Normalize the probabilities of the remaining words - llama_sample_softmax_impl(smpl, candidates); - - // Sample the next word X from the remaining words - llama_token X = llama_sample_token_impl(smpl, candidates); - t_start_sample_us = ggml_time_us(); - - // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { - return candidate.id == X; - })); - float observed_surprise = -log2f(candidates->data[X_idx].p); - float e = observed_surprise - tau; - - // Update mu using the learning rate and error - *mu = *mu - eta * e; - - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - } - return X; -} - -llama_token llama_sample_token_greedy_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) { - const int64_t t_start_sample_us = ggml_time_us(); - - // Find max element - auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { - return a.logit < b.logit; - }); - - llama_token result = max_iter->id; - if (smpl) { - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - smpl->n_sample++; - } - return result; -} - -llama_token llama_sample_token_with_rng_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng) { - GGML_ASSERT(smpl); - - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax_impl((struct llama_sampling *) nullptr, candidates); - - std::vector probs; - probs.reserve(candidates->size); - for (size_t i = 0; i < candidates->size; ++i) { - probs.push_back(candidates->data[i].p); - } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - int idx = dist(rng); - - llama_token result = candidates->data[idx].id; - - smpl->t_sample_us += ggml_time_us() - t_start_sample_us; - smpl->n_sample++; return result; } -llama_token llama_sample_token_impl(struct llama_sampling * smpl, llama_token_data_array * candidates) { - return llama_sample_token_with_rng_impl(smpl, candidates, smpl->rng); +static void llama_sampler_penalties_free(struct llama_sampler * smpl) { + delete (llama_sampler_penalties *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_penalties_i = { + /* .name = */ llama_sampler_penalties_name, + /* .accept = */ llama_sampler_penalties_accept, + /* .apply = */ llama_sampler_penalties_apply, + /* .reset = */ llama_sampler_penalties_reset, + /* .clone = */ llama_sampler_penalties_clone, + /* .free = */ llama_sampler_penalties_free, +}; + 
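Every built-in sampler in this file uses the same vtable layout, so user code can register its own samplers with a chain. Below is a minimal, hedged sketch of such a sampler (a stateless logit clamp), assuming only the llama_sampler / llama_sampler_i declarations this patch exposes in llama.h; the clamp value and the my_* names are placeholders.

// Hypothetical user-defined sampler following the llama_sampler_i pattern; not part of the patch.
#include "llama.h"

#include <algorithm>

static const char * my_clamp_name(const struct llama_sampler * /*smpl*/) {
    return "my-clamp";
}

static void my_clamp_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
    // cap every logit at 10.0f; the sampler is stateless, so accept/reset/clone/free stay null
    for (size_t i = 0; i < cur_p->size; ++i) {
        cur_p->data[i].logit = std::min(cur_p->data[i].logit, 10.0f);
    }
}

static struct llama_sampler_i my_clamp_i = {
    /* .name   = */ my_clamp_name,
    /* .accept = */ nullptr,
    /* .apply  = */ my_clamp_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ nullptr,
    /* .free   = */ nullptr,
};

struct llama_sampler * my_sampler_init_clamp() {
    // ctx == nullptr also lets the generic llama_sampler_clone() fallback handle cloning
    return new llama_sampler {
        /* .iface = */ &my_clamp_i,
        /* .ctx   = */ nullptr,
    };
}
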
+struct llama_sampler * llama_sampler_init_penalties( + int32_t n_vocab, + llama_token special_eos_id, + llama_token linefeed_id, + int32_t penalty_last_n, + float penalty_repeat, + float penalty_freq, + float penalty_present, + bool penalize_nl, + bool ignore_eos) { + if (linefeed_id == LLAMA_TOKEN_NULL) { + penalize_nl = true; + } + + if (special_eos_id == LLAMA_TOKEN_NULL) { + ignore_eos = false; + } + + return new llama_sampler { + /* .iface = */ &llama_sampler_penalties_i, + /* .ctx = */ new llama_sampler_penalties { + /* .n_vocab = */ n_vocab, + /* .special_eos_id = */ special_eos_id, + /* .linefeed_id = */ linefeed_id, + /* .penalty_last_n = */ penalty_last_n, + /* .penalty_repeat = */ penalty_repeat, + /* .penalty_freq = */ penalty_freq, + /* .penalty_present = */ penalty_present, + /* .penalize_nl = */ penalize_nl, + /* .ignore_eos = */ ignore_eos, + /* .prev = */ ring_buffer(penalty_last_n), + }, + }; +} + +// logit-bias + +struct llama_sampler_logit_bias { + const int32_t n_vocab; + + const std::vector logit_bias; + + std::vector to_search; +}; + +static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /*smpl*/) { + return "logit-bias"; +} + +static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + + if (ctx->logit_bias.empty()) { + return; + } + + ctx->to_search.clear(); + + // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id) + for (const auto & lb : ctx->logit_bias) { + if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) { + cur_p->data[lb.token].logit += lb.bias; + } else { + ctx->to_search.push_back(lb); + } + } + + if (ctx->to_search.empty()) { + return; + } + + // search for the remaining candidates that were not found in the previous step + for (size_t i = 0; i < cur_p->size; ++i) { + for (const auto & lb : ctx->to_search) { + if (cur_p->data[i].id == lb.token) { + cur_p->data[i].logit += lb.bias; + break; + } + } + } +} +static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; + return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); +} + +static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) { + delete (llama_sampler_logit_bias *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_logit_bias_i = { + /* .name = */ llama_sampler_logit_bias_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_logit_bias_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_logit_bias_clone, + /* .free = */ llama_sampler_logit_bias_free, +}; + +struct llama_sampler * llama_sampler_init_logit_bias( + int32_t n_vocab, + int32_t n_logit_bias, + const llama_logit_bias * logit_bias) { + return new llama_sampler { + /* .iface = */ &llama_sampler_logit_bias_i, + /* .ctx = */ new llama_sampler_logit_bias { + /* .n_vocab = */ n_vocab, + /* .logit_bias = */ std::vector(logit_bias, logit_bias + n_logit_bias), + /* .to_search = */ {}, + }, + }; } diff --git a/src/llama-sampling.h b/src/llama-sampling.h index f7f8e3ef7..d90b14713 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -1,56 +1,29 @@ #pragma once -#include "llama-impl.h" +// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ? 
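With the individual samplers in place, a short usage sketch of how they are meant to be composed through the chain API: it assumes the llama.h declarations added by this patch plus pre-existing helpers such as llama_get_model(), llama_n_vocab(), llama_token_eos() and llama_token_nl(); every numeric parameter is illustrative only.

// Hypothetical composition of the new sampler chain; not part of the patch.
#include "llama.h"

static llama_token sample_next(struct llama_context * ctx) {
    const struct llama_model * model = llama_get_model(ctx);

    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // order matters: each apply() only sees the candidates left over by the previous sampler
    llama_sampler_chain_add(chain, llama_sampler_init_penalties(
        llama_n_vocab(model),
        llama_token_eos(model),
        llama_token_nl (model),
        /* penalty_last_n  = */ 64,
        /* penalty_repeat  = */ 1.1f,
        /* penalty_freq    = */ 0.0f,
        /* penalty_present = */ 0.0f,
        /* penalize_nl     = */ false,
        /* ignore_eos      = */ false));
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234)); // terminal sampler: picks the token

    // builds the candidate array from the last token's logits, applies the chain,
    // then accepts the chosen token into every sampler (e.g. the penalty ring buffer)
    const llama_token id = llama_sampler_sample(chain, ctx, -1);

    llama_sampler_free(chain); // also frees the samplers that were added to the chain

    return id;
}
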
-struct llama_sampling { - llama_sampling(int32_t n_vocab) : n_vocab(n_vocab) {} +#include "llama-grammar.h" - std::mt19937 rng; +#include - int32_t n_vocab = 0; +struct llama_vocab; +struct llama_grammar; - mutable int64_t t_sample_us = 0; - mutable int32_t n_sample = 0; +// sampler chain - void reset_timings() const { - t_sample_us = 0; - n_sample = 0; - } +struct llama_sampler_chain { + llama_sampler_chain_params params; + + std::vector samplers; + + // timing + + mutable int64_t t_sample_us; + + mutable int32_t n_sample; }; -// -// internal API -// - -void llama_set_rng_seed_impl(struct llama_sampling * smpl, uint32_t seed); - -void llama_sample_softmax_impl (struct llama_sampling * smpl, llama_token_data_array * candidates); -void llama_sample_top_k_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, int32_t k, size_t min_keep); -void llama_sample_top_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep); -void llama_sample_min_p_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep); -void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float z, size_t min_keep); -void llama_sample_typical_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float p, size_t min_keep); -void llama_sample_entropy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float min_temp, float max_temp, float exponent_val); -void llama_sample_temp_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float temp); - -void llama_sample_repetition_penalties_impl( - struct llama_sampling * smpl, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t penalty_last_n, - float penalty_repeat, - float penalty_freq, - float penalty_present); - -void llama_sample_apply_guidance_impl( - struct llama_sampling * smpl, - float * logits, - float * logits_guidance, - float scale); - -llama_token llama_sample_token_mirostat_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu); -llama_token llama_sample_token_mirostat_v2_impl(struct llama_sampling * smpl, llama_token_data_array * candidates, float tau, float eta, float * mu); -llama_token llama_sample_token_greedy_impl (struct llama_sampling * smpl, llama_token_data_array * candidates); -llama_token llama_sample_token_with_rng_impl (struct llama_sampling * smpl, llama_token_data_array * candidates, std::mt19937 & rng); -llama_token llama_sample_token_impl (struct llama_sampling * smpl, llama_token_data_array * candidates); - +struct llama_sampler * llama_sampler_init_grammar_impl( + const struct llama_vocab & vocab, + const char * grammar_str, + const char * grammar_root); diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 6e8f30be4..dc4b5f12f 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -18,6 +18,8 @@ struct llama_vocab { tattr attr; }; + uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; @@ -62,8 +64,6 @@ struct llama_vocab { int find_bpe_rank(const std::string & token_left, const std::string & token_right) const; }; -const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx); - // // internal API // @@ -76,6 +76,7 @@ std::vector llama_tokenize_internal( bool add_special, bool 
parse_special = false); +// TODO: move the API below as member functions of llama_vocab llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch); const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token); diff --git a/src/llama.cpp b/src/llama.cpp index 29301eb50..43fd2f759 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,6 +1,5 @@ #include "llama-impl.h" #include "llama-vocab.h" -#include "llama-grammar.h" #include "llama-sampling.h" #include "unicode.h" @@ -3181,7 +3180,6 @@ struct llama_sbatch { struct llama_context { llama_context(const llama_model & model) : model(model) - , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} @@ -3198,7 +3196,6 @@ struct llama_context { const struct llama_model & model; struct llama_cparams cparams; - struct llama_sampling sampling; struct llama_sbatch sbatch; struct llama_kv_cache kv_self; struct llama_control_vector cvec; @@ -3219,16 +3216,16 @@ struct llama_context { bool has_evaluated_once = false; - int64_t t_start_us; - int64_t t_load_us; - int64_t t_p_eval_us = 0; - int64_t t_eval_us = 0; + mutable int64_t t_start_us; + mutable int64_t t_load_us; + mutable int64_t t_p_eval_us = 0; + mutable int64_t t_eval_us = 0; - int64_t t_compute_start_us = 0; - int64_t n_queued_tokens = 0; + mutable int64_t t_compute_start_us = 0; + mutable int64_t n_queued_tokens = 0; - int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - int32_t n_eval = 0; // number of eval calls + mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) + mutable int32_t n_eval = 0; // number of eval calls // host buffer for the model output (logits and embeddings) ggml_backend_buffer_t buf_output = nullptr; @@ -6257,6 +6254,7 @@ static void llm_load_vocab( const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + vocab.n_vocab = n_vocab; vocab.id_to_token.resize(n_vocab); for (uint32_t i = 0; i < n_vocab; i++) { @@ -6407,6 +6405,11 @@ static void llm_load_vocab( ) ) { vocab.special_eot_id = t.second; + if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { + LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.first.c_str()); + vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; + } break; } } @@ -6420,6 +6423,11 @@ static void llm_load_vocab( const auto & t = vocab.token_to_id.find("<|eom_id|>"); if (t != vocab.token_to_id.end()) { vocab.special_eom_id = t->second; + if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { + LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t->first.c_str()); + vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL; + } } } } @@ -9256,7 +9264,7 @@ static struct ggml_tensor * llm_build_copy_mask_state( // FIXME: zero-out NANs? 
states = ggml_mul(ctx, states, state_mask); - // copy states which won't be changed further (between n_seqs and n_rs) + // copy states which won't be changed further (between n_seqs and n_kv) ggml_build_forward_expand(graph, ggml_cpy(ctx, ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), @@ -16074,6 +16082,13 @@ static int llama_decode_internal( return -1; } + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + return -1; + } + } + const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -16366,6 +16381,13 @@ static int llama_encode_internal( return -1; } + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + return -1; + } + } + const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -17514,6 +17536,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; @@ -17898,7 +17922,6 @@ struct llama_model_params llama_model_default_params() { struct llama_context_params llama_context_default_params() { struct llama_context_params result = { - /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 2048, /*.n_ubatch =*/ 512, @@ -17931,6 +17954,14 @@ struct llama_context_params llama_context_default_params() { return result; } +struct llama_sampler_chain_params llama_sampler_chain_default_params() { + struct llama_sampler_chain_params result = { + /*.no_perf =*/ true, + }; + + return result; +} + struct llama_model_quantize_params llama_model_quantize_default_params() { struct llama_model_quantize_params result = { /*.nthread =*/ 0, @@ -18186,10 +18217,6 @@ struct llama_context * llama_new_context_with_model( cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; } - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); @@ -18200,10 +18227,10 @@ struct llama_context * llama_new_context_with_model( ctx->abort_callback = params.abort_callback; ctx->abort_callback_data = params.abort_callback_data; - ctx->sampling.rng = std::mt19937(params.seed); - ctx->logits_all = params.logits_all; + ctx->logits_all = params.logits_all; + // build worst-case graph for encoder if a model contains encoder - ctx->is_encoding = llama_model_has_encoder(model); + ctx->is_encoding = llama_model_has_encoder(model); uint32_t kv_size = cparams.n_ctx; ggml_type type_k = params.type_k; @@ -18491,14 +18518,6 @@ void llama_free(struct llama_context * ctx) { delete ctx; } -const struct llama_model * llama_get_model(const struct llama_context * 
ctx) { - return &ctx->model; -} - -const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) { - return &ctx->model.vocab; -} - uint32_t llama_n_ctx(const struct llama_context * ctx) { return ctx->cparams.n_ctx; } @@ -18519,6 +18538,30 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } +int32_t llama_n_vocab(const struct llama_model * model) { + return model->hparams.n_vocab; +} + +int32_t llama_n_ctx_train(const struct llama_model * model) { + return model->hparams.n_ctx_train; +} + +int32_t llama_n_embd(const struct llama_model * model) { + return model->hparams.n_embd; +} + +int32_t llama_n_layer(const struct llama_model * model) { + return model->hparams.n_layer; +} + +const struct llama_model * llama_get_model(const struct llama_context * ctx) { + return &ctx->model; +} + +enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { + return ctx->cparams.pooling_type; +} + enum llama_rope_type llama_rope_type(const struct llama_model * model) { switch (model->arch) { // these models do not use RoPE @@ -18582,26 +18625,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { return LLAMA_ROPE_TYPE_NONE; } -enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { - return ctx->cparams.pooling_type; -} - -int32_t llama_n_vocab(const struct llama_model * model) { - return model->hparams.n_vocab; -} - -int32_t llama_n_ctx_train(const struct llama_model * model) { - return model->hparams.n_ctx_train; -} - -int32_t llama_n_embd(const struct llama_model * model) { - return model->hparams.n_embd; -} - -int32_t llama_n_layer(const struct llama_model * model) { - return model->hparams.n_layer; -} - float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } @@ -19018,14 +19041,14 @@ struct llama_data_write { // TODO: add more model-specific info which should prevent loading the session file if not identical } - void write_rng(const std::mt19937 & rng) { - std::ostringstream rng_ss; - rng_ss << rng; + //void write_rng(const std::mt19937 & rng) { + // std::ostringstream rng_ss; + // rng_ss << rng; - const std::string & rng_str = rng_ss.str(); + // const std::string & rng_str = rng_ss.str(); - write_string(rng_str); - } + // write_string(rng_str); + //} void write_output_ids(struct llama_context * ctx) { llama_output_reorder(ctx); @@ -19245,17 +19268,17 @@ struct llama_data_read { // TODO: add more info which needs to be identical but which is not verified otherwise } - void read_rng(std::mt19937 & rng) { - std::string rng_str; - read_string(rng_str); + //void read_rng(std::mt19937 & rng) { + // std::string rng_str; + // read_string(rng_str); - std::istringstream rng_ss(rng_str); - rng_ss >> rng; + // std::istringstream rng_ss(rng_str); + // rng_ss >> rng; - if (rng_ss.fail()) { - throw std::runtime_error("failed to load RNG state"); - } - } + // if (rng_ss.fail()) { + // throw std::runtime_error("failed to load RNG state"); + // } + //} void read_output_ids(struct llama_context * ctx) { std::vector output_pos; @@ -19685,8 +19708,6 @@ static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_da data_ctx.write_model_info(ctx); - data_ctx.write_rng(ctx->sampling.rng); - // copy outputs data_ctx.write_output_ids(ctx); data_ctx.write_logits(ctx); @@ -19724,9 +19745,6 @@ static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_da data_ctx.read_model_info(ctx); - // set rng - 
data_ctx.read_rng(ctx->sampling.rng); - // set outputs data_ctx.read_output_ids(ctx); data_ctx.read_logits(ctx); @@ -20129,8 +20147,9 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG GGML_ABORT("fatal error"); -#endif +#else return nullptr; +#endif } } @@ -20178,8 +20197,9 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG GGML_ABORT("fatal error"); -#endif +#else return nullptr; +#endif } } @@ -20612,124 +20632,18 @@ int32_t llama_chat_apply_template( return res; } -// -// grammar -// - -struct llama_grammar * llama_grammar_init( - const llama_grammar_element ** rules, - size_t n_rules, - size_t start_rule_index) { - return llama_grammar_init_impl(rules, n_rules, start_rule_index); -} - -void llama_grammar_free(struct llama_grammar * grammar) { - llama_grammar_free_impl(grammar); -} - -struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) { - return llama_grammar_copy_impl(grammar); -} - -void llama_grammar_sample( - const struct llama_grammar * grammar, - const struct llama_context * ctx, - llama_token_data_array * candidates) { - llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates); -} - -void llama_sample_grammar( - struct llama_context * ctx, - llama_token_data_array * candidates, - const struct llama_grammar * grammar) { - llama_grammar_sample(grammar, ctx, candidates); -} - -void llama_grammar_accept_token( - struct llama_grammar * grammar, - struct llama_context * ctx, - llama_token token) { - llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token); -} - // // sampling // -void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { - llama_set_rng_seed_impl(&ctx->sampling, seed); +// TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp +struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) { + return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root); } -void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { - llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates); -} - -void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) { - llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep); -} - -void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { - llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep); -} - -void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { - llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep); -} - -void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { - llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep); -} - -void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { - llama_sample_typical_impl(ctx ? 
&ctx->sampling : nullptr, candidates, p, min_keep); -} - -void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { - llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val); -} - -void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { - llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp); -} - -void llama_sample_repetition_penalties( - struct llama_context * ctx, - llama_token_data_array * candidates, - const llama_token * last_tokens, - size_t penalty_last_n, - float penalty_repeat, - float penalty_freq, - float penalty_present) { - llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present); -} - -void llama_sample_apply_guidance( - struct llama_context * ctx, - float * logits, - float * logits_guidance, - float scale) { - llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale); -} - -llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { - return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu); -} - -llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { - return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu); -} - -llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) { - return llama_sample_token_greedy_impl(ctx ? 
&ctx->sampling : nullptr, candidates); -} - -llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) { - return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng); -} - -llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { - return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng); -} +// +// model split +// int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; @@ -20755,45 +20669,6 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int return 0; } -struct llama_timings llama_get_timings(struct llama_context * ctx) { - struct llama_timings result = { - /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, - /*.t_end_ms =*/ 1.00 * ggml_time_ms(), - /*.t_load_ms =*/ 1e-3 * ctx->t_load_us, - /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us, - /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us, - /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, - - /*.n_sample =*/ std::max(1, ctx->sampling.n_sample), - /*.n_p_eval =*/ std::max(0, ctx->n_p_eval), - /*.n_eval =*/ std::max(1, ctx->n_eval), - }; - - return result; -} - -void llama_print_timings(struct llama_context * ctx) { - const llama_timings timings = llama_get_timings(ctx); - - LLAMA_LOG_INFO("\n"); - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms); - LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample); - LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval)); -} - -void llama_reset_timings(struct llama_context * ctx) { - ctx->t_start_us = ggml_time_us(); - ctx->t_eval_us = ctx->n_eval = 0; - ctx->t_p_eval_us = ctx->n_p_eval = 0; - - ctx->sampling.reset_timings(); -} - const char * llama_print_system_info(void) { static std::string s; @@ -20822,7 +20697,68 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { +void llama_perf_print(const void * ctx, enum llama_perf_type type) { + switch (type) { + case LLAMA_PERF_TYPE_CONTEXT: + { + const auto * p = (const struct llama_context *) ctx; + + const double t_start_ms = 1e-3 * p->t_start_us; + const double t_end_ms = 1.00 * ggml_time_ms(); + const double t_load_ms = 1e-3 * p->t_load_us; + const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; + const double t_eval_ms = 1e-3 * p->t_eval_us; + + const int32_t n_p_eval = std::max(0, p->n_p_eval); + const int32_t n_eval = std::max(1, p->n_eval); + + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, 
%8.2f tokens per second)\n", + __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); + } break; + case LLAMA_PERF_TYPE_SAMPLER_CHAIN: + { + const auto * smpl = (const struct llama_sampler *) ctx; + const auto * p = (const struct llama_sampler_chain *) smpl->ctx; + + const double t_sampler_ms = 1e-3 * p->t_sample_us; + + const int32_t n_sampler = std::max(0, p->n_sample); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); + } break; + default: + GGML_ABORT("invalid perf type"); + } +} + +void llama_perf_reset(void * ctx, enum llama_perf_type type) { + switch (type) { + case LLAMA_PERF_TYPE_CONTEXT: + { + auto * p = (struct llama_context *) ctx; + + p->t_start_us = ggml_time_us(); + p->t_eval_us = p->n_eval = 0; + p->t_p_eval_us = p->n_p_eval = 0; + } break; + case LLAMA_PERF_TYPE_SAMPLER_CHAIN: + { + auto * smpl = (struct llama_sampler *) ctx; + auto * p = (struct llama_sampler_chain *) smpl->ctx; + + p->t_sample_us = p->n_sample = 0; + } break; + default: + GGML_ABORT("invalid perf type"); + } +} + +void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { fprintf(stream, "\n"); fprintf(stream, "###########\n"); fprintf(stream, "# Timings #\n"); @@ -20833,21 +20769,15 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { 1.0e-3 * ctx->t_eval_us / ctx->n_eval); fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n", 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval); - fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n", - 1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample); fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval); fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval); - fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->sampling.n_sample); fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us); fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us); fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us); - fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->sampling.t_sample_us); fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n", 1.0e6 * ctx->n_eval / ctx->t_eval_us); fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n", 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us); - fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n", - 1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us); } // For internal test use diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0207e3a59..30e71cfd4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS 
${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) # llama_target_and_test(test-double-float.cpp) # SLOW +llama_target_and_test(test-arg-parser.cpp) llama_target_and_test(test-quantize-fns.cpp) llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp new file mode 100644 index 000000000..f26707910 --- /dev/null +++ b/tests/test-arg-parser.cpp @@ -0,0 +1,131 @@ +#include "arg.h" +#include "common.h" + +#include +#include +#include +#include + +#undef NDEBUG +#include + +int main(void) { + gpt_params params; + + printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); + for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { + try { + auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex); + std::unordered_set seen_args; + std::unordered_set seen_env_vars; + for (const auto & opt : ctx_arg.options) { + // check for args duplications + for (const auto & arg : opt.args) { + if (seen_args.find(arg) == seen_args.end()) { + seen_args.insert(arg); + } else { + fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg); + exit(1); + } + } + // check for env var duplications + if (opt.env) { + if (seen_env_vars.find(opt.env) == seen_env_vars.end()) { + seen_env_vars.insert(opt.env); + } else { + fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env); + exit(1); + } + } + } + } catch (std::exception & e) { + printf("%s\n", e.what()); + assert(false); + } + } + + auto list_str_to_char = [](std::vector & argv) -> std::vector { + std::vector res; + for (auto & arg : argv) { + res.push_back(const_cast(arg.data())); + } + return res; + }; + + std::vector argv; + + printf("test-arg-parser: test invalid usage\n\n"); + + // missing value + argv = {"binary_name", "-m"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // wrong value (int) + argv = {"binary_name", "-ngl", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // wrong value (enum) + argv = {"binary_name", "-sm", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // non-existence arg in specific example (--draft cannot be used outside llama-speculative) + argv = {"binary_name", "--draft", "123"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); + + + printf("test-arg-parser: test valid usage\n\n"); + + argv = {"binary_name", "-m", "model_file.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model == "model_file.gguf"); + + argv = {"binary_name", "-t", "1234"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.cpuparams.n_threads == 1234); + + argv = {"binary_name", "--verbose"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.verbosity == 1); + + argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model == "abc.gguf"); + assert(params.n_predict == 6789); + 
assert(params.n_batch == 9090); + + // --draft cannot be used outside llama-speculative + argv = {"binary_name", "--draft", "123"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); + assert(params.n_draft == 123); + +// skip this part on windows, because setenv is not supported +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else + printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); + + setenv("LLAMA_ARG_THREADS", "blah", true); + argv = {"binary_name"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model == "blah.gguf"); + assert(params.cpuparams.n_threads == 1010); + + + printf("test-arg-parser: test environment variables being overwritten\n\n"); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name", "-m", "overwritten.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.model == "overwritten.gguf"); + assert(params.cpuparams.n_threads == 1010); +#endif // _WIN32 + + + printf("test-arg-parser: all tests OK\n\n"); +} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index bd65e8cb3..635de01d7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1,3 +1,20 @@ +// This file defines tests for various GGML ops and backends. +// For the forward pass it asserts that the results of multiple backends computing the same GGML ops are consistent. +// For the backwards pass it asserts that the gradients from backpropagation are consistent +// with the gradients obtained via the method of finite differences ("grad" mode, this is optional). +// It is also possible to check the performance ("perf" mode). +// +// this file has three sections: Section 1 does general setup, section 2 defines the GGML ops to be tested, +// and section 3 defines which tests to run. +// Quick start for adding a new GGML op: Go to section 2 and create a struct that inherits from test_case, +// then go to section 3 and add an instantiation of your struct. + + +// ############################## +// ## Section 1: General Setup ## +// ############################## + + #include #include #include @@ -5,7 +22,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -15,7 +34,6 @@ #include #include - static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { // static RNG initialization (revisit if n_threads stops being constant) static const size_t n_threads = std::thread::hardware_concurrency(); @@ -212,6 +230,39 @@ static double nmse(const float * a, const float * b, size_t n) { return mse_a_b / mse_a_0; } +// maximum absolute asymmetry between a and b +// asymmetry: (a - b) / (a + b) +// This is more stable than relative error if one of the values fluctuates towards zero. +// n: number of values to compare. +// expected_vals: optional vector of expected values for a. If expected_vals is not empty, filter out all comparisons where +// a does not match any of the expected values. Needed for noncontinuous gradients where the numerical calculation can fail. 
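In formulas, the comparison metric defined next and the two numeric-gradient estimators used later in eval_grad() are:

\[
\mathrm{asymm}_i = \frac{a_i - b_i}{a_i + b_i},
\qquad
\mathrm{MAA} = \frac{1}{|V|} \sum_{i \in V} \lvert \mathrm{asymm}_i \rvert,
\]

where $V$ is the set of indices that pass the optional expected_vals filter, and

\[
g_i \approx \frac{f(x_i + \varepsilon) - f(x_i - \varepsilon)}{2\varepsilon}
\quad \text{(2 points, neglects 3rd order and higher)},
\qquad
g_i \approx \frac{8 f(x_i + \tfrac{\varepsilon}{2}) - 8 f(x_i - \tfrac{\varepsilon}{2}) - f(x_i + \varepsilon) + f(x_i - \varepsilon)}{6\varepsilon}
\quad \text{(4 points, neglects 5th order and higher)},
\]

the 4-point variant being the standard five-point stencil evaluated with step $\varepsilon/2$.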
+static double mean_abs_asymm(const float * a, const float * b, const size_t n, const std::vector & expected_vals) { + double sum = 0.0f; + + size_t nvalid = 0; + for (size_t i = 0; i < n; i++) { + if (!expected_vals.empty()) { + bool matches_any = false; + for (const float & ev : expected_vals) { + if (fabsf(a[i] - ev) < 1e-3f) { + matches_any = true; + break; + } + } + if (!matches_any) { + continue; + } + } + + const float asymm = (a[i] - b[i]) / (a[i] + b[i]); + + sum += fabsf(asymm); + nvalid++; + } + + return sum/nvalid; +} + // utils for printing the variables of the test cases #define VAR_TO_STR(x) (#x "=" + var_to_str(x)) @@ -295,6 +346,7 @@ static bool ggml_is_view_op(enum ggml_op op) { enum test_mode { MODE_TEST, MODE_PERF, + MODE_GRAD, }; struct test_case { @@ -314,6 +366,32 @@ struct test_case { return 1e-7; } + virtual double max_maa_err() { + return 1e-4; + } + + virtual float grad_eps(){ + return 1e-1f; + } + + // If false, estimate gradient with 2 points, neglects 3rd order derivative and higher. + // If true, estimate gradient with 4 points, neglects 5th order derivative and higher. + virtual bool grad_precise(){ + return false; + } + + // Skip gradient checks if total number of gradients to be checked is larger than this (to speed up the tests). + virtual int64_t grad_nmax() { + return 10000; + } + + // No effect if empty. + // If not empty, skip all gradient checks where the numerical result does not match any of the values. + // Needed for dealing with noncontinuous gradients (e.g. ReLU) where estimation using finite differences is unreliable. + virtual std::vector grad_expect() { + return {}; + } + virtual void initialize_tensors(ggml_context * ctx) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { init_tensor_uniform(t); @@ -332,6 +410,7 @@ struct test_case { } ggml_cgraph * gf = nullptr; + ggml_cgraph * gb = nullptr; static const int sentinel_size = 1024; @@ -340,7 +419,7 @@ struct test_case { std::vector sentinels; void add_sentinel(ggml_context * ctx) { - if (mode == MODE_PERF) { + if (mode == MODE_PERF || mode == MODE_GRAD) { return; } ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size); @@ -389,6 +468,7 @@ struct test_case { /* .no_alloc = */ true, }; ggml_context * ctx = ggml_init(params); + GGML_ASSERT(ctx); gf = ggml_new_graph(ctx); @@ -550,6 +630,7 @@ struct test_case { /* .no_alloc = */ true, }; ggml_context * ctx = ggml_init(params); + GGML_ASSERT(ctx); ggml_tensor * out = build_graph(ctx); @@ -643,8 +724,282 @@ struct test_case { return true; } + + bool eval_grad(ggml_backend_t backend, const char * op_name) { + mode = MODE_GRAD; + const std::vector expect = grad_expect(); + + ggml_init_params params = { + /* .mem_size = */ ggml_tensor_overhead()*128 + 2*ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, true), + /* .mem_base = */ NULL, + /* .no_alloc = */ true, + }; + ggml_context * ctx = ggml_init(params); + GGML_ASSERT(ctx); + + gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true); + gb = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true); + + ggml_tensor * out = build_graph(ctx); + + if (op_name != nullptr && op_desc(out) != op_name) { + //printf(" %s: skipping\n", op_desc(out).c_str()); + ggml_free(ctx); + return true; + } + + printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str()); + fflush(stdout); + + if (out->grad == nullptr) { + printf("backwards pass not supported \n"); + ggml_free(ctx); + return true; + } + if (out->type != GGML_TYPE_F32) { + 
ggml_free(ctx); + printf("not supported [%s->type != FP32]\n", out->name); + return true; + } + + // check if the backend supports the ops + bool supported = true; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (!ggml_backend_supports_op(backend, t)) { + printf("not supported [%s] ", ggml_backend_name(backend)); + supported = false; + break; + } + if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) { + printf("not supported [%s->type != FP32] ", t->name); + supported = false; + break; + } + } + if (!supported) { + printf("\n"); + ggml_free(ctx); + return true; + } + + int64_t ngrads = 0; + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->flags & GGML_TENSOR_FLAG_PARAM) { + ngrads += ggml_nelements(t); + } + } + if (ngrads > grad_nmax()) { + printf("skipping large tensors for speed \n"); + ggml_free(ctx); + return true; + } + + + if (!ggml_is_scalar(out)) { + out = ggml_sum(ctx, out); + ggml_set_name(out, "sum_of_out"); + } + + ggml_build_forward_expand(gf, out); + ggml_graph_cpy(gf, gb); + ggml_build_backward_expand(ctx, gf, gb, false); + if (expect.size() != 1 || expect[0] != 0.0f) { + GGML_ASSERT(gb->n_nodes > gf->n_nodes); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); + } + } + + // TODO: refactor so that this check is only needed once + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (!ggml_backend_supports_op(backend, t)) { + printf("not supported [%s] ", ggml_backend_name(backend)); + supported = false; + break; + } + if ((t->flags & GGML_TENSOR_FLAG_PARAM) && t->type != GGML_TYPE_F32) { + printf("not supported [%s->type != FP32] ", t->name); + supported = false; + break; + } + } + if (!supported) { + printf("\n"); + ggml_free(ctx); + return true; + } + + // allocate + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend)); + ggml_free(ctx); + return false; + } + + // randomize tensors + initialize_tensors(ctx); + + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (!t->grad) { + continue; + } + + std::vector tmp(ggml_nelements(t->grad)); + ggml_backend_tensor_set(t->grad, tmp.data(), 0, ggml_nbytes(t->grad)); + } + + // build graphs + const float onef = 1.0f; + ggml_backend_graph_compute(backend, gf); + ggml_backend_tensor_set(out->grad, &onef, 0, ggml_nbytes(out->grad)); + ggml_backend_graph_compute(backend, gb); + + bool ok = true; + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + if (!(t->flags & GGML_TENSOR_FLAG_PARAM)) { + continue; + } + + const char * bn = ggml_backend_name(backend); + const int64_t ne = ggml_nelements(t); + + std::vector ga = tensor_to_float(t->grad); + + for (int64_t i = 0; i < ne; ++i) { // gradient algebraic + // check for nans + if (!std::isfinite(ga[i])) { + printf("[%s] nonfinite gradient at index %" PRId64 " (%s=%f) ", ggml_op_desc(t), i, bn, ga[i]); + ok = false; + break; + } + } + if (!ok) { + break; + } + + std::vector gn(ne); // gradient numeric + GGML_ASSERT(ga.size() == gn.size()); + + std::vector x0 = tensor_to_float(t); // original t data + GGML_ASSERT(ggml_is_scalar(out)); + 
GGML_ASSERT(out->type == GGML_TYPE_F32); + + const float eps = grad_eps(); + for (int64_t i = 0; i < ne; ++i) { + const float xiu = x0[i] + 1.0f*eps; // x, index i, up + const float xiuh = x0[i] + 0.5f*eps; // x, index i, up half + const float xidh = x0[i] - 0.5f*eps; // x, index i, down half + const float xid = x0[i] - 1.0f*eps; // x, index i, down + + float fu, fuh, fdh, fd; // output values for xiu, xiuh, xid, xidh + + ggml_backend_tensor_set(t, &xiu, i*sizeof(float), sizeof(float)); + ggml_backend_graph_compute(backend, gf); + ggml_backend_tensor_get(out, &fu, 0, ggml_nbytes(out)); + + ggml_backend_tensor_set(t, &xid, i*sizeof(float), sizeof(float)); + ggml_backend_graph_compute(backend, gf); + ggml_backend_tensor_get(out, &fd, 0, ggml_nbytes(out)); + + if (grad_precise()) { + ggml_backend_tensor_set(t, &xiuh, i*sizeof(float), sizeof(float)); + ggml_backend_graph_compute(backend, gf); + ggml_backend_tensor_get(out, &fuh, 0, ggml_nbytes(out)); + + ggml_backend_tensor_set(t, &xidh, i*sizeof(float), sizeof(float)); + ggml_backend_graph_compute(backend, gf); + ggml_backend_tensor_get(out, &fdh, 0, ggml_nbytes(out)); + + gn[i] = (8.0*(double)fuh + (double)fd - (8.0*(double)fdh + (double)fu)) / (6.0*(double)eps); + } else { + gn[i] = (fu - fd) / (2.0f*eps); + } + + ggml_backend_tensor_set(t, x0.data(), 0, ggml_nbytes(t)); + } + + const double err = mean_abs_asymm(gn.data(), ga.data(), gn.size(), expect); + if (err > max_maa_err()) { + printf("[%s] MAA = %.9f > %.9f ", ggml_op_desc(t), err, max_maa_err()); + ok = false; + break; + } + if (!ok) { + break; + } + } + + if (!ok) { + printf("compare failed "); + } + + ggml_backend_buffer_free(buf); + + ggml_free(ctx); + + if (ok) { + printf("\033[1;32mOK\033[0m\n"); + return true; + } + + printf("\033[1;31mFAIL\033[0m\n"); + return false; + } }; + +// ################################### +// ## Section 2: GGML Op Defintions ## +// ################################### + + +// The following is an example showing the bare minimum for creating a test for a GGML op. + +// GGML_OP_EXAMPLE +struct test_example : public test_case { + // Always define these 2 or variants thereof: + const ggml_type type; // The type of the input tensors. + const std::array ne; // The shape of the input tensors. + // For some ops it's necessary to define multiple types or shapes for the inputs. + // Or they may need additional parameters. + + // Put all parameters needed to fully define the test into one of the VARS_TO_STR macros. + // In most cases these are just the properties of the struct that you defined above. + // This is needed for info prints. + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + // Define a constructor for the struct. + // In most cases it will be sufficient to have the same arguments as the struct has properties + // and just use initializer lists. + test_example(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + // Define how a simple GGML compute graph can be constructed for the new GGML op. + ggml_tensor * build_graph(ggml_context * ctx) override { + // Step 1: create input tensors that don't depend on any other tensors: + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); // Setting names is optional but it's useful for debugging. + + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + // Step 2: use the op that you want to test in the GGML compute graph. 
+ ggml_tensor * out = ggml_add(ctx, a, b); // For this example we're just doing a simple addition. + ggml_set_name(out, "out"); + + // Step 3: return the output tensor. + return out; + } + // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a) + // immediately after you create the tensors. + // This is optional and only makes sense if a backwards pass has actually been implemented for the new op. +}; + + // GGML_OP_UNARY struct test_unary : public test_case { const ggml_unary_op op; @@ -658,20 +1013,36 @@ struct test_unary : public test_case { test_unary(ggml_unary_op op, ggml_type type = GGML_TYPE_F32, - std::array ne_a = {128, 10, 10, 10}, + std::array ne_a = {128, 2, 2, 2}, int v = 0) : op(op), type(type), ne_a(ne_a), v(v) {} ggml_tensor * build_graph(ggml_context * ctx) override { + const bool grad_supported = op == GGML_UNARY_OP_ABS || op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_NEG || + op == GGML_UNARY_OP_STEP || op == GGML_UNARY_OP_RELU || op == GGML_UNARY_OP_SILU; + ggml_tensor * a; if (v & 1) { auto ne = ne_a; ne[0] *= 3; a = ggml_new_tensor(ctx, type, 4, ne.data()); + if (grad_supported) { + ggml_set_param(ctx, a); + } + ggml_set_name(a, "a"); + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); } else { a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + if (grad_supported) { + ggml_set_param(ctx, a); + } + ggml_set_name(a, "a"); } + ggml_tensor * out = ggml_unary(ctx, a, op); + ggml_set_name(out, "out"); + return out; } @@ -681,6 +1052,24 @@ struct test_unary : public test_case { init_tensor_uniform(t, -150.f, 150.f); } } + + float grad_eps() override { + return 15.0f; + } + + std::vector grad_expect() override { + if (op == GGML_UNARY_OP_ABS) { + return {-1.0f, 1.0f}; + } + if (op == GGML_UNARY_OP_SGN || op == GGML_UNARY_OP_STEP) { + return {0.0f}; + } + if (op == GGML_UNARY_OP_RELU) { + return {0.0f, 1.0f}; + } + return {}; + } + }; // GGML_OP_GET_ROWS @@ -701,11 +1090,24 @@ struct test_get_rows : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b); + ggml_set_name(in, "in"); + ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b); + ggml_set_name(rows, "rows"); if (v) { rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0); + ggml_set_name(rows, "view_of_rows"); } + + const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows); + if (grad_supported) { + ggml_set_param(ctx, in); + // rows is a constant input -> no gradients + } + ggml_tensor * out = ggml_get_rows(ctx, in, rows); + ggml_set_name(out, "out"); + return out; } @@ -741,14 +1143,21 @@ struct test_repeat : public test_case { } test_repeat(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}, + std::array ne = {10, 5, 4, 3}, std::array nr = {2, 2, 2, 2}) : type(type), ne(ne), nr(nr) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); + ggml_set_name(target, "target"); + ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, src); + ggml_set_name(src, "src"); + ggml_tensor * out = ggml_repeat(ctx, src, target); + ggml_set_name(out, "out"); + return out; } }; @@ -774,10 +1183,62 @@ struct test_dup : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); + 
ggml_set_param(ctx, src); + ggml_set_name(src, "src"); + if (_use_permute) { src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + ggml_set_name(src, "src_permuted"); } + ggml_tensor * out = ggml_dup(ctx, src); + ggml_set_name(out, "out"); + + return out; + } +}; + +// GGML_OP_SET +struct test_set : public test_case { + const ggml_type type_src; + const ggml_type type_dst; + const std::array ne; + const int dim; + + std::string vars() override { + return VARS_TO_STR4(type_src, type_dst, ne, dim); + } + + size_t op_size(ggml_tensor * t) override { + return ggml_nbytes(t) + ggml_nbytes(t->src[0]); + } + + test_set(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, + std::array ne = {6, 5, 4, 3}, int dim = 1) + : type_src(type_src), type_dst(type_dst), ne(ne), dim(dim) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_set_param(ctx, src); + ggml_set_name(src, "src"); + + auto ne_dst = ne; + for (int i = 0; i < dim; ++i) { + ne_dst[i] *= 2; + } + ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data()); + ggml_set_param(ctx, dst); + ggml_set_name(dst, "dst"); + + size_t offset = 0; + for (int i = 0; i < dim; ++i) { + offset += ((ne_dst[i] - ne[i])/2)*dst->nb[i]; + } + ggml_tensor * out = ggml_set(ctx, dst, src, + // The backwards pass requires setting a contiguous region: + src->nb[1], src->nb[2], src->nb[3], offset); + ggml_set_name(out, "out"); + return out; } }; @@ -810,11 +1271,20 @@ struct test_cpy : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_set_param(ctx, src); + ggml_set_name(src, "src"); + if (_src_use_permute) { src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]); + ggml_set_name(src, "src_permuted"); } + ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + ggml_set_name(dst, "dst"); + ggml_tensor * out = ggml_cpy(ctx, src, dst); + ggml_set_name(out, "out"); + return out; } }; @@ -834,8 +1304,14 @@ struct test_cont : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, src); + ggml_set_name(src, "src"); + src = ggml_transpose(ctx, src); + ggml_set_name(src, "src_transposed"); + ggml_tensor * out = ggml_cont(ctx, src); + ggml_set_name(out, "out"); return out; } @@ -866,21 +1342,79 @@ struct test_bin_bcast : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]); + ggml_set_name(a, "a"); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + + // The backwards pass supports broadcasting only for GGML_ADD: + const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b); + if (grad_supported) { + ggml_set_param(ctx, a); + ggml_set_param(ctx, b); + } + ggml_tensor * out = op(ctx, a, b); + ggml_set_name(out, "out"); + return out; } void initialize_tensors(ggml_context * ctx) override { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (op == ggml_div) { - // avoid division by zero - init_tensor_uniform(t, 1.0f, 2.0f); + if (op == ggml_mul || op == ggml_div) { + // MUL and DIV have numerical issues around zero: + init_tensor_uniform(t, 0.9f, 1.1f); } else { init_tensor_uniform(t); } } } 
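The grad_eps override that follows appears to use the same convention spelled out for test_sqr further down ("10% of expected value of sum"): with inputs drawn from [0.9, 1.1], each product is roughly 1, so the summed output of GGML_MUL grows with the element count and the finite-difference step is scaled to stay a fixed fraction of it (this reading of the constant is an interpretation, not stated in the patch). For the common 10×5×4×3 test shape, for example:

\[
\varepsilon = 0.1 \cdot 10 \cdot 5 \cdot 4 \cdot 3 = 60,
\]

while the remaining binary ops keep $\varepsilon = 0.1$.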
+ + float grad_eps() override { + return 0.1f * (op == ggml_mul ? ne[0]*ne[1]*ne[2]*ne[3] : 1); + } + + bool grad_precise() override { + return op == ggml_div; + } + + double max_maa_err() override { + return op == ggml_add ? 1e-4 : 1e-3; + } +}; + +// GGML_OP_ADD1 +struct test_add1 : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_add1(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + + ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1); + // ggml_set_param(ctx, b); // TODO: implement + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_add1(ctx, a, b); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 0.1f * ne[0]*ne[1]*ne[2]*ne[3]; + } }; // GGML_OP_SCALE @@ -900,7 +1434,12 @@ struct test_scale : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_scale(ctx, a, scale); + ggml_set_name(out, "out"); + return out; } }; @@ -916,13 +1455,17 @@ struct test_norm : public test_case { } test_norm(ggml_type type = GGML_TYPE_F32, - std::array ne = {64, 10, 10, 10}, + std::array ne = {64, 5, 4, 3}, float eps = 1e-6f) : type(type), ne(ne), eps(eps) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_norm(ctx, a, eps); + ggml_set_name(out, "out"); + return out; } }; @@ -938,15 +1481,24 @@ struct test_rms_norm : public test_case { } test_rms_norm(ggml_type type = GGML_TYPE_F32, - std::array ne = {64, 10, 10, 10}, + std::array ne = {64, 5, 4, 3}, float eps = 1e-6f) : type(type), ne(ne), eps(eps) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_rms_norm(ctx, a, eps); + ggml_set_name(out, "out"); + return out; } + + bool grad_precise() override { + return true; + } }; // GGML_OP_SSM_CONV @@ -1038,7 +1590,14 @@ struct test_mul_mat : public test_case { // C^T = A * B^T: (k, m) * (k, n) => (m, n) ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]); ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); + ggml_set_param(ctx, a); + ggml_set_param(ctx, b); + ggml_set_name(a, "a"); + ggml_set_name(b, "b"); + ggml_tensor * out = ggml_mul_mat(ctx, a, b); + ggml_set_name(out, "out"); + return out; } }; @@ -1082,12 +1641,21 @@ struct test_mul_mat_id : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); + ggml_set_name(as, "as"); + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); + ggml_set_name(ids, "ids"); if (n_used != n_mats) { ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0); + ggml_set_name(ids, "view_of_ids"); } + ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 
1 : n_used, n); + ggml_set_name(b, "b"); + ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids); + ggml_set_name(out, "out"); + return out; } @@ -1123,14 +1691,23 @@ struct test_sqr : public test_case { } test_sqr(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) + std::array ne = {10, 5, 4, 3}) : type(type), ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_sqr(ctx, a); + ggml_set_name(out, "out"); + return out; } + + float grad_eps() override { + return 0.1f * 0.25f*ne[0]*ne[1]*ne[2]*ne[3]; // 10% of expected value of sum. + } }; // GGML_OP_SQRT @@ -1143,21 +1720,70 @@ struct test_sqrt : public test_case { } test_sqrt(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) + std::array ne = {10, 3, 3, 2}) : type(type), ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_sqrt(ctx, a); + ggml_set_name(out, "out"); + return out; } void initialize_tensors(ggml_context * ctx) override { // fill with positive values for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t, 0.0f, 100.0f); + init_tensor_uniform(t, 50.0f, 100.0f); } } + + float grad_eps() override { + return 20.0f; + } + + bool grad_precise() override { + return true; + } +}; + +// GGML_OP_LOG +struct test_log : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_log(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_log(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + // log(1) == 0, cluster values there to keep the sum low for better precision in the backwards pass: + init_tensor_uniform(t, 0.9f, 1.1f); + } + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_SIN @@ -1170,20 +1796,37 @@ struct test_sin : public test_case { } test_sin(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) + std::array ne = {10, 2, 2, 2}) : type(type), ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_sin(ctx, a); + ggml_set_name(out, "out"); + return out; } void initialize_tensors(ggml_context * ctx) override { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t, -100.0f, 100.0f); + init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. 
} } + + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_COS @@ -1196,20 +1839,37 @@ struct test_cos : public test_case { } test_cos(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) + std::array ne = {10, 2, 2, 2}) : type(type), ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_cos(ctx, a); + ggml_set_name(out, "out"); + return out; } void initialize_tensors(ggml_context * ctx) override { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t, -100.0f, 100.0f); + init_tensor_uniform(t, -6.5f, 6.5f); // Covers interval [-2*pi, 2*pi]. } } + + double max_maa_err() override { + return 1e-3; + } + + float grad_eps() override { + return 0.2f; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_CLAMP @@ -1224,15 +1884,27 @@ struct test_clamp : public test_case { } test_clamp(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}, + std::array ne = {10, 5, 4, 3}, float min = -0.5f, float max = 0.5f) : type(type), ne(ne), min(min), max(max) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_clamp(ctx, a, min, max); + ggml_set_name(out, "out"); + return out; } + + float grad_eps() override { + return 1e-2f; + } + + std::vector grad_expect() override { + return {0.0f, 1.0f}; + } }; // GGML_OP_DIAG_MASK_INF @@ -1246,13 +1918,18 @@ struct test_diag_mask_inf : public test_case { } test_diag_mask_inf(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}, + std::array ne = {10, 10, 3, 2}, int n_past = 5) : type(type), ne(ne), n_past(n_past) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past); + ggml_set_name(out, "out"); + return out; } }; @@ -1276,7 +1953,7 @@ struct test_soft_max : public test_case { } test_soft_max(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}, + std::array ne = {10, 5, 4, 3}, bool mask = false, float scale = 1.0f, float max_bias = 0.0f) @@ -1284,13 +1961,24 @@ struct test_soft_max : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * mask = nullptr; if (this->mask) { mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]); + ggml_set_name(mask, "mask"); } + ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias); + ggml_set_name(out, "out"); + return out; } + + bool grad_precise() override { + return true; + } }; @@ -1312,7 +2000,7 @@ struct test_rope : public test_case { } test_rope(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {10, 10, 10, 1}, + std::array ne_a = {10, 5, 3, 1}, int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0) : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {} @@ -1321,13 +2009,29 @@ struct test_rope : public test_case { if (v & 1) { auto ne = ne_a; 
ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); } else { a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); } + ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]); - ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr; + ggml_set_name(pos, "pos"); + + ggml_tensor * freq = nullptr; + if (ff) { + freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2); + ggml_set_name(freq, "freq"); + } + ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + ggml_set_name(out, "out"); + return out; } @@ -1350,6 +2054,14 @@ struct test_rope : public test_case { } } } + + double max_maa_err() override { + return 1e-3; + } + + bool grad_precise() override { + return true; + } }; // GGML_OP_POOL2D @@ -1381,7 +2093,12 @@ struct test_pool2d : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data()); + ggml_set_param(ctx, input); + ggml_set_name(input, "input"); + ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1); + ggml_set_name(out, "out"); + return out; } }; @@ -1406,8 +2123,14 @@ struct test_conv_transpose_1d : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data()); + ggml_set_name(input, "input"); + ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0); + ggml_set_name(out, "out"); + return out; } }; @@ -1446,8 +2169,15 @@ struct test_im2col : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data()); + ggml_set_param(ctx, input); + ggml_set_name(input, "input"); + ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data()); + ggml_set_name(kernel, "kernel"); + ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type); + ggml_set_name(out, "out"); + return out; } }; @@ -1465,8 +2195,8 @@ struct test_concat : public test_case { } test_concat(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {10, 10, 10, 10}, - int64_t ne_b_d = 10, + std::array ne_a = {10, 5, 5, 5}, + int64_t ne_b_d = 5, int dim = 2, int v = 0) : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {} @@ -1477,19 +2207,30 @@ struct test_concat : public test_case { if (v & 1) { auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3; a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0); + ggml_set_name(a, "view_of_a"); } else { a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); } ggml_tensor * b; if (v & 2) { auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4; b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(b, "b"); + b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0); + ggml_set_name(b, "view_of_b"); } else { b = ggml_new_tensor(ctx, type, 4, ne_b.data()); + ggml_set_name(b, "b"); } + ggml_tensor * 
out = ggml_concat(ctx, a, b, dim); + ggml_set_name(out, "out"); + return out; } }; @@ -1511,7 +2252,11 @@ struct test_argsort : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_argsort(ctx, a, order); + ggml_set_name(out, "out"); + return out; } @@ -1544,6 +2289,35 @@ struct test_argsort : public test_case { } }; +// GGML_OP_SUM +struct test_sum : public test_case { + const ggml_type type; + const std::array ne; + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_sum(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_sum(ctx, a); + ggml_set_name(out, "out"); + + return out; + } + + float grad_eps() override { + return 0.1f * sqrtf(ne[0]*ne[1]*ne[2]*ne[3]); + } +}; + // GGML_OP_SUM_ROWS struct test_sum_rows : public test_case { const ggml_type type; @@ -1554,12 +2328,17 @@ struct test_sum_rows : public test_case { } test_sum_rows(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) + std::array ne = {10, 5, 4, 3}) : type(type), ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_sum_rows(ctx, a); + ggml_set_name(out, "out"); + return out; } }; @@ -1582,8 +2361,16 @@ struct test_upscale : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); - if (transpose) a = ggml_transpose(ctx, a); + ggml_set_name(a, "a"); + + if (transpose) { + a = ggml_transpose(ctx, a); + ggml_set_name(a, "a_transposed"); + } + ggml_tensor * out = ggml_upscale(ctx, a, scale_factor); + ggml_set_name(out, "out"); + return out; } }; @@ -1605,7 +2392,11 @@ struct test_upscale_ext : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]); + ggml_set_name(out, "out"); + return out; } }; @@ -1629,7 +2420,11 @@ struct test_group_norm : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps); + ggml_set_name(out, "out"); + return out; } }; @@ -1645,14 +2440,22 @@ struct test_acc : public test_case { } test_acc(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {1024, 577, 1, 1}, - std::array ne_b = {1024, 576, 1, 1}) + std::array ne_a = {256, 17, 1, 1}, + std::array ne_b = {256, 16, 1, 1}) : type(type), ne_a(ne_a), ne_b(ne_b) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_param(ctx, a); + ggml_set_name(a, "a"); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data()); + ggml_set_param(ctx, b); + ggml_set_name(b, "b"); + ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]); + ggml_set_name(out, "out"); + return out; } }; @@ -1675,7 +2478,11 @@ struct test_pad : public test_case { ggml_tensor * 
build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0); + ggml_set_name(out, "out"); + return out; } }; @@ -1697,6 +2504,8 @@ struct test_arange : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * out = ggml_arange(ctx, start, stop, step); + ggml_set_name(out, "out"); + return out; } }; @@ -1719,7 +2528,11 @@ struct test_timestep_embedding : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period); + ggml_set_name(out, "out"); + return out; } }; @@ -1735,13 +2548,17 @@ struct test_leaky_relu : public test_case { } test_leaky_relu(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {10, 10, 10, 10}, + std::array ne_a = {10, 5, 4, 3}, float negative_slope = 0.1f) : type(type), ne_a(ne_a), negative_slope(negative_slope) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); + ggml_set_name(a, "a"); + ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true); + ggml_set_name(out, "out"); + return out; } }; @@ -1768,19 +2585,37 @@ struct test_flash_attn_ext : public test_case { return 5e-4; } - test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16) + test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, + bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16) : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {} ggml_tensor * build_graph(ggml_context * ctx) override { const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV)); ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1); + ggml_set_name(q, "q"); + ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); + ggml_set_name(k, "k"); + ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); - ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr; + ggml_set_name(v, "v"); + + ggml_tensor * m = nullptr; + if (mask) { + m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1); + ggml_set_name(m, "m"); + } + ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias, logit_softcap); + ggml_set_name(out, "out"); + return out; } + + bool grad_precise() override { + return true; + } }; // GGML_OP_CROSS_ENTROPY_LOSS @@ -1793,15 +2628,42 @@ struct test_cross_entropy_loss : public test_case { } test_cross_entropy_loss(ggml_type type = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 10}) + std::array ne = {10, 5, 4, 3}) : type(type), ne(ne) {} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(ctx, logits); + ggml_set_name(logits, "logits"); + ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data()); + // The labels are assumed to be constant -> no gradients. 
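// Aside (illustration only, not part of this patch): the hunk below normalizes
// the random labels with a softmax so they form a probability distribution,
// which is what a cross entropy loss  L = -sum_i p_i * log(q_i)  expects.
// This standalone sketch shows that computation on plain floats; it is not the
// ggml implementation, and the numbers are arbitrary examples.
#include <cmath>
#include <cstdio>

static void softmax4(const float * in, float * out) {
    float sum = 0.0f;
    for (int i = 0; i < 4; ++i) { out[i] = std::exp(in[i]); sum += out[i]; }
    for (int i = 0; i < 4; ++i) { out[i] /= sum; }
}

int main() {
    const float logits[4] = { 1.0f, -2.0f, 0.5f, 3.0f };
    const float labels[4] = { 2.0f,  1.0f, 0.0f, 1.0f }; // not yet normalized

    float p[4]; // normalized labels, mirrors labels = ggml_soft_max(ctx, labels)
    float q[4]; // predicted distribution, softmax over the logits
    softmax4(labels, p);
    softmax4(logits, q);

    float loss = 0.0f;
    for (int i = 0; i < 4; ++i) { loss -= p[i] * std::log(q[i]); }
    std::printf("cross entropy loss = %.6f\n", loss);
    return 0;
}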
+ ggml_set_name(labels, "labels"); + + // Ensure labels add up to 1: + labels = ggml_soft_max(ctx, labels); + ggml_set_name(labels, "labels_normalized"); + ggml_tensor * out = ggml_cross_entropy_loss(ctx, logits, labels); + ggml_set_name(out, "out"); + return out; } + + void initialize_tensors(ggml_context * ctx) override { + // For larger abs. diffs between logits softmax is more linear, therefore more precise num. gradients. + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -100.0f, 100.0f); + } + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } }; enum llm_norm_type { @@ -2188,6 +3050,12 @@ struct test_falcon : public test_llm { } }; + +// ########################################### +// ## Section 3: GGML Op Test Instantiation ## +// ########################################### + + static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { std::vector> test_cases; std::default_random_engine rng(0); @@ -2230,8 +3098,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op // unary ops for (int v : {0, 1}) { for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { - test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v)); - test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v)); + test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 2, 2, 2 }, v)); + test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 5, 7, 11, 13 }, v)); } } @@ -2267,11 +3135,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } } + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); // test cases for 1D im2col - test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); + test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false)); // sycl backend will limit task global_range < MAX_INT // test cases for 2D im2col with large input W and H (occurs in stable-diffusion) @@ -2290,13 +3160,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1})); - 
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, 3}, {2, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, 3}, {1, 1, 1, 2})); test_cases.emplace_back(new test_dup(GGML_TYPE_F32)); test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); @@ -2309,6 +3179,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { + test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); + } + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); @@ -2322,6 +3196,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } test_cases.emplace_back(new test_cont()); + test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1})); + test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5})); + test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7})); + test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1})); + test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5})); + test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7})); + test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1})); + test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5})); + test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7})); auto add_test_bin_bcast = [&](ggml_type type, std::array ne, std::array nr) { for (auto op : {ggml_add, ggml_mul, ggml_div}) { @@ -2332,16 +3215,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1}); add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 1, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 1}, {1, 1, 1, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, 
{2, 1, 1, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 2}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 2, 2}); + add_test_bin_bcast(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 2, 2, 2}); // stable diffusion add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1}); @@ -2360,11 +3243,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1}); //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1}); + test_cases.emplace_back(new test_add1()); test_cases.emplace_back(new test_scale()); for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) { - test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); - test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); + test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); + test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); } test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1})); @@ -2468,13 +3352,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_sqr()); test_cases.emplace_back(new test_sqrt()); + test_cases.emplace_back(new test_log()); test_cases.emplace_back(new test_sin()); test_cases.emplace_back(new test_cos()); test_cases.emplace_back(new test_clamp()); - test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); - test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5)); - test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5)); + test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); + test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 1}, 5)); + test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 3, 2}, 5)); #if 0 std::uniform_int_distribution<> dist_ne1(1, 50); @@ -2518,23 +3403,23 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (float af : { 1.0f, 1.4245f }) { for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { for (bool ff : {false, true}) { // freq_factors - test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B + test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B if (all) { - test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B - test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B - test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B + test_cases.emplace_back(new test_rope(type, {128, 40, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B + test_cases.emplace_back(new test_rope(type, {128, 52, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B + test_cases.emplace_back(new test_rope(type, {128, 64, 2, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B } if (all) { - test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 
64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) - test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm) - test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2) + test_cases.emplace_back(new test_rope(type, { 64, 1, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 71, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) + test_cases.emplace_back(new test_rope(type, { 64, 8, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm) + test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2) } - test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) + test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) } } @@ -2558,6 +3443,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen } + test_cases.emplace_back(new test_sum()); test_cases.emplace_back(new test_sum_rows()); test_cases.emplace_back(new test_upscale()); test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true)); @@ -2600,6 +3486,18 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op #endif // run tests + if (mode == MODE_GRAD) { + size_t n_ok = 0; + for (auto & test : test_cases) { + if (test->eval_grad(backend, op_name)) { + n_ok++; + } + } + printf(" %zu/%zu tests passed\n", n_ok, test_cases.size()); + + return n_ok == test_cases.size(); + } + if (mode == MODE_TEST) { ggml_backend_t backend_cpu = ggml_backend_cpu_init(); @@ -2628,8 +3526,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op static void usage(char ** argv) { printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]); - printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n"); - printf(" op names are as given by ggml_op_desc()\n"); + printf(" valid modes:\n"); + printf(" - test (default, compare with CPU backend for correctness)\n"); + printf(" - perf (performance evaluation)\n"); + printf(" - grad (compare gradients from backpropagation with method of finite differences)\n"); + printf(" op names are as given by ggml_op_desc() (e.g. 
GGML_ADD)\n"); } int main(int argc, char ** argv) { @@ -2642,6 +3543,8 @@ int main(int argc, char ** argv) { mode = MODE_TEST; } else if (strcmp(argv[i], "perf") == 0) { mode = MODE_PERF; + } else if (strcmp(argv[i], "grad") == 0) { + mode = MODE_GRAD; } else if (strcmp(argv[i], "-o") == 0) { if (i + 1 < argc) { op_name_filter = argv[++i]; @@ -2679,7 +3582,7 @@ int main(int argc, char ** argv) { ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL); GGML_ASSERT(backend != NULL); - if (backend_filter == NULL && ggml_backend_is_cpu(backend)) { + if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) { printf(" Skipping CPU backend\n"); ggml_backend_free(backend); n_ok++; diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 9c4e7d18e..5cc0cdb04 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -2,33 +2,18 @@ #undef NDEBUG #endif -#define LLAMA_API_INTERNAL - -#include "ggml.h" -#include "llama.h" -#include "grammar-parser.h" -#include "json-schema-to-grammar.h" #include "unicode.h" +#include "llama-grammar.h" +#include "json-schema-to-grammar.h" + #include #include #include using json = nlohmann::ordered_json; -static llama_grammar* build_grammar(const std::string & grammar_str) { - auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); - - // Ensure we parsed correctly - assert(!parsed_grammar.rules.empty()); - - // Ensure we have a root node - assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end())); - - std::vector grammar_rules(parsed_grammar.c_rules()); - llama_grammar* grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - - return grammar; +static llama_grammar * build_grammar(const std::string & grammar_str) { + return llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); } static bool test_build_grammar_fails(const std::string & grammar_str) { @@ -45,25 +30,23 @@ static bool test_build_grammar_fails(const std::string & grammar_str) { } static bool match_string(const std::string & input, llama_grammar * grammar) { - auto decoded = decode_utf8(input, {}); - - const auto & code_points = decoded.first; + const auto cpts = unicode_cpts_from_utf8(input); const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); + llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); - for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy + for (const auto & cpt : cpts) { + const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy - llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); + llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); - if (cur_stacks.empty()) { + if (stacks_cur.empty()) { // no stacks means that the grammar failed to match at this point return false; } } - for (const auto & stack : cur_stacks) { + for (const auto & stack : stacks_cur) { if (stack.empty()) { // An empty stack means that the grammar has been completed return true; @@ -77,12 +60,12 @@ static void test(const std::string & test_desc, const std::string & grammar_str, fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str()); fflush(stderr); - auto grammar = build_grammar(grammar_str); + auto * grammar = build_grammar(grammar_str); 
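// Aside (illustration only, not part of this patch): the refactored
// match_string() above feeds one Unicode code point at a time into
// llama_grammar_accept(); "no remaining stacks" means the input was rejected,
// while reaching an empty stack means the grammar completed. The toy matcher
// below mirrors that control flow for the single rule  root ::= [0-9]+
// without any llama.cpp internals; every name in it is made up for the example.
#include <cstdio>
#include <string>

static bool toy_match_digits(const std::string & input) {
    bool can_complete = false; // analogous to reaching an empty stack
    for (const unsigned char c : input) {
        if (c < '0' || c > '9') {
            return false; // analogous to stacks_cur.empty(): no continuation possible
        }
        can_complete = true;
    }
    return can_complete;
}

int main() {
    std::printf("12345 -> %d\n", toy_match_digits("12345") ? 1 : 0); // accepted
    std::printf("12a45 -> %d\n", toy_match_digits("12a45") ? 1 : 0); // rejected
    return 0;
}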
// Save the original grammar stacks so that we can reset after every new string we want to test - const llama_grammar_stacks original_stacks = llama_grammar_get_stacks(grammar); + const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); - llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); + llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); fprintf(stderr, " 🔵 Valid strings:\n"); @@ -119,7 +102,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str, assert(matched); // Reset the grammar stacks - cur_stacks = original_stacks; + stacks_cur = stacks_org; } fprintf(stderr, " 🟠 Invalid strings:\n"); @@ -139,11 +122,11 @@ static void test(const std::string & test_desc, const std::string & grammar_str, assert(!matched); // Reset the grammar stacks - cur_stacks = original_stacks; + stacks_cur = stacks_org; } // Clean up allocated memory - llama_grammar_free(grammar); + llama_grammar_free_impl(grammar); } static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings); @@ -683,7 +666,8 @@ static void test_failure_missing_root() { term ::= number number ::= [0-9]+)"""; - grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + llama_grammar_parser parsed_grammar; + parsed_grammar.parse(grammar_str.c_str()); // Ensure we parsed correctly assert(!parsed_grammar.rules.empty()); @@ -705,7 +689,8 @@ static void test_failure_missing_reference() { fprintf(stderr, " Expected error: "); - grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + llama_grammar_parser parsed_grammar; + parsed_grammar.parse(grammar_str.c_str()); // Ensure we did NOT parsed correctly assert(parsed_grammar.rules.empty()); diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 5df5abb25..259172d99 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -3,7 +3,7 @@ #endif #include "llama.h" -#include "grammar-parser.h" +#include "llama-grammar.h" #include @@ -22,7 +22,8 @@ static const char * type_str(llama_gretype type) { static void verify_parsing(const char *grammar_bytes, const std::vector> expected, const std::vector &expected_rules) { uint32_t index = 0; - grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_bytes); + llama_grammar_parser parsed_grammar; + parsed_grammar.parse(grammar_bytes); std::map symbol_names; for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { @@ -129,9 +130,10 @@ static void verify_parsing(const char *grammar_bytes, const std::vector #include #include #include -#include "json-schema-to-grammar.h" -#include "grammar-parser.h" - static std::string trim(const std::string & source) { std::string s(source); s.erase(0,s.find_first_not_of(" \n\r\t")); @@ -40,7 +41,8 @@ struct TestCase { } void verify_expectation_parseable() const { try { - auto state = grammar_parser::parse(expected_grammar.c_str()); + llama_grammar_parser state; + state.parse(expected_grammar.c_str()); if (state.symbol_ids.find("root") == state.symbol_ids.end()) { throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar); } diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index 1f3a267b3..6f1374ca8 100644 --- a/tests/test-llama-grammar.cpp 
+++ b/tests/test-llama-grammar.cpp @@ -2,16 +2,15 @@ #undef NDEBUG #endif -#define LLAMA_API_INTERNAL #include "llama.h" -#include "grammar-parser.h" +#include "llama-grammar.h" #include #include int main() { - grammar_parser::parse_state parsed_grammar; + llama_grammar_parser parsed_grammar; std::vector> expected = { {"expr", 2}, @@ -117,7 +116,7 @@ int main() llama_grammar * grammar = NULL; std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); if (grammar == nullptr) { throw std::runtime_error("Failed to initialize llama_grammar"); @@ -174,13 +173,13 @@ int main() }}; auto index = 0; - for (auto stack : llama_grammar_get_stacks(grammar)) + for (const llama_grammar_stack & stack : llama_grammar_get_stacks(grammar)) { // compare stack to expected_stack for (uint32_t i = 0; i < stack.size(); i++) { - auto element = stack[i]; - auto expected_element = expected_stacks[index][i]; + const llama_grammar_element * element = stack[i]; + const llama_grammar_element & expected_element = expected_stacks[index][i]; // pretty print error message before asserting if (expected_element.type != element->type || expected_element.value != element->value) @@ -403,6 +402,8 @@ int main() delete[] candidate.code_points; candidate.code_points = nullptr; } - llama_grammar_free(grammar); + + llama_grammar_free_impl(grammar); + return 0; } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 6c2a5db9a..d738b7a45 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" +#include "llama-sampling.h" #ifdef NDEBUG #undef NDEBUG @@ -10,181 +11,199 @@ #include #include -static void dump(const llama_token_data_array * candidates) { - for (size_t i = 0; i < candidates->size; i++) { - printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); +static void dump(const llama_token_data_array * cur_p) { + for (size_t i = 0; i < cur_p->size; i++) { + printf("%d: %f (%f)\n", cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); } } -#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) +#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0) + +#define APPLY(__cnstr, __cur_p) do { \ + auto * cnstr = (__cnstr); \ + llama_sampler_apply(cnstr, (__cur_p)); \ + llama_sampler_free(cnstr); \ +} while(0) static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); + + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - llama_sample_top_k(nullptr, &candidates_p, k, 1); - DUMP(&candidates_p); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + APPLY(llama_sampler_init_softmax(), &cur_p); + 
DUMP(&cur_p); + APPLY(llama_sampler_init_top_k(k), &cur_p); + DUMP(&cur_p); - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5); + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5); } } static void test_top_p(const std::vector & probs, const std::vector & expected_probs, float p) { const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); + + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - llama_sample_top_p(nullptr, &candidates_p, p, 1); - DUMP(&candidates_p); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + APPLY(llama_sampler_init_softmax(), &cur_p); + DUMP(&cur_p); + APPLY(llama_sampler_init_top_p(p, 1), &cur_p); + DUMP(&cur_p); - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); } } static void test_tfs(const std::vector & probs, const std::vector & expected_probs, float z) { const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); + + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - DUMP(&candidates_p); - llama_sample_tail_free(nullptr, &candidates_p, z, 1); - DUMP(&candidates_p); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + DUMP(&cur_p); + APPLY(llama_sampler_init_tail_free(z, 1), &cur_p); + DUMP(&cur_p); - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); } } static void test_min_p(const std::vector & probs, const std::vector & expected_probs, float p) { const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); + + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - DUMP(&candidates_p); - llama_sample_min_p(nullptr, &candidates_p, p, 1); - DUMP(&candidates_p); - 
llama_sample_softmax(nullptr, &candidates_p); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + DUMP(&cur_p); + APPLY(llama_sampler_init_min_p(p, 1), &cur_p); + DUMP(&cur_p); + APPLY(llama_sampler_init_softmax(), &cur_p); - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); } } static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) { const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); + + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - DUMP(&candidates_p); - llama_sample_typical(nullptr, &candidates_p, p, 1); - DUMP(&candidates_p); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + DUMP(&cur_p); + APPLY(llama_sampler_init_typical(p, 1), &cur_p); + DUMP(&cur_p); - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); } } -static void test_repetition_penalties( +static void test_penalties( const std::vector & probs, const std::vector & last_tokens, const std::vector & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence ) { GGML_ASSERT(probs.size() == expected_probs.size()); const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); + + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); + auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false); + + for (size_t i = 0; i < last_tokens.size(); i++) { + llama_sampler_accept(sampler, last_tokens[i]); + } + + APPLY(llama_sampler_init_softmax(), &cur_p); + DUMP(&cur_p); + APPLY(sampler, &cur_p); + APPLY(llama_sampler_init_softmax(), &cur_p); + DUMP(&cur_p); + + 
GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3); } } -static void test_sampler_queue( - const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p +static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p ) { - std::vector candidates; - candidates.reserve(n_vocab); + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { const float logit = logf(token_id); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; llama_token min_token_id = 0; const llama_token max_token_id = n_vocab-1; for (auto s : samplers_sequence) { switch (s){ - case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break; + case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break; case 'f': GGML_ABORT("tail_free test not implemented"); case 'y': GGML_ABORT("typical test not implemented"); - case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break; - case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break; + case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break; + case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break; case 't': GGML_ABORT("temperature test not implemented"); default : GGML_ABORT("Unknown sampler"); } - llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests + APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests - const int size = candidates_p.size; + const int size = cur_p.size; if (s == 'k') { const int expected_size = std::min(size, top_k); min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k)); GGML_ASSERT(size == expected_size); - GGML_ASSERT(candidates_p.data[0].id == max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + GGML_ASSERT(cur_p.data[0].id == max_token_id); + GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id); } else if (s == 'p') { const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2; const int softmax_numerator_target = ceilf(top_p * softmax_divisor); @@ -206,8 +225,8 @@ static void test_sampler_queue( } GGML_ASSERT(size == expected_size); - GGML_ASSERT(candidates_p.data[0].id == max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + GGML_ASSERT(cur_p.data[0].id == max_token_id); + GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id); } else if (s == 'm') { int expected_size = ceilf((1.0f-min_p) * n_vocab); expected_size = std::max(expected_size, 1); @@ -219,14 +238,14 @@ static void test_sampler_queue( min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1)); GGML_ASSERT(size == expected_size); - GGML_ASSERT(candidates_p.data[0].id == max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); + GGML_ASSERT(cur_p.data[0].id == max_token_id); + GGML_ASSERT(cur_p.data[expected_size-1].id == min_token_id); } else { GGML_ABORT("fatal error"); } } - printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n", + printf("Sampler 
queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
           samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }
@@ -259,13 +278,13 @@ int main(void) {
     test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
     test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
 
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
 
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
+    test_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
 
     test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
     test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
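// Aside (illustration only, not part of this patch): test_sampler_queue()
// chains samplers such as top-k/top-p/min-p through the APPLY() helper and
// re-applies a softmax after every step so the surviving candidates stay
// sorted and renormalized. The standalone sketch below reproduces just the
// top-k + softmax step on plain floats; it does not use the llama_sampler API
// and its values are arbitrary examples.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

int main() {
    std::vector<float> logits = { 0.1f, 2.0f, 0.5f, 1.5f, 3.0f };
    const size_t k = 3;

    // top-k: keep only the k largest logits (sorted in descending order)
    std::sort(logits.begin(), logits.end(), std::greater<float>());
    logits.resize(std::min(k, logits.size()));

    // softmax over the survivors so the probabilities sum to 1 again
    float sum = 0.0f;
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) { probs[i] = std::exp(logits[i]); sum += probs[i]; }
    for (float & p : probs) { p /= sum; }

    for (size_t i = 0; i < probs.size(); ++i) { std::printf("p[%zu] = %.4f\n", i, probs[i]); }
    return 0;
}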