diff --git a/README.md b/README.md index 9d2a59d89..09e8cad31 100644 --- a/README.md +++ b/README.md @@ -598,7 +598,7 @@ Building the program with BLAS support may lead to some performance improvements To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. -Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives. +Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. ```bash diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 0ec8d6d8d..171530c91 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -84,4 +84,4 @@ endif () target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/common.cpp b/common/common.cpp index c8df9a4ce..d2a8bb69e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); + params = params_org; return false; } @@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--in-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.in_files.push_back(argv[i]); + return true; + } if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; @@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-v" || arg == "--verbose") { - params.verbose = true; + params.verbosity = 1; + return true; + } + if (arg == "--verbosity") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.verbosity = std::stoi(argv[i]); return true; } if (arg == "--verbose-prompt") { @@ -1391,6 +1414,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.timeout_write = std::stoi(argv[i]); return true; } + if (arg == "--threads-http") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_http = std::stoi(argv[i]); + return true; + } if (arg == "-spf" || arg == "--system-prompt-file") { if (++i >= argc) { invalid_param = true; @@ -1460,6 +1491,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.chat_template = argv[i]; return true; } + if (arg == "--slot-prompt-similarity" || arg == "-sps") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.slot_prompt_similarity = std::stof(argv[i]); + return true; + } if (arg == "-pps") { params.is_pp_shared = true; return true; @@ -1537,6 +1576,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.i_pos = 
std::stoi(argv[i]); return true; } + if (arg == "-o" || arg == "--output" || arg == "--output-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.out_file = argv[i]; + return true; + } + if (arg == "-ofreq" || arg == "--output-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_out_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--save-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_save_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--process-output") { + params.process_output = true; + return true; + } + if (arg == "--no-ppl") { + params.compute_ppl = false; + return true; + } + if (arg == "--chunk" || arg == "--from-chunk") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.i_chunk = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1612,6 +1691,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); options.push_back({ "*", " --version", "show version and build info" }); options.push_back({ "*", "-v, --verbose", "print verbose information" }); + options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); @@ -1637,6 +1717,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false" }); options.push_back({ "*", " --no-escape", "do not process escape sequences" }); @@ -1804,6 +1885,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + options.push_back({ "imatrix" }); + options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); + options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); + options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); + options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); + options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); + options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); + options.push_back({ "bench" }); options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); @@ -1820,6 +1909,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); + options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); options.push_back({ "server", " --system-prompt-file FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); options.push_back({ "server", " --log-format {text,json}", @@ -1831,6 +1921,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "set custom jinja chat template (default: template taken from model's metadata)\n" "only commonly used templates are accepted:\n" "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", + "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); diff --git a/common/common.h b/common/common.h index e0a08a61b..038f9084f 100644 --- a/common/common.h +++ b/common/common.h @@ -56,43 +56,42 @@ struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 0; // context size - int32_t n_batch = 2048; // logical batch size for prompt processing (must be 
>=32 to use BLAS) - int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - float p_split = 0.1f; // speculative decoding split probability - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - int32_t grp_attn_n = 1; // group-attention factor - int32_t grp_attn_w = 512; // group-attention width - int32_t n_print = -1; // print token count every n tokens (-1 = disabled) - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 0; // context size + int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 5; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. 
+ int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold - std::string rpc_servers = ""; // comma separated list of RPC servers ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings @@ -114,7 +113,9 @@ struct gpt_params { std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding std::string logits_file = ""; // file for saving *all* logits + std::string rpc_servers = ""; // comma separated list of RPC servers + std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; @@ -124,23 +125,24 @@ struct gpt_params { std::vector control_vectors; // control vector with user defined scale + int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector - int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. - int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line - // (which is more convenient to use for plotting) - // - bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt - size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. + int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + // (which is more convenient to use for plotting) + // + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score - bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt - size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. 
If 0, all tasks will be computed + bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt + size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed - bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt - size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed - bool kl_divergence = false; // compute KL divergence + bool kl_divergence = false; // compute KL divergence bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs @@ -163,7 +165,6 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool verbose = false; bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode @@ -180,10 +181,10 @@ struct gpt_params { std::vector image; // path to image file(s) // server params - int32_t port = 8080; - int32_t timeout_read = 600; - int32_t timeout_write = timeout_read; - int32_t n_threads_http = -1; + int32_t port = 8080; // server listens on this network port + int32_t timeout_read = 600; // http read timeout in seconds + int32_t timeout_write = timeout_read; // http write timeout in seconds + int32_t n_threads_http = -1; // number of threads to process HTTP requests std::string hostname = "127.0.0.1"; std::string public_path = ""; @@ -202,6 +203,8 @@ struct gpt_params { std::string slot_save_path; + float slot_prompt_similarity = 0.5f; + // batched-bench params bool is_pp_shared = false; @@ -219,6 +222,16 @@ struct gpt_params { // passkey params int32_t n_junk = 250; // number of times to repeat the junk text int32_t i_pos = -1; // position of the passkey in the junk text + + // imatrix params + std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file + + int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations + int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations + int32_t i_chunk = 0; // start processing from this chunk + + bool process_output = false; // collect data for the output tensor + bool compute_ppl = true; // whether to compute perplexity }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 79d2b0354..a518b766d 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -266,6 +266,10 @@ namespace grammar_parser { throw std::runtime_error(std::string("expecting ')' at ") + pos); } pos = parse_space(pos + 1, is_nested); + } else if (*pos == '.') { // any char + last_sym_start = out_elements.size(); + out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); + pos = parse_space(pos + 1, is_nested); } else if (*pos == '*') { pos = parse_space(pos + 1, is_nested); handle_repetitions(0, -1); @@ -401,6 +405,7 @@ namespace grammar_parser { case LLAMA_GRETYPE_CHAR_NOT: return true; case LLAMA_GRETYPE_CHAR_ALT: return true; case 
LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; + case LLAMA_GRETYPE_CHAR_ANY: return true; default: return false; } } @@ -415,6 +420,7 @@ namespace grammar_parser { case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break; case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break; case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break; + case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break; } switch (elem.type) { case LLAMA_GRETYPE_END: @@ -426,6 +432,7 @@ namespace grammar_parser { case LLAMA_GRETYPE_CHAR_NOT: case LLAMA_GRETYPE_CHAR_RNG_UPPER: case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "(\""); print_grammar_char(file, elem.value); fprintf(file, "\") "); @@ -483,11 +490,15 @@ namespace grammar_parser { } print_grammar_char(file, elem.value); break; + case LLAMA_GRETYPE_CHAR_ANY: + fprintf(file, "."); + break; } if (is_char_element(elem)) { switch (rule[i + 1].type) { case LLAMA_GRETYPE_CHAR_ALT: case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ANY: break; default: fprintf(file, "] "); diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index e04feeae3..881f0451c 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) { int n; if (str.back() == 'M') { sscanf(str.c_str(), "%d", &n); - n_bytes = (size_t)n * 1024 * 1024; // megabytes + n_bytes = (size_t)n * 1000 * 1000; // megabytes } else if (str.back() == 'G') { sscanf(str.c_str(), "%d", &n); - n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes + n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes } else { throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); } @@ -284,7 +284,7 @@ struct split_strategy { struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); total_size += ggml_nbytes(t); } - total_size = total_size / 1024 / 1024; // convert to megabytes + total_size = total_size / 1000 / 1000; // convert to megabytes printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 458c01b87..866ca9f56 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/ ## Usage ``` -./imatrix -m -f [-o ] [--verbosity ] - [-ofreq num_chunks] [-ow <0 or 1>] [other common params] +./imatrix \ + -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ + [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ + [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] ``` Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory. The parameters in square brackets are optional and have the following meaning: * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. 
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) -* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. +* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) +* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. For faster computation, make sure to use GPU offloading via the `-ngl` argument diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e050c09d2..e18f49563 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -17,39 +17,37 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" + " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" + " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); + LOG_TEE("\n"); +} + struct Stats { std::vector values; std::vector counts; int ncall = 0; }; -struct StatParams { - std::string dataset; - std::string ofile = "imatrix.dat"; - int n_output_frequency = 10; - int verbosity = 1; - int keep_every = 0; - bool collect_output_weight = false; -}; - class IMatrixCollector { public: IMatrixCollector() = default; - void set_parameters(StatParams&& params) { m_params = std::move(params); } + void set_params(gpt_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); - void save_imatrix() const; - bool load_imatrix(const char * file_name, bool add); - static bool load_imatrix(const char * file_name, std::unordered_map& imatrix); + void save_imatrix(int ncall = -1) const; + bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; - StatParams m_params; + gpt_params m_params; std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id - // - void save_imatrix(const char * file_name, const char * dataset) const; - void keep_imatrix(int ncall) const; }; // remove any prefix and suffixes from the name @@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * if (t->op != GGML_OP_MUL_MAT) return false; // why are small batches ignored (<16 tokens)? if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; + if (!(wname.substr(0, 4) == "blk." 
|| (m_params.process_output && wname == "output.weight"))) return false; return true; } @@ -153,21 +151,25 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; + if (!std::isfinite(e.values[e_start + j])) { + fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); + exit(1); + } } } } if (e.ncall > m_last_call) { m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { + if (m_last_call % m_params.n_out_freq == 0) { save_imatrix(); } - if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { - keep_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { + save_imatrix(m_last_call); } } } } else { - auto& e = m_stats[wname]; + auto & e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0); @@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; e.counts[j]++; + if (!std::isfinite(e.values[j])) { + fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); + exit(1); + } } } if (e.ncall > m_last_call) { m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { + if (m_last_call % m_params.n_out_freq == 0) { save_imatrix(); } - if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { - keep_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { + save_imatrix(m_last_call); } } } @@ -201,19 +207,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * return true; } -void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.ofile.empty() ? 
"imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); -} +void IMatrixCollector::save_imatrix(int ncall) const { + auto fname = m_params.out_file; + if (fname.empty()) { + fname = "imatrix.dat"; + } -void IMatrixCollector::keep_imatrix(int ncall) const { - auto file_name = m_params.ofile; - if (file_name.empty()) file_name = "imatrix.dat"; - file_name += ".at_"; - file_name += std::to_string(ncall); - save_imatrix(file_name.c_str(), m_params.dataset.c_str()); -} + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } -void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); out.write((const char *) &n_entries, sizeof(n_entries)); @@ -236,26 +240,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co // Write the number of call the matrix was computed with out.write((const char *) &m_last_call, sizeof(m_last_call)); - // Write the dataset name at the end of the file to later on specify it in quantize - int n_dataset = strlen(dataset); - out.write((const char *) &n_dataset, sizeof(n_dataset)); - out.write(dataset, n_dataset); + // Write the input filename at the end of the file to later on specify it in quantize + { + int len = m_params.prompt_file.size(); + out.write((const char *) &len, sizeof(len)); + out.write(m_params.prompt_file.c_str(), len); + } if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } } -bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map& imatrix_data) { - std::ifstream in(imatrix_file, std::ios::binary); +bool IMatrixCollector::load_imatrix(const char * fname) { + std::ifstream in(fname, std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__,imatrix_file); + printf("%s: failed to open %s\n",__func__, fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, imatrix_file); + printf("%s: no data in file %s\n", __func__, fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -263,23 +269,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); return false; } name_as_vec[len] = 0; std::string name{name_as_vec.data()}; - auto& e = imatrix_data[std::move(name)]; + auto & e = m_stats[std::move(name)]; int ncall; in.read((char*)&ncall, sizeof(ncall)); int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { printf("%s: failed reading number of values for entry %d\n",__func__,i); - imatrix_data = {}; + m_stats = {}; return false; } - // When re-called from load_imatrix() with add set, this will already be created. 
if (e.values.empty()) { e.values.resize(nval, 0); e.counts.resize(nval, 0); @@ -289,7 +294,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { printf("%s: failed reading data for entry %d\n",__func__,i); - imatrix_data = {}; + m_stats = {}; return false; } @@ -304,13 +309,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma return true; } -bool IMatrixCollector::load_imatrix(const char * file_name, bool add) { - if (!add) { - m_stats.clear(); - } - return load_imatrix(file_name, m_stats); -} - static IMatrixCollector g_collector; static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -324,7 +322,7 @@ struct results_log_softmax { float prob; }; -static std::vector softmax(const std::vector& logits) { +static std::vector softmax(const std::vector & logits) { std::vector probs(logits.size()); float max_logit = logits[0]; for (float v : logits) { @@ -358,8 +356,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to static void process_logits( int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, - double & nll, double & nll2, float * logit_history, float * prob_history -) { + double & nll, double & nll2, float * logit_history, float * prob_history) { std::mutex mutex; int counter = 0; auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { @@ -391,8 +388,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { - +static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); const int n_ctx = llama_n_ctx(ctx); @@ -405,13 +401,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); - if (from_chunk > 0) { - if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); + if (params.i_chunk > 0) { + if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { + fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); return false; } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); - tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); } if (int(tokens.size()) < 2*n_ctx) { @@ -424,7 +420,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool std::vector logit_history; std::vector prob_history; - if (compute_ppl) { + if (params.compute_ppl) { logit_history.resize(tokens.size()); prob_history.resize(tokens.size()); } @@ -446,7 +442,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool const int num_batches = (n_ctx + n_batch - 1) / n_batch; std::vector logits; - if (compute_ppl && num_batches > 1) { + if 
(params.compute_ppl && num_batches > 1) { logits.reserve((size_t)n_ctx * n_vocab); } @@ -482,7 +478,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool // restore the original token in case it was set to BOS tokens[batch_start] = token_org; - if (compute_ppl && num_batches > 1) { + if (params.compute_ppl && num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } @@ -501,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - if (compute_ppl) { + if (params.compute_ppl) { const int first = n_ctx/2; const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, @@ -516,7 +512,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } printf("\n"); - if (compute_ppl) { + if (params.compute_ppl) { nll2 /= count; nll /= count; const double ppl = exp(nll); @@ -533,109 +529,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } int main(int argc, char ** argv) { - StatParams sparams; - std::string prev_result_file; - std::string combine_files; - bool compute_ppl = true; - int from_chunk = 0; - std::vector args; - args.push_back(argv[0]); - int iarg = 1; - for (; iarg < argc-1; ++iarg) { - std::string arg{argv[iarg]}; - if (arg == "-o" || arg == "--output-file") { - sparams.ofile = argv[++iarg]; - } - else if (arg == "-ofreq" || arg == "--output-frequency") { - sparams.n_output_frequency = std::stoi(argv[++iarg]); - } - else if (arg == "-ow" || arg == "--output-weight") { - sparams.collect_output_weight = std::stoi(argv[++iarg]); - } - else if (arg == "--verbosity") { - sparams.verbosity = std::stoi(argv[++iarg]); - } else if (arg == "--no-ppl") { - compute_ppl = false; - } else if (arg == "--keep-imatrix") { - sparams.keep_every = std::stoi(argv[++iarg]); - } else if (arg == "--continue-from") { - prev_result_file = argv[++iarg]; - } else if (arg == "--combine") { - combine_files = argv[++iarg]; - } - else if (arg == "--from-chunk") { - from_chunk = std::stoi(argv[++iarg]); - } else { - args.push_back(argv[iarg]); - } - } - if (iarg < argc) { - std::string arg{argv[iarg]}; - if (arg == "--no-ppl") { - compute_ppl = false; - } else { - args.push_back(argv[iarg]); - } - } - gpt_params params; - params.n_batch = 512; + + params.n_ctx = 512; + params.logits_all = true; + params.verbosity = 1; if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + print_usage(argc, argv, params); return 1; } - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); - print_build_info(); + g_collector.set_params(params); - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - - sparams.dataset = params.prompt_file; - g_collector.set_parameters(std::move(sparams)); - - if (!combine_files.empty()) { - std::vector files; - size_t pos = 0; - while (true) { - auto new_pos = combine_files.find(',', pos); - if (new_pos != std::string::npos) { - files.emplace_back(combine_files.substr(pos, new_pos - pos)); - pos = new_pos + 1; - } else { - files.emplace_back(combine_files.substr(pos)); - break; - } - } - if (files.size() < 2) { - 
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n"); + for (const auto & in_file : params.in_files) { + printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + if (!g_collector.load_imatrix(in_file.c_str())) { + fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); return 1; } - printf("Combining the following %d files\n", int(files.size())); - for (auto& file : files) { - printf(" %s\n", file.c_str()); - if (!g_collector.load_imatrix(file.c_str(), true)) { - fprintf(stderr, "Failed to load %s\n", file.c_str()); - return 1; - } - } + } + + if (params.in_files.size() > 1) { + printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); g_collector.save_imatrix(); - return 0; - } - - if (!prev_result_file.empty()) { - if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) { - fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str()); - return 1; - } } llama_backend_init(); @@ -650,6 +569,7 @@ int main(int argc, char ** argv) { // init llama_model * model; llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); @@ -668,8 +588,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } - bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); - if (!OK) { + if (!compute_imatrix(ctx, params)) { return 1; } diff --git a/examples/server/README.md b/examples/server/README.md index 0c3db8c84..ccbdcdbdb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -279,7 +279,7 @@ node index.js `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false` + `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. 
[See more](#change-system-prompt-on-runtime) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d581cad95..6ffaa8d9f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -647,6 +647,9 @@ struct server_context { server_metrics metrics; + // Necessary similarity of prompt for slot selection + float slot_prompt_similarity = 0.0f; + ~server_context() { if (ctx) { llama_free(ctx); @@ -795,24 +798,88 @@ struct server_context { return prompt_tokens; } - server_slot * get_slot(int id) { - int64_t t_last = ggml_time_us(); - - server_slot * last_used = nullptr; - + server_slot * get_slot_by_id(int id) { for (server_slot & slot : slots) { - if (slot.id == id && slot.available()) { + if (slot.id == id) { return &slot; } - - // among all available slots, find the one that has been least recently used - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } } - return last_used; + return nullptr; + } + + server_slot * get_available_slot(const std::string & prompt) { + server_slot * ret = nullptr; + + // find the slot that has at least n% prompt similarity + if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) { + int max_lcp_len = 0; + float similarity = 0; + + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (!slot.available()) { + continue; + } + + // skip the slot if it does not contains prompt + if (!slot.prompt.is_string()) { + continue; + } + + // current slot's prompt + std::string slot_prompt = slot.prompt.get(); + + // length of the current slot's prompt + int slot_prompt_len = slot_prompt.size(); + + // length of the Longest Common Prefix between the current slot's prompt and the input prompt + int lcp_len = common_part(slot_prompt, prompt); + + // fraction of the common substring length compared to the current slot's prompt length + similarity = static_cast(lcp_len) / slot_prompt_len; + + // select the current slot if the criteria match + if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { + max_lcp_len = lcp_len; + ret = &slot; + } + } + + if (ret != nullptr) { + LOG_VERBOSE("selected slot by lcp similarity", { + {"id_slot", ret->id}, + {"max_lcp_len", max_lcp_len}, + {"similarity", similarity}, + }); + } + } + + // find the slot that has been least recently used + if (ret == nullptr) { + int64_t t_last = ggml_time_us(); + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (!slot.available()) { + continue; + } + + // select the current slot if the criteria match + if (slot.t_last_used < t_last) { + t_last = slot.t_last_used; + ret = &slot; + } + } + + if (ret != nullptr) { + LOG_VERBOSE("selected slot by lru", { + {"id_slot", ret->id}, + {"t_last", t_last}, + }); + } + } + + return ret; } bool launch_slot_with_task(server_slot & slot, const server_task & task) { @@ -888,7 +955,7 @@ struct server_context { slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); // get prompt - { + if (!task.infill) { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST); @@ -1515,13 +1582,29 @@ struct server_context { switch (task.type) { case SERVER_TASK_TYPE_COMPLETION: { - server_slot * slot = get_slot(json_value(task.data, "id_slot", -1)); + int id_slot = json_value(task.data, "id_slot", -1); + std::string prompt = json_value(task.data, "prompt", std::string()); 
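The new `get_available_slot()` shown above prefers the slot whose cached prompt shares the longest common prefix with the incoming prompt, provided that prefix covers more than `--slot-prompt-similarity` of the slot's prompt, and otherwise falls back to the least recently used slot. A self-contained sketch of that policy (the `demo_slot` struct and the sample data are invented for the example):

```cpp
#include <climits>
#include <cstdio>
#include <string>
#include <vector>

struct demo_slot {
    int         id;
    bool        available;
    long long   t_last_used;  // e.g. timestamp of the last use
    std::string prompt;       // prompt currently cached in the slot
};

// Longest common prefix length of two strings, as used for the similarity check.
static size_t common_part(const std::string & a, const std::string & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}

static demo_slot * pick_slot(std::vector<demo_slot> & slots, const std::string & prompt, float slot_prompt_similarity) {
    demo_slot * ret = nullptr;

    // 1) best longest-common-prefix match above the similarity threshold
    size_t max_lcp_len = 0;
    for (auto & slot : slots) {
        if (!slot.available || slot.prompt.empty()) {
            continue;
        }
        const size_t lcp_len    = common_part(slot.prompt, prompt);
        const float  similarity = (float) lcp_len / slot.prompt.size();
        if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
            max_lcp_len = lcp_len;
            ret = &slot;
        }
    }

    // 2) otherwise fall back to the least recently used available slot
    if (ret == nullptr) {
        long long t_last = LLONG_MAX;
        for (auto & slot : slots) {
            if (slot.available && slot.t_last_used < t_last) {
                t_last = slot.t_last_used;
                ret = &slot;
            }
        }
    }

    return ret;
}

int main() {
    std::vector<demo_slot> slots = {
        {0, true, 100, "You are a helpful assistant. Tell me a joke."},
        {1, true, 200, "Translate the following text to French: hello"},
    };

    demo_slot * slot = pick_slot(slots, "Translate the following text to German: bye", 0.5f);
    std::printf("selected slot %d\n", slot ? slot->id : -1);
    return 0;
}
```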
+ + server_slot * slot; + + if (id_slot != -1) { + slot = get_slot_by_id(id_slot); + } else { + slot = get_available_slot(prompt); + } + if (slot == nullptr) { // if no slot is available, we defer this task for processing later LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); queue_tasks.defer(task); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } if (task.data.contains("system_prompt")) { std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); @@ -1638,11 +1721,17 @@ struct server_context { case SERVER_TASK_TYPE_SLOT_SAVE: { int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot(id_slot); + server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } const size_t token_count = slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); @@ -1673,11 +1762,17 @@ struct server_context { case SERVER_TASK_TYPE_SLOT_RESTORE: { int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot(id_slot); + server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } const int64_t t_start = ggml_time_us(); @@ -1715,11 +1810,17 @@ struct server_context { case SERVER_TASK_TYPE_SLOT_ERASE: { int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot(id_slot); + server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } // Erase token cache const size_t n_erased = slot->cache_tokens.size(); @@ -2360,7 +2461,7 @@ int main(int argc, char ** argv) { // TODO: not great to use extern vars server_log_json = params.log_json; - server_verbose = params.verbose; + server_verbose = params.verbosity > 0; // struct that contains llama context and inference server_context ctx_server; @@ -2467,6 +2568,9 @@ int main(int argc, char ** argv) { log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; } + // Necessary similarity of prompt for slot selection + ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; + // load the model if (!ctx_server.load_model(params)) { state.store(SERVER_STATE_ERROR); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index b7bfb41d3..63fde9c9f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -253,6 +253,13 @@ static size_t common_part(const std::vector & a, const std::vector< return i; } +static size_t common_part(const std::string & a, const std::string & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + 
static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 3ff76474d..0a645b2e1 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -9108,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const // find the sum of exps in the block tmp = warp_reduce_sum(tmp, item_ct1); if (block_size > WARP_SIZE) { + item_ct1.barrier(sycl::access::fence_space::local_space); if (warp_id == 0) { buf[lane_id] = 0.f; } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index e0c512c0d..128769177 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -345,15 +345,12 @@ struct vk_context { }; struct ggml_tensor_extra_gpu { - bool ready; - size_t ctx_idx; vk_buffer_ref buffer_gpu; uint64_t offset; void reset() { - ready = false; ctx_idx = 0; buffer_gpu.reset(); offset = 0; @@ -2949,7 +2946,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); vk_buffer d_X; @@ -2958,12 +2955,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset; + qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3114,7 +3111,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -3122,12 +3119,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context uint64_t y_buf_offset = 0; if(!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset; + qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3246,14 +3243,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset; + const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3323,14 +3320,14 @@ static void 
ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset;
+    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3459,7 +3456,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3467,17 +3464,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3636,7 +3633,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t d_sz = sizeof(float) * d_ne;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset;
+    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
@@ -3644,17 +3641,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
         d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset;
+        qx_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
         d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset;
+        qy_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
         d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset;
+        ids_buf_offset = extra_ids->offset + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3769,9 +3766,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
     const vk_buffer src_buf = extra_src0->buffer_gpu.lock();
-    const uint64_t src_offset = extra_src0->offset;
+    const uint64_t src_offset = extra_src0->offset + src0->view_offs;
     vk_buffer dst_buf = extra->buffer_gpu.lock();
-    const uint64_t dst_offset = extra->offset;
+    const uint64_t dst_offset = extra->offset + dst->view_offs;
 
     std::vector<vk::BufferCopy> copies;
 
@@ -4062,21 +4059,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY);  // NOLINT
 
     if(!src0_uma) {
         d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset;
+        x_buf_offset = extra_src0->offset + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
         d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset;
+        y_buf_offset = extra_src1->offset + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
         d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset;
+        z_buf_offset = extra_src2->offset + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4336,7 +4333,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -5569,6 +5566,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src2 = node->src[2];
 
     switch (node->op) {
+    // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor
+    case GGML_OP_RESHAPE:
+    case GGML_OP_VIEW:
+    case GGML_OP_PERMUTE:
+    case GGML_OP_TRANSPOSE:
+    case GGML_OP_NONE:
+        return;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5590,10 +5594,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_CPY:
     case GGML_OP_CONT:
     case GGML_OP_DUP:
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
     case GGML_OP_NORM:
     case GGML_OP_RMS_NORM:
     case GGML_OP_DIAG_MASK_INF:
@@ -5601,7 +5601,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_ROPE:
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-    case GGML_OP_NONE:
     case GGML_OP_ARGSORT:
     case GGML_OP_SUM_ROWS:
         break;
@@ -5654,12 +5653,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_DUP:
         ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node);
 
-        break;
-    case GGML_OP_RESHAPE:
-    case GGML_OP_VIEW:
-    case GGML_OP_PERMUTE:
-    case GGML_OP_TRANSPOSE:
-    case GGML_OP_NONE:
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
@@ -5712,7 +5705,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         return;
     }
 
-    extra->ready = true;
     extra->ctx_idx = ctx->compute_ctx->idx;
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5796,8 +5788,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     ggml_vk_check_results_0(ctx, params, tensor);
 #endif
 
-    GGML_ASSERT(extra->ready);
-
     vk_context& subctx = ctx->gc.contexts[extra->ctx_idx];
 
     // Only run if ctx hasn't been submitted yet
@@ -5822,8 +5812,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
         subctx.out_memcpys.clear();
     }
 
-    extra->ready = false;
-
     return true;
 }
 
@@ -5943,7 +5931,9 @@ struct ggml_backend_vk_buffer_context {
 
     ~ggml_backend_vk_buffer_context() {
         ggml_vk_destroy_buffer(dev_buffer);
-        delete[] temp_tensor_extras;
+        if (temp_tensor_extras != nullptr) {
+            delete[] temp_tensor_extras;
+        }
     }
 
     ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
@@ -5990,18 +5980,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
 #endif
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-    if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) {
+    if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra;
-        extra->buffer_gpu = extra_view->buffer_gpu;
-        extra->offset = extra_view->offset + tensor->view_offs;
+        GGML_ASSERT(tensor->view_src->extra != nullptr);
+        tensor->extra = tensor->view_src->extra;
     } else {
+        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
         extra->buffer_gpu = ctx->dev_buffer;
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+        tensor->extra = extra;
     }
-
-    tensor->extra = extra;
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -6014,7 +6002,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6027,7 +6015,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6038,7 +6026,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6264,7 +6252,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6284,7 +6272,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     vk_buffer buf = extra->buffer_gpu.lock();
 
-    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size);
+    ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
@@ -6305,7 +6293,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
         vk_buffer src_buf = src_extra->buffer_gpu.lock();
         vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
 
-        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6478,11 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             //     return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
            //     } break;
         case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-
-                return true;
-            } break;
+            return true;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -6725,7 +6709,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -6809,7 +6793,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -6851,7 +6835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -6909,7 +6893,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
             ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
             vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset;
+            uint64_t offset = extra->offset + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7092,11 +7076,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
         vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset);
+        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
         }
 
-        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size);
+        ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;
diff --git a/llama.cpp b/llama.cpp
index cefb4d1d5..8b675ea99 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13640,7 +13640,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t                chr) {
 
     bool found            = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
 
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
@@ -13649,6 +13653,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
@@ -13666,7 +13670,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8      partial_utf8) {
 
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
 
     uint32_t partial_value = partial_utf8.value;
@@ -13696,6 +13700,9 @@ static bool llama_grammar_match_partial_char(
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -13756,6 +13763,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
@@ -15229,6 +15237,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }
 
diff --git a/llama.h b/llama.h
index 9dcd67bef..62908261f 100644
--- a/llama.h
+++ b/llama.h
@@ -365,6 +365,9 @@ extern "C" {
         // modifies a preceding LLAMA_GRETYPE_CHAR or
         // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
         LLAMA_GRETYPE_CHAR_ALT       = 6,
+
+        // any character (.)
+        LLAMA_GRETYPE_CHAR_ANY       = 7,
     };
 
     typedef struct llama_grammar_element {
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 6aca32a39..2d78c61d3 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -285,6 +285,33 @@ static void test_complex_grammar() {
     );
 }
 
+static void test_special_chars() {
+    // A collection of tests to exercise special characters such as "."
+    test_grammar(
+        "special characters",
+        // Grammar
+        R"""(
+            root ::= ... "abc" ...
+            )""",
+        // Passing strings
+        {
+            "abcabcabc",
+            "aaaabcccc",
+            // NOTE: Also ensures that multi-byte characters still count as a single character
+            "🔵🟠✅abc❌🟠🔵"
+        },
+        // Failing strings
+        {
+            "aaabcccc",
+            "aaaaabcccc",
+            "aaaabccc",
+            "aaaabccccc",
+            "🔵🟠✅❌abc❌✅🟠🔵"
+            "🔵🟠abc🟠🔵"
+        }
+    );
+}
+
 static void test_quantifiers() {
     // A collection of tests to exercise * + and ? quantifiers
 
@@ -525,6 +552,7 @@ int main() {
     fprintf(stdout, "Running grammar integration tests...\n");
     test_simple_grammar();
     test_complex_grammar();
+    test_special_chars();
     test_quantifiers();
     test_failure_missing_root();
     test_failure_missing_reference();