diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dd0a7e3e5..1f275603a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,7 +47,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -105,7 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -222,7 +222,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF cmake --build . --config Release -j $(nproc) - name: Test @@ -801,6 +801,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build + $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name diff --git a/CMakeLists.txt b/CMakeLists.txt index 03dfce57c..9505fadd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,14 +79,21 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) -set(GGML_LLAMAFILE ON) -set(GGML_CUDA_USE_GRAPHS ON) + +# change the default for these ggml options +if (NOT DEFINED GGML_LLAMAFILE) + set(GGML_LLAMAFILE ON) +endif() + +if (NOT DEFINED GGML_CUDA_USE_GRAPHS) + set(GGML_CUDA_USE_GRAPHS ON) +endif() # transition helpers function (llama_option_depr TYPE OLD NEW) if (${OLD}) message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") - set(${NEW} ON) + set(${NEW} ON PARENT_SCOPE) endif() endfunction() @@ -96,7 +103,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) llama_option_depr(WARNING LLAMA_METAL GGML_METAL) llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) -llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) diff --git a/Makefile b/Makefile index a0b69392a..987352816 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,7 @@ BUILD_TARGETS = \ TEST_TARGETS = \ tests/test-autorelease \ tests/test-backend-ops \ + tests/test-chat-template \ tests/test-double-float \ tests/test-grad0 \ tests/test-grammar-integration \ @@ -1092,6 +1093,7 @@ clean: rm -rvf src/*.o rm -rvf tests/*.o rm -rvf examples/*.o + rm -rvf common/*.o rm -rvf *.a rm -rvf *.dll rm -rvf *.so diff --git a/common/common.cpp b/common/common.cpp index c76d0e2c3..6a00d25be 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1026,6 +1026,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.input_suffix = argv[i]; return true; } + if (arg == 
"--spm-infill") { + params.spm_infill = true; + return true; + } if (arg == "--grammar") { CHECK_ARG sparams.grammar = argv[i]; @@ -1409,6 +1413,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "server infill", + " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); options.push_back({ "sampling" }); options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" @@ -2618,6 +2624,7 @@ std::string llama_chat_apply_template(const struct llama_model * model, const std::vector & msgs, bool add_ass) { int alloc_size = 0; + bool fallback = false; // indicate if we must fallback to default chatml std::vector chat; for (auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); @@ -2630,10 +2637,26 @@ std::string llama_chat_apply_template(const struct llama_model * model, // run the first time to get the total output length int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + // error: chat template is not supported + if (res < 0) { + if (ptr_tmpl != nullptr) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported"); + } else { + // If the built-in template is not supported, we default to chatml + res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + fallback = true; + } + } + // if it turns out that our buffer is too small, we resize it if ((size_t) res > buf.size()) { buf.resize(res); - res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = llama_chat_apply_template( + fallback ? nullptr : model, + fallback ? 
"chatml" : ptr_tmpl, + chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } std::string formatted_chat(buf.data(), res); @@ -2804,125 +2827,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) // static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { - int32_t n_tensors; - - size_t n_bytes = 0; - - uint32_t max_direction_layer = 0; - llama_control_vector_data result = { -1, {} }; - // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer - { - struct ggml_init_params meta_params = { - /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ true, - }; - ggml_context * meta_ctx = ggml_init(meta_params); - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ true, - /* .ctx = */ &meta_ctx, - }; - struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); - if (!meta_ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - return result; - } - - n_tensors = gguf_get_n_tensors(meta_ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); - - // split on '.' - size_t dotpos = name.find('.'); - if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { - try { - uint32_t layer = std::stoi(name.substr(dotpos + 1)); - if (layer == 0) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - if (layer > max_direction_layer) { - max_direction_layer = layer; - } - } catch (...) 
{ - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - } - - struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); - if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - if (result.n_embd == -1) { - result.n_embd = ggml_nelements(tensor_meta); - } else if (ggml_nelements(tensor_meta) != result.n_embd) { - fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - n_bytes += ggml_nbytes(tensor_meta); - } - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); + return result; } + int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); - return result; } - // load and scale tensors into final control vector context - struct ggml_init_params ggml_params = { - /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ false, - }; - struct ggml_context * ctx = ggml_init(ggml_params); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(ctx_gguf, i); - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); - if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); - ggml_free(ctx); - return result; - } + int layer_idx = -1; - // do not store data for layer 0 (it's not used) - result.data.resize(result.n_embd * max_direction_layer); - - for (uint32_t il = 1; il <= max_direction_layer; il++) { - const std::string name = "direction." + std::to_string(il); - const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); - - float * dst = result.data.data() + result.n_embd * (il - 1); - - if (tensor) { - const float * src = (const float *) tensor->data; - for (int j = 0; j < result.n_embd; j++) { - dst[j] = src[j] * load_info.strength; - } - } else { - for (int j = 0; j < result.n_embd; j++) { - dst[j] = 0.0f; + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + layer_idx = std::stoi(name.substr(dotpos + 1)); + } catch (...) 
{ + layer_idx = -1; } } + if (layer_idx < 0) { + fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } else if (layer_idx == 0) { + fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + + struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + if (tensor->type != GGML_TYPE_F32) { + fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + if (ggml_n_dims(tensor) != 1) { + fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor); + } else if (ggml_nelements(tensor) != result.n_embd) { + fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + + // extend if necessary - do not store data for layer 0 (it's not used) + result.data.resize(std::max(result.data.size(), static_cast(result.n_embd * layer_idx)), 0.0f); + + const float * src = (const float *) tensor->data; + float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0] + for (int j = 0; j < result.n_embd; j++) { + dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file + } + } + if (result.n_embd == -1) { + fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + result.data.clear(); + } + + gguf_free(ctx_gguf); + ggml_free(ctx); + return result; } @@ -2933,16 +2918,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector & chat, diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 2f233e2e7..881eb49e3 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -316,7 +316,7 @@ std::unordered_map GRAMMAR_LITERAL_ESCAPES = { }; std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; -std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; template std::string join(Iterator begin, Iterator end, const std::string & separator) { @@ -720,7 +720,7 @@ private: } prop_names.push_back(prop_name); } - if (!(additional_properties.is_boolean() && !additional_properties.get())) { + if ((additional_properties.is_boolean() && additional_properties.get()) || additional_properties.is_object()) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string value_rule = additional_properties.is_object() ? 
visit(additional_properties, sub_name + "-value") diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 67598b561..2758214fa 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -85,6 +85,7 @@ models = [ {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, + {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index c26fad930..5bcc849db 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -487,6 +487,9 @@ class Model: if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" + if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": + # ref: https://huggingface.co/LumiOpen/Viking-7B + res = "viking" if res is None: logger.warning("\n") @@ -2337,6 +2340,46 @@ class GemmaModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma2ForCausalLM") +class Gemma2Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unusem + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/examples/infill/README.md b/examples/infill/README.md index 74f42d2fc..810a0c5e7 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. ## Input Prompts diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 3e82e4a81..ca71dd687 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -210,6 +210,7 @@ int main(int argc, char ** argv) { suff_rm_leading_spc = false; } std::vector embd_inp; + std::vector embd_end; std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); const int space_token = 29871; @@ -217,12 +218,13 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); - } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? inp_pfx : inp_sfx; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { @@ -526,14 +528,14 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); - } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? 
inp_pfx : inp_sfx; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 92f6e3d47..072a230f7 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]') GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'} NON_LITERAL_SET = set('|.()[]{}*+?') -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?') class SchemaConverter: @@ -602,7 +602,7 @@ class SchemaConverter: else: add_component(t, is_required=True) - return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[])) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): items = schema.get('items') or schema['prefixItems'] @@ -691,7 +691,7 @@ class SchemaConverter: required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties != False: + if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ self._add_primitive('value', PRIMITIVE_RULES['value']) diff --git a/examples/llama.android/llama/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt deleted file mode 100644 index a5618cac0..000000000 --- a/examples/llama.android/llama/CMakeLists.txt +++ /dev/null @@ -1,55 +0,0 @@ - -# For more information about using CMake with Android Studio, read the -# documentation: https://d.android.com/studio/projects/add-native-code.html. -# For more examples on how to use CMake, see https://github.com/android/ndk-samples. - -# Sets the minimum CMake version required for this project. -cmake_minimum_required(VERSION 3.22.1) - -# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, -# Since this is the top level CMakeLists.txt, the project name is also accessible -# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level -# build script scope). -project("llama-android") - -## Fetch latest llama.cpp from GitHub -#include(FetchContent) -#FetchContent_Declare( -# llama -# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp -# GIT_TAG master -#) -# -## Also provides "common" -#FetchContent_MakeAvailable(llama) - -# llama.cpp CI uses the code from the current branch -# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700 -add_subdirectory(../../../../../../ build-llama) - -# Creates and names a library, sets it as either STATIC -# or SHARED, and provides the relative paths to its source code. -# You can define multiple libraries, and CMake builds them for you. -# Gradle automatically packages shared libraries with your APK. 
-# -# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define -# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} -# is preferred for the same purpose. -# -# In order to load a library into your app from Java/Kotlin, you must call -# System.loadLibrary() and pass the name of the library defined here; -# for GameActivity/NativeActivity derived applications, the same library name must be -# used in the AndroidManifest.xml file. -add_library(${CMAKE_PROJECT_NAME} SHARED - # List C/C++ source files with relative paths to this CMakeLists.txt. - llama-android.cpp) - -# Specifies libraries CMake should link to your target library. You -# can link libraries from various origins, such as libraries defined in this -# build script, prebuilt third-party libraries, or Android system libraries. -target_link_libraries(${CMAKE_PROJECT_NAME} - # List libraries link to the target library - llama - common - android - log) diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt index 42ebaad49..2de496574 100644 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1) # build script scope). project("llama-android") -include(FetchContent) -FetchContent_Declare( - llama - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp - GIT_TAG master -) +#include(FetchContent) +#FetchContent_Declare( +# llama +# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp +# GIT_TAG master +#) # Also provides "common" -FetchContent_MakeAvailable(llama) +#FetchContent_MakeAvailable(llama) # Creates and names a library, sets it as either STATIC # or SHARED, and provides the relative paths to its source code. @@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama) # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} # is preferred for the same purpose. # + +#load local llama.cpp +add_subdirectory(../../../../../../ build-llama) + # In order to load a library into your app from Java/Kotlin, you must call # System.loadLibrary() and pass the name of the library defined here; # for GameActivity/NativeActivity derived applications, the same library name must be diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 874158ef0..92a6b16b1 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -5,7 +5,7 @@ #include #include #include "llama.h" -#include "common/common.h" +#include "common.h" // Write C++ code here. 
// diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 95fbe3d02..d6882eec3 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (n < 32) hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { hparams.image_grid_pinpoints[0]=0; } try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { strcpy(hparams.mm_patch_merge_type, "flat"); } try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { hparams.image_crop_resolution = hparams.image_size; } @@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { try { vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); new_clip->has_class_embedding = true; - } catch (const std::exception& e) { + } catch (const std::exception& /*e*/) { new_clip->has_class_embedding = false; } @@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); new_clip->has_pre_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_pre_norm = false; } @@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); new_clip->has_post_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_post_norm = false; } try { vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); new_clip->has_patch_bias = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_patch_bias = false; } try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { LOG_TEE("%s: failed to load vision model tensors\n", __func__); } @@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // Yi-type llava vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // missing in Yi-type llava vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); - } catch (std::runtime_error & e) { } + } 
catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cfaf6a6e8..1114073b8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -810,7 +810,7 @@ int main(int argc, char ** argv) { is_antiprompt = true; } - chat_add_and_format(model, chat_msgs, "system", assistant_ss.str()); + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); is_interacting = true; printf("\n"); } diff --git a/examples/server/README.md b/examples/server/README.md index e7fb0bf64..4fab006bb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co - `-fa`, `--flash-attn` : enable flash attention (default: disabled). - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) +- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. **If compiled with `LLAMA_SERVER_SSL=ON`** - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 06d76edde..7267f3f9c 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g; const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' }; const NON_LITERAL_SET = new Set('|.()[]{}*+?'); -const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?'); +const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?'); export class SchemaConverter { constructor(options) { @@ -751,7 +751,7 @@ export class SchemaConverter { const requiredProps = sortedProps.filter(k => required.has(k)); const optionalProps = sortedProps.filter(k => !required.has(k)); - if (additionalProperties !== false) { + if (additionalProperties) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; const valueRule = additionalProperties != null && typeof additionalProperties === 'object' ? 
this.visit(additionalProperties, `${subName}-value`) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ae768097b..d7fb61812 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2020,6 +2020,7 @@ struct server_context { slot.t_start_generation = 0; if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); @@ -2035,16 +2036,21 @@ struct server_context { } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { - prefix_tokens.push_back(middle_token); + embd_inp.push_back(middle_token); } - prompt_tokens = prefix_tokens; + prompt_tokens = embd_inp; } else { prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt } diff --git a/flake.lock b/flake.lock index 5278fb68a..79bb3f63f 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1718318537, - "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=", + "lastModified": 1718895438, + "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420", + "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3", "type": "github" }, "original": { diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 31fcbf139..1396e7a75 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const dim3 block_nums_mmq(nsm, 1, 1); - ggml_cuda_pool & pool = ctx.pool(); + ggml_cuda_pool & pool = ctx.pool(id); ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); if (args.ne01 % mmq_y == 0) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 222a2d137..cf3d09e70 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -150,6 +150,7 @@ class MODEL_ARCH(IntEnum): INTERNLM2 = auto() MINICPM = auto() GEMMA = auto() + GEMMA2 = auto() STARCODER2 = auto() MAMBA = auto() XVERSE = auto() @@ -180,10 +181,13 @@ class MODEL_TENSOR(IntEnum): ATTN_NORM = auto() ATTN_NORM_2 = auto() ATTN_OUT_NORM = auto() + ATTN_POST_NORM = auto() ATTN_ROT_EMBD = auto() FFN_GATE_INP = auto() FFN_GATE_INP_SHEXP = auto() FFN_NORM = auto() + FFN_PRE_NORM = auto() + FFN_POST_NORM = auto() FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() @@ -270,6 +274,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: 
"xverse", @@ -303,9 +308,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", @@ -751,6 +759,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, ], + MODEL_ARCH.GEMMA2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 7b047f241..0bed43939 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -187,6 +187,10 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), + MODEL_TENSOR.ATTN_POST_NORM: ( + "model.layers.{bid}.post_attention_layernorm", # gemma2 + ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf @@ -210,6 +214,16 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), + # Post feed-forward norm + MODEL_TENSOR.FFN_PRE_NORM: ( + "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + ), + + # Post feed-forward norm + MODEL_TENSOR.FFN_POST_NORM: ( + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 + ), + MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral diff --git a/grammars/README.md b/grammars/README.md index 2f685eb6d..886023f77 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -126,19 +126,257 @@ You can use GBNF grammars: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI) -Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). +Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). 
-Here is also a non-exhaustive list of **unsupported** features: +```bash +llama-cli \ + -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \ + -hff Phi-3-medium-128k-instruct-Q8_0.gguf \ + -j '{ + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1, + "maxLength": 100 + }, + "age": { + "type": "integer", + "minimum": 0, + "maximum": 150 + } + }, + "required": ["name", "age"], + "additionalProperties": false + }, + "minItems": 10, + "maxItems": 100 + }' \ + -p 'Generate a {name, age}[] JSON array with famous actors of all ages.' +``` -- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840 -- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum` - - `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797 -- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs) -- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) -- `string` formats `uri`, `email` +
+ +Show grammar + +You can convert any schema in command-line with: + +```bash +examples/json_schema_to_grammar.py name-age-schema.json +``` + +``` +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +item ::= "{" space item-name-kv "," space item-age-kv "}" space +item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space +item-age-kv ::= "\"age\"" space ":" space item-age +item-name ::= "\"" char{1,100} "\"" space +item-name-kv ::= "\"name\"" space ":" space item-name +root ::= "[" space item ("," space item){9,99} "]" space +space ::= | " " | "\n" [ \t]{0,20} +``` + +
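As a small illustration of the conversion step above (a sketch, not part of the patch: it assumes the working directory is the llama.cpp repository root and that writing `name-age-schema.json` there is acceptable), the following Python snippet writes the schema from the `llama-cli` example to disk and invokes the converter script exactly as shown:

```python
# Sketch only: save the name/age schema from the llama-cli example above and
# run the converter on it. Assumes it runs from the llama.cpp repository root.
import json
import subprocess
import sys

schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "name": {"type": "string", "minLength": 1, "maxLength": 100},
            "age": {"type": "integer", "minimum": 0, "maximum": 150},
        },
        "required": ["name", "age"],
        "additionalProperties": False,
    },
    "minItems": 10,
    "maxItems": 100,
}

with open("name-age-schema.json", "w") as f:
    json.dump(schema, f, indent=2)

# Prints the generated GBNF grammar (the listing above) to stdout.
subprocess.run(
    [sys.executable, "examples/json_schema_to_grammar.py", "name-age-schema.json"],
    check=True,
)
```

The printed grammar should match the listing shown above.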
+ +Here is also a list of known limitations (contributions welcome): + +- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations). +- `"additionalProperties": true` may produce keys that contain unescaped newlines. +- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). +- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) +- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) +- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number` +- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073) +- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$` +- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs) +- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email` +- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties) + +And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars): + +- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems) - [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains` -- `uniqueItems` - `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing)) - [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not) - [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas` -- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties) + +### A word about additionalProperties + +> [!WARNING] +> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default. +> Since this is slow and seems prone to hallucinations, we default to no additional properties. +> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties. + +If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class: + +```python +# pip install pydantic +import json +from typing import Annotated, List +from pydantic import BaseModel, Extra, Field +class QAPair(BaseModel): + class Config: + extra = 'allow' # triggers additionalProperties: true in the JSON schema + question: str + concise_answer: str + justification: str + +class Summary(BaseModel): + class Config: + extra = 'allow' + key_facts: List[Annotated[str, Field(pattern='- .{5,}')]] + question_answers: List[Annotated[List[QAPair], Field(min_items=5)]] + +print(json.dumps(Summary.model_json_schema(), indent=2)) +``` + +
+Show JSON schema & grammar + +```json +{ + "$defs": { + "QAPair": { + "additionalProperties": true, + "properties": { + "question": { + "title": "Question", + "type": "string" + }, + "concise_answer": { + "title": "Concise Answer", + "type": "string" + }, + "justification": { + "title": "Justification", + "type": "string" + } + }, + "required": [ + "question", + "concise_answer", + "justification" + ], + "title": "QAPair", + "type": "object" + } + }, + "additionalProperties": true, + "properties": { + "key_facts": { + "items": { + "pattern": "^- .{5,}$", + "type": "string" + }, + "title": "Key Facts", + "type": "array" + }, + "question_answers": { + "items": { + "items": { + "$ref": "#/$defs/QAPair" + }, + "minItems": 5, + "type": "array" + }, + "title": "Question Answers", + "type": "array" + } + }, + "required": [ + "key_facts", + "question_answers" + ], + "title": "Summary", + "type": "object" +} +``` + +``` +QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space +QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space +QAPair-additional-kv ::= QAPair-additional-k ":" space value +QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string +QAPair-justification-kv ::= "\"justification\"" space ":" space string +QAPair-question-kv ::= "\"question\"" space ":" space string +additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space +additional-kv ::= additional-k ":" space value +array ::= "[" space ( value ("," space value)* )? "]" space +boolean ::= ("true" | "false") space +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} +dot ::= [^\x0A\x0D] +integral-part ::= [0] | [1-9] [0-9]{0,15} +key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space +key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space +key-facts-item-1 ::= dot +key-facts-kv ::= "\"key_facts\"" space ":" space key-facts +null ::= "null" space +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space +question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? 
"]" space +question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space +question-answers-item-item ::= QAPair +question-answers-kv ::= "\"question_answers\"" space ":" space question-answers +root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space +space ::= | " " | "\n" [ \t]{0,20} +string ::= "\"" char* "\"" space +value ::= object | array | string | number | boolean | null +``` + +
+ +If you're using [Zod](https://zod.dev/), you can make your objects to explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway. + +```js +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +const Foo = z.object({ + age: z.number().positive(), + email: z.string().email(), +}).strict(); + +console.log(zodToJsonSchema(Foo)); +``` + +
+Show JSON schema & grammar + +```json +{ + "type": "object", + "properties": { + "age": { + "type": "number", + "exclusiveMinimum": 0 + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "age", + "email" + ], + "additionalProperties": false, + "$schema": "http://json-schema.org/draft-07/schema#" +} +``` + +``` +age-kv ::= "\"age\"" space ":" space number +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} +email-kv ::= "\"email\"" space ":" space string +integral-part ::= [0] | [1-9] [0-9]{0,15} +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +root ::= "{" space age-kv "," space email-kv "}" space +space ::= | " " | "\n" [ \t]{0,20} +string ::= "\"" char* "\"" space +``` + +
diff --git a/include/llama.h b/include/llama.h
index 88eecb0ed..cafeafb85 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -88,6 +88,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX   = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG  = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO   = 15,
+        LLAMA_VOCAB_PRE_TYPE_VIKING = 16,
     };

     // note: these values should be synchronized with ggml_rope
diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 9e654180b..b05a33747 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -136,42 +136,41 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # LICENSE                -> LICENSE
     # scripts/gen-authors.sh -> scripts/gen-authors.sh

-    cat ggml-src.patch | sed \
-        -e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \
-        -e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \
-        -e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \
-        -e 's/src\/ggml\.c/ggml/src/ggml.c/g' \
-        -e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \
-        -e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \
-        -e 's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \
-        -e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \
-        -e 's/src\/ggml-cuda\//ggml-cuda\//g' \
-        -e 's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \
-        -e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \
-        -e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \
-        -e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \
-        -e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \
-        -e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \
-        -e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \
-        -e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \
-        -e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \
-        -e 's/include\/ggml\.h/ggml/include/ggml.h/g' \
-        -e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \
-        -e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \
-        -e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \
-        -e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \
-        -e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \
-        -e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \
-        -e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \
-        -e 's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \
-        -e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \
-        -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \
-        -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \
-        -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \
-        -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \
-        -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \
-        -e 's/LICENSE/LICENSE/g' \
-        -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
+    cat ggml-src.patch | sed -E \
+        -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
+        -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \
+        -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \
+        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
+        -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \
+        -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
diff --git a/src/llama.cpp b/src/llama.cpp
index f78594a6f..3edaa98e8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -257,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2,  "internlm2"  },
     { LLM_ARCH_MINICPM,    "minicpm"    },
     { LLM_ARCH_GEMMA,      "gemma"      },
+    { LLM_ARCH_GEMMA2,     "gemma2"     },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA,      "mamba"      },
     { LLM_ARCH_XVERSE,     "xverse"     },
@@ -478,10 +480,12 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
@@ -1004,6 +1008,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -2038,6 +2060,9 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_57B_A14B,
+    MODEL_9B,
+    MODEL_27B,
 };

 static const size_t kiB = 1024;
@@ -2214,6 +2239,7 @@ struct llama_layer {
     struct ggml_tensor * attn_q_a_norm;
     struct ggml_tensor * attn_kv_a_norm;
     struct ggml_tensor * attn_sub_norm;
+    struct ggml_tensor * attn_post_norm;
     struct ggml_tensor * ffn_sub_norm;

     // attention
@@ -2237,6 +2263,7 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * ffn_post_norm;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
@@ -4267,6 +4294,9 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B:      return "57B.A14B";
+        case MODEL_9B:            return "9B";
+        case MODEL_27B:           return "27B";
         default:                  return "?B";
     }
 }
@@ -4588,6 +4618,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             switch (hparams.n_layer) {
                 case 24: model.type = e_model::MODEL_A2_7B; break;
+                case 28: model.type = e_model::MODEL_57B_A14B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -4668,6 +4699,16 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 42: model.type = e_model::MODEL_9B; break;
+                    case 46: model.type = e_model::MODEL_27B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -5067,6 +5108,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "poro-chat") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5152,10 +5196,10 @@ static void llm_load_vocab(
         if (gen_name.find("code") != std::string::npos) {
             if (model.arch == LLM_ARCH_LLAMA
               && 32010 < vocab.id_to_token.size()
-              && vocab.id_to_token[32007].text == "▁<PRE>"
-              && vocab.id_to_token[32008].text == "▁<SUF>"
-              && vocab.id_to_token[32009].text == "▁<MID>"
-              && vocab.id_to_token[32010].text == "▁<EOT>") {
+              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;
@@ -6506,6 +6550,40 @@ static bool llm_load_tensors(
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                     }
                 } break;
+            case LLM_ARCH_GEMMA2:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                    const int64_t n_ff          = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+                        layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_STARCODER2:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10917,6 +10995,125 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_gemma2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head,    n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -12297,6 +12494,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma();
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                result = llm.build_gemma2();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -13703,6 +13904,12 @@ struct llm_tokenizer_bpe {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -17585,6 +17792,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -19405,7 +19613,10 @@ static int32_t llama_chat_apply_template_internal(
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
+    auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
+        return tmpl.find(haystack) != std::string::npos;
+    };
+    if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -19413,16 +19624,16 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
+        bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
         // [variant] space before + after response
-        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        bool space_around_response = tmpl_contains("' ' + eos_token");
         // [variant] add BOS inside history
-        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
         // [variant] trim spaces from the input message
-        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        bool strip_message = tmpl_contains("content.strip()");
         // construct the prompt
         bool is_inside_turn = true; // skip BOS at the beginning
         ss << "[INST] ";
@@ -19448,7 +19659,7 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+    } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
         // Phi 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -19457,7 +19668,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
+    } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -19465,7 +19676,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
+    } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -19474,7 +19685,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl.find("") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("")) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -19496,7 +19707,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "model\n";
         }
-    } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+    } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
         // OrionStarAI/Orion-14B-Chat
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -19516,7 +19727,7 @@ static int32_t llama_chat_apply_template_internal(
                 ss << message->content << "</s>";
             }
         }
-    } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+    } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
         // openchat/openchat-3.5-0106,
         for (auto message : chat) {
             std::string role(message->role);
@@ -19530,13 +19741,13 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "GPT4 Correct Assistant:";
         }
-    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
         // eachadea/vicuna-13b-1.1 (and Orca variant)
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "system") {
                 // Orca-Vicuna variant uses a system prefix
-                if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+                if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
                     ss << "SYSTEM: " << message->content << "\n";
                 } else {
                     ss << message->content << "\n\n";
@@ -19550,7 +19761,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "ASSISTANT:";
         }
-    } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+    } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
         // deepseek-ai/deepseek-coder-33b-instruct
         for (auto message : chat) {
             std::string role(message->role);
@@ -19565,7 +19776,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
-    } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+    } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
         // CohereForAI/c4ai-command-r-plus
         for (auto message : chat) {
             std::string role(message->role);
@@ -19580,7 +19791,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
-    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+    } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
         // Llama 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -19589,6 +19800,33 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
+    } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << u8"<用户>";
+                ss << trim(message->content);
+                ss << "<AI>";
+            } else {
+                ss << trim(message->content);
+            }
+        }
+    } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+        // DeepSeek-V2
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
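
The template handling above is reached through the public llama_chat_apply_template() API. As a minimal sketch (not part of the patch), the snippet below drives one of the newly recognized template names the same way tests/test-chat-template.cpp does, i.e. with a nullptr model and an explicit template string; the helper name, the message list, and the initial buffer size are illustrative assumptions only.

// Illustrative sketch: format a short conversation with the "gemma2" template.
#include <string>
#include <vector>
#include "llama.h"

static std::string format_with_gemma2_template() {
    const llama_chat_message msgs[] = {
        { "user",      "Hello"        },
        { "assistant", "Hi there"     },
        { "user",      "Who are you?" },
    };
    const size_t n_msg = sizeof(msgs) / sizeof(msgs[0]);

    std::vector<char> buf(1024); // illustrative initial size
    int32_t n = llama_chat_apply_template(nullptr, "gemma2", msgs, n_msg, /*add_ass=*/true,
                                          buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return ""; // unsupported template, i.e. the "return -1" path above
    }
    if (n > (int32_t) buf.size()) {
        // the call reports the required length; grow the buffer and format again
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "gemma2", msgs, n_msg, true,
                                      buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n);
}
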
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index d19ba8633..b154038b2 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -57,7 +57,11 @@ int main(void) {
         //Phi-3-medium
         "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
         //Phi-3-vision
-        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
+        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
+        // DeepSeek-V2
+        "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -94,6 +98,10 @@ int main(void) {
         "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
         //Phi-3-vision
         "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        u8"You are a helpful assistant<用户>HelloHi there<用户>Who are youI am an assistant<用户>Another question",
+        // DeepSeek-V2
+        u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant:    I am an assistant   <|end▁of▁sentence|>User: Another question\n\nAssistant:",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp
index 0e21dc795..975658f79 100644
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -993,6 +993,40 @@ static void test_json_schema() {
         }
     );
 
+    test_schema(
+        "simple pattern",
+        // Schema
+        R"""({
+            "pattern": "^[a-zA-Z0-9_-]*$"
+        })""",
+        // Passing strings
+        {
+            R"""("")""",
+            R"""("He_llo-12")""",
+        },
+        // Failing strings
+        {
+            R"""("!")""",
+            R"""("Hello World")""",
+        }
+    );
+
+    test_schema(
+        "pattern with escapes",
+        // Schema
+        R"""({
+            "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
+        })""",
+        // Passing strings
+        {
+            R"""("a^$.[]()|{}*+?b")""",
+        },
+        // Failing strings
+        {
+            R"""("ab")""",
+        }
+    );
+
     test_schema(
         "",
         // Schema
@@ -1062,8 +1096,6 @@ static void test_json_schema() {
             R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
             // "By extension, even an empty object is valid"
             R"""({})""",
-            // "By default, providing additional properties is valid"
-            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
             R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
         },
         // Failing strings
@@ -1074,6 +1106,9 @@ static void test_json_schema() {
             R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
             // Reorder properties
             R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+            // "Additional properties default to false for generation, even though the spec says true.
+            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
+
         }
     );
 
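The two "pattern" schemas added above encode plain regular expressions, so their passing and failing strings can be sanity-checked outside the grammar machinery. The standalone sketch below does that with std::regex; it does not go through json_schema_to_grammar or the test_schema() harness and only restates the expectations on the raw string contents.

// Sanity check of the new "pattern" test strings, independent of the grammar code.
#include <cassert>
#include <regex>

int main() {
    // "simple pattern": ^[a-zA-Z0-9_-]*$
    const std::regex simple("^[a-zA-Z0-9_-]*$");
    assert( std::regex_match("",            simple)); // passing
    assert( std::regex_match("He_llo-12",   simple)); // passing
    assert(!std::regex_match("!",           simple)); // failing
    assert(!std::regex_match("Hello World", simple)); // failing

    // "pattern with escapes": ^a\^\$\.\[\]\(\)\|\{\}\*\+\?b$
    const std::regex escaped("^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$");
    assert( std::regex_match("a^$.[]()|{}*+?b", escaped)); // passing
    assert(!std::regex_match("ab",              escaped)); // failing
    return 0;
}
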
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 3aaa11833..65486ac5c 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
-    if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
-        test_all("Python", [](const TestCase & tc) {
-            write("test-json-schema-input.tmp", tc.schema);
-            tc.verify_status(std::system(
-                "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
-            tc.verify(read("test-grammar-output.tmp"));
-        });
+    if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
+        fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
     } else {
-        fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
-    }
+        if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
+            test_all("Python", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
+        }
 
-    if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
-        test_all("JavaScript", [](const TestCase & tc) {
-            write("test-json-schema-input.tmp", tc.schema);
-            tc.verify_status(std::system(
-                "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
-            tc.verify(read("test-grammar-output.tmp"));
-        });
-    } else {
-        fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
+        if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
+            test_all("JavaScript", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
+        }
     }
 
     test_all("Check Expectations Validity", [](const TestCase & tc) {