diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 208515287..adf67cecc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -47,7 +47,7 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -105,7 +105,7 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON + cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -222,7 +222,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF cmake --build . 
--config Release -j $(nproc) - name: Test diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a7197282..dba083089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS ON) function (llama_option_depr TYPE OLD NEW) if (${OLD}) message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n") - set(${NEW} ON) + set(${NEW} ON PARENT_SCOPE) endif() endfunction() @@ -96,7 +96,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) llama_option_depr(WARNING LLAMA_METAL GGML_METAL) llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) -llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) diff --git a/Makefile b/Makefile index bbfe0f12b..8ae4f1dc4 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,7 @@ BUILD_TARGETS = \ TEST_TARGETS = \ tests/test-autorelease \ tests/test-backend-ops \ + tests/test-chat-template \ tests/test-double-float \ tests/test-grad0 \ tests/test-grammar-integration \ @@ -1070,6 +1071,7 @@ clean: rm -rvf src/*.o rm -rvf tests/*.o rm -rvf examples/*.o + rm -rvf common/*.o rm -rvf *.a rm -rvf *.dll rm -rvf *.so diff --git a/common/common.cpp b/common/common.cpp index c76d0e2c3..57d03a578 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model, const std::vector & msgs, bool add_ass) { int alloc_size = 0; + bool fallback = false; // indicate if we must fallback to default chatml std::vector chat; for (auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); @@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model, // run the first time to get the total output length int32_t res = llama_chat_apply_template(model, 
ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + // error: chat template is not supported + if (res < 0) { + if (ptr_tmpl != nullptr) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported"); + } else { + // If the built-in template is not supported, we default to chatml + res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + fallback = true; + } + } + // if it turns out that our buffer is too small, we resize it if ((size_t) res > buf.size()) { buf.resize(res); - res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + res = llama_chat_apply_template( + fallback ? nullptr : model, + fallback ? "chatml" : ptr_tmpl, + chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } std::string formatted_chat(buf.data(), res); @@ -2804,125 +2821,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) // static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { - int32_t n_tensors; - - size_t n_bytes = 0; - - uint32_t max_direction_layer = 0; - llama_control_vector_data result = { -1, {} }; - // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer - { - struct ggml_init_params meta_params = { - /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ true, - }; - ggml_context * meta_ctx = ggml_init(meta_params); - struct gguf_init_params meta_gguf_params = { - /* .no_alloc = */ true, - /* .ctx = */ &meta_ctx, - }; - struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); - if (!meta_ctx_gguf) { - 
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - return result; - } - - n_tensors = gguf_get_n_tensors(meta_ctx_gguf); - for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); - - // split on '.' - size_t dotpos = name.find('.'); - if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { - try { - uint32_t layer = std::stoi(name.substr(dotpos + 1)); - if (layer == 0) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - if (layer > max_direction_layer) { - max_direction_layer = layer; - } - } catch (...) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - } - - struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); - if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { - fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - if (result.n_embd == -1) { - result.n_embd = ggml_nelements(tensor_meta); - } else if (ggml_nelements(tensor_meta) != result.n_embd) { - fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); - return result; - } - n_bytes += ggml_nbytes(tensor_meta); - } - ggml_free(meta_ctx); - gguf_free(meta_ctx_gguf); + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, 
load_info.fname.c_str()); + return result; } + int32_t n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); - return result; } - // load and scale tensors into final control vector context - struct ggml_init_params ggml_params = { - /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ false, - }; - struct ggml_context * ctx = ggml_init(ggml_params); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(ctx_gguf, i); - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ &ctx, - }; - struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); - if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); - ggml_free(ctx); - return result; - } + int layer_idx = -1; - // do not store data for layer 0 (it's not used) - result.data.resize(result.n_embd * max_direction_layer); - - for (uint32_t il = 1; il <= max_direction_layer; il++) { - const std::string name = "direction." + std::to_string(il); - const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); - - float * dst = result.data.data() + result.n_embd * (il - 1); - - if (tensor) { - const float * src = (const float *) tensor->data; - for (int j = 0; j < result.n_embd; j++) { - dst[j] = src[j] * load_info.strength; - } - } else { - for (int j = 0; j < result.n_embd; j++) { - dst[j] = 0.0f; + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + layer_idx = std::stoi(name.substr(dotpos + 1)); + } catch (...) 
{ + layer_idx = -1; } } + if (layer_idx < 0) { + fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } else if (layer_idx == 0) { + fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + + struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + if (tensor->type != GGML_TYPE_F32) { + fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + if (ggml_n_dims(tensor) != 1) { + fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor); + } else if (ggml_nelements(tensor) != result.n_embd) { + fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + result.n_embd = -1; + break; + } + + // extend if necessary - do not store data for layer 0 (it's not used) + result.data.resize(std::max(result.data.size(), static_cast(result.n_embd * layer_idx)), 0.0f); + + const float * src = (const float *) tensor->data; + float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0] + for (int j = 0; j < result.n_embd; j++) { + dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file + } + } + if (result.n_embd == -1) { + fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + result.data.clear(); + } + + gguf_free(ctx_gguf); + ggml_free(ctx); + return result; } @@ -2933,16 +2912,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector & chat, diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 67598b561..2758214fa 100755 --- 
a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -85,6 +85,7 @@ models = [ {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, + {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index c9e6ebf30..b14a702c8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -490,6 +490,9 @@ class Model: if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": # ref: https://huggingface.co/THUDM/glm-4-9b-chat res = "chatglm-bpe" + if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": + # ref: https://huggingface.co/LumiOpen/Viking-7B + res = "viking" if res is None: logger.warning("\n") diff --git a/examples/llama.android/llama/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt deleted file mode 100644 index a5618cac0..000000000 --- a/examples/llama.android/llama/CMakeLists.txt +++ /dev/null @@ -1,55 +0,0 @@ - -# For more information about using CMake with Android Studio, read the -# documentation: https://d.android.com/studio/projects/add-native-code.html. -# For more examples on how to use CMake, see https://github.com/android/ndk-samples. - -# Sets the minimum CMake version required for this project. -cmake_minimum_required(VERSION 3.22.1) - -# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, -# Since this is the top level CMakeLists.txt, the project name is also accessible -# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level -# build script scope). 
-project("llama-android") - -## Fetch latest llama.cpp from GitHub -#include(FetchContent) -#FetchContent_Declare( -# llama -# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp -# GIT_TAG master -#) -# -## Also provides "common" -#FetchContent_MakeAvailable(llama) - -# llama.cpp CI uses the code from the current branch -# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700 -add_subdirectory(../../../../../../ build-llama) - -# Creates and names a library, sets it as either STATIC -# or SHARED, and provides the relative paths to its source code. -# You can define multiple libraries, and CMake builds them for you. -# Gradle automatically packages shared libraries with your APK. -# -# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define -# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} -# is preferred for the same purpose. -# -# In order to load a library into your app from Java/Kotlin, you must call -# System.loadLibrary() and pass the name of the library defined here; -# for GameActivity/NativeActivity derived applications, the same library name must be -# used in the AndroidManifest.xml file. -add_library(${CMAKE_PROJECT_NAME} SHARED - # List C/C++ source files with relative paths to this CMakeLists.txt. - llama-android.cpp) - -# Specifies libraries CMake should link to your target library. You -# can link libraries from various origins, such as libraries defined in this -# build script, prebuilt third-party libraries, or Android system libraries. 
-target_link_libraries(${CMAKE_PROJECT_NAME} - # List libraries link to the target library - llama - common - android - log) diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt index 42ebaad49..2de496574 100644 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1) # build script scope). project("llama-android") -include(FetchContent) -FetchContent_Declare( - llama - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp - GIT_TAG master -) +#include(FetchContent) +#FetchContent_Declare( +# llama +# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp +# GIT_TAG master +#) # Also provides "common" -FetchContent_MakeAvailable(llama) +#FetchContent_MakeAvailable(llama) # Creates and names a library, sets it as either STATIC # or SHARED, and provides the relative paths to its source code. @@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama) # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} # is preferred for the same purpose. 
# + +#load local llama.cpp +add_subdirectory(../../../../../../ build-llama) + # In order to load a library into your app from Java/Kotlin, you must call # System.loadLibrary() and pass the name of the library defined here; # for GameActivity/NativeActivity derived applications, the same library name must be diff --git a/flake.lock b/flake.lock index 5278fb68a..79bb3f63f 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1718318537, - "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=", + "lastModified": 1718895438, + "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420", + "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3", "type": "github" }, "original": { diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 31fcbf139..1396e7a75 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const dim3 block_nums_mmq(nsm, 1, 1); - ggml_cuda_pool & pool = ctx.pool(); + ggml_cuda_pool & pool = ctx.pool(id); ggml_cuda_pool_alloc tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y); if (args.ne01 % mmq_y == 0) { diff --git a/grammars/README.md b/grammars/README.md index 2f685eb6d..40f666240 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -126,19 +126,244 @@ You can use GBNF grammars: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI) -Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, 
https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). +Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555). -Here is also a non-exhaustive list of **unsupported** features: +```bash +llama-cli \ + -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \ + -hff Phi-3-medium-128k-instruct-Q8_0.gguf \ + -j '{ + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1, + "maxLength": 100 + }, + "age": { + "type": "integer", + "minimum": 0, + "maximum": 150 + } + }, + "required": ["name", "age"], + "additionalProperties": false + }, + "minItems": 10, + "maxItems": 100 + }' \ + -p 'Generate a {name, age}[] JSON array with famous actors of all ages.' +``` -- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840 -- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum` - - `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797 -- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs) -- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) -- `string` formats `uri`, `email` +
+ +Show grammar + +You can convert any schema in command-line with: + +```bash +examples/json_schema_to_grammar.py name-age-schema.json +``` + +``` +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +item ::= "{" space item-name-kv "," space item-age-kv "}" space +item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space +item-age-kv ::= "\"age\"" space ":" space item-age +item-name ::= "\"" char{1,100} "\"" space +item-name-kv ::= "\"name\"" space ":" space item-name +root ::= "[" space item ("," space item){9,99} "]" space +space ::= | " " | "\n" [ \t]{0,20} +``` + +
+ +Here is also a list of known limitations (contributions welcome): + +- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). +- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) +- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) +- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number` +- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073) +- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$` +- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs) +- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email` +- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties) + +And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars): + +- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems) - [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains` -- `uniqueItems` - `$anchor` (cf. 
[dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing)) - [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not) - [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas` -- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties) + +### A word about additionalProperties + +> [!WARNING] +> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also because it generates a slower grammar). +> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`). + +If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class: + +```python +# pip install pydantic +import json +from typing import Annotated, List +from pydantic import BaseModel, Extra, Field +class QAPair(BaseModel): + class Config: + extra = 'forbid' # triggers additionalProperties: false in the JSON schema + question: str + concise_answer: str + justification: str + +class Summary(BaseModel): + class Config: + extra = 'forbid' + key_facts: List[Annotated[str, Field(pattern='- .{5,}')]] + question_answers: List[Annotated[List[QAPair], Field(min_items=5)]] + +print(json.dumps(Summary.model_json_schema(), indent=2)) +``` + +
+Show JSON schema & grammar + +```json +{ + "$defs": { + "QAPair": { + "additionalProperties": false, + "properties": { + "question": { + "title": "Question", + "type": "string" + }, + "concise_answer": { + "title": "Concise Answer", + "type": "string" + }, + "justification": { + "title": "Justification", + "type": "string" + } + }, + "required": [ + "question", + "concise_answer", + "justification" + ], + "title": "QAPair", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "key_facts": { + "items": { + "pattern": "^- .{5,}$", + "type": "string" + }, + "title": "Key Facts", + "type": "array" + }, + "question_answers": { + "items": { + "items": { + "$ref": "#/$defs/QAPair" + }, + "minItems": 5, + "type": "array" + }, + "title": "Question Answers", + "type": "array" + } + }, + "required": [ + "key_facts", + "question_answers" + ], + "title": "Summary", + "type": "object" +} +``` + +``` +QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space +QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string +QAPair-justification-kv ::= "\"justification\"" space ":" space string +QAPair-question-kv ::= "\"question\"" space ":" space string +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +dot ::= [^\x0A\x0D] +key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space +key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space +key-facts-item-1 ::= dot +key-facts-kv ::= "\"key_facts\"" space ":" space key-facts +question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? 
"]" space +question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space +question-answers-item-item ::= QAPair +question-answers-kv ::= "\"question_answers\"" space ":" space question-answers +root ::= "{" space key-facts-kv "," space question-answers-kv "}" space +space ::= | " " | "\n" [ \t]{0,20} +string ::= "\"" char* "\"" space +``` + +
+ +If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`. + +Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called). + +```js +import { z } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +const Foo = z.object({ + age: z.number().positive(), + email: z.string().email(), +}).strict(); + +console.log(zodToJsonSchema(Foo)); +``` + +
+Show JSON schema & grammar + +```json +{ + "type": "object", + "properties": { + "age": { + "type": "number", + "exclusiveMinimum": 0 + }, + "email": { + "type": "string", + "format": "email" + } + }, + "required": [ + "age", + "email" + ], + "additionalProperties": false, + "$schema": "http://json-schema.org/draft-07/schema#" +} +``` + +``` +age-kv ::= "\"age\"" space ":" space number +char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} +email-kv ::= "\"email\"" space ":" space string +integral-part ::= [0] | [1-9] [0-9]{0,15} +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +root ::= "{" space age-kv "," space email-kv "}" space +space ::= | " " | "\n" [ \t]{0,20} +string ::= "\"" char* "\"" space +``` + +
diff --git a/include/llama.h b/include/llama.h index 892c22e6f..da562a6c1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -90,6 +90,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16, LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, + LLAMA_VOCAB_PRE_TYPE_VIKING = 18, }; // note: these values should be synchronized with ggml_rope diff --git a/src/llama.cpp b/src/llama.cpp index ad17d5ab5..36c5e681f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2057,6 +2057,7 @@ enum e_model { MODEL_8x22B, MODEL_16x12B, MODEL_10B_128x3_66B, + MODEL_57B_A14B, }; static const size_t kiB = 1024; @@ -4288,6 +4289,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_8x22B: return "8x22B"; case MODEL_16x12B: return "16x12B"; case MODEL_10B_128x3_66B: return "10B+128x3.66B"; + case MODEL_57B_A14B: return "57B.A14B"; default: return "?B"; } } @@ -4609,6 +4611,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_A2_7B; break; + case 28: model.type = e_model::MODEL_57B_A14B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -5100,6 +5103,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "chatglm-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; + } else if ( + tokenizer_pre == "viking") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -5192,10 +5198,10 @@ static void llm_load_vocab( if (gen_name.find("code") != std::string::npos) { if (model.arch == LLM_ARCH_LLAMA && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text == "
"
-              && vocab.id_to_token[32008].text == ""
-              && vocab.id_to_token[32009].text == ""
-              && vocab.id_to_token[32010].text == "") {
+              && vocab.id_to_token[32007].text.find("
") != std::string::npos
+              && vocab.id_to_token[32008].text.find("") != std::string::npos
+              && vocab.id_to_token[32009].text.find("") != std::string::npos
+              && vocab.id_to_token[32010].text.find("") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;
@@ -13909,6 +13915,12 @@ struct llm_tokenizer_bpe {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {