merge master

toyer 2024-06-28 02:22:08 +00:00
commit 482bdeaa16
14 changed files with 373 additions and 196 deletions


@@ -47,7 +47,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -105,7 +105,7 @@ jobs:
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -222,7 +222,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build . --config Release -j $(nproc)
- name: Test


@@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS ON)
function (llama_option_depr TYPE OLD NEW)
if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON)
set(${NEW} ON PARENT_SCOPE)
endif()
endfunction()
@@ -96,7 +96,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)


@@ -45,6 +45,7 @@ BUILD_TARGETS = \
TEST_TARGETS = \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-chat-template \
tests/test-double-float \
tests/test-grad0 \
tests/test-grammar-integration \
@@ -1070,6 +1071,7 @@ clean:
rm -rvf src/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf common/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so


@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
const std::vector<llama_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicates if we must fall back to default chatml
std::vector<llama_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported
if (res < 0) {
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this check is intentionally redundant, since we cannot be sure the user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
}
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
res = llama_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
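
For reference, here is a minimal usage sketch of the wrapper above (editorial, not part of the commit). It assumes `llama_chat_msg` carries `role`/`content` strings as used above, that an empty `tmpl` selects the model's built-in chat template, and that the caller links against the common library; an unsupported built-in template silently falls back to chatml, while an unsupported custom template throws `std::runtime_error`:

```cpp
// Illustrative sketch only: format a two-message chat with the common wrapper.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

#include "common.h" // assumed to declare llama_chat_msg and the wrapper used above

static std::string format_prompt(const llama_model * model) {
    llama_chat_msg sys;
    sys.role    = "system";
    sys.content = "You are a helpful assistant.";

    llama_chat_msg usr;
    usr.role    = "user";
    usr.content = "Write a haiku about GGUF.";

    const std::vector<llama_chat_msg> msgs = { sys, usr };
    try {
        // empty template string -> use the model's built-in template;
        // add_ass = true asks the template to append the assistant turn prefix
        return llama_chat_apply_template(model, "", msgs, /*add_ass=*/true);
    } catch (const std::runtime_error & err) {
        fprintf(stderr, "unsupported chat template: %s\n", err.what());
        return "";
    }
}
```
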
@@ -2804,125 +2821,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
int32_t n_tensors;
size_t n_bytes = 0;
uint32_t max_direction_layer = 0;
llama_control_vector_data result = { -1, {} };
// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
};
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!meta_ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
return result;
}
n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (layer > max_direction_layer) {
max_direction_layer = layer;
}
} catch (...) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
}
struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor_meta);
} else if (ggml_nelements(tensor_meta) != result.n_embd) {
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
return result;
}
// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf, i);
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(ctx);
return result;
}
int layer_idx = -1;
// do not store data for layer 0 (it's not used)
result.data.resize(result.n_embd * max_direction_layer);
for (uint32_t il = 1; il <= max_direction_layer; il++) {
const std::string name = "direction." + std::to_string(il);
const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
float * dst = result.data.data() + result.n_embd * (il - 1);
if (tensor) {
const float * src = (const float *) tensor->data;
for (int j = 0; j < result.n_embd; j++) {
dst[j] = src[j] * load_info.strength;
}
} else {
for (int j = 0; j < result.n_embd; j++) {
dst[j] = 0.0f;
// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
layer_idx = std::stoi(name.substr(dotpos + 1));
} catch (...) {
layer_idx = -1;
}
}
if (layer_idx < 0) {
fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
// extend if necessary - do not store data for layer 0 (it's not used)
result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
const float * src = (const float *) tensor->data;
float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
for (int j = 0; j < result.n_embd; j++) {
dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
}
}
if (result.n_embd == -1) {
fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}
gguf_free(ctx_gguf);
ggml_free(ctx);
return result;
}
@@ -2933,16 +2912,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
auto cur = llama_control_vector_load_one(info);
if (cur.n_embd == -1) {
return result;
result.n_embd = -1;
break;
}
if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
return result;
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1;
break;
}
if (result.n_embd == -1) {
result = std::move(cur);
} else {
result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
for (size_t i = 0; i < cur.data.size(); i++) {
result.data[i] += cur.data[i];
}
@@ -2950,7 +2932,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
}
if (result.n_embd == -1) {
fprintf(stderr, "%s: no vectors passed\n", __func__);
fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
result.data.clear();
}
return result;
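
To make the flattened layout concrete, here is a small editorial sketch (not part of the commit) of how a caller could look up one layer's direction in the merged result. It relies only on the names used above (`llama_control_vector_load`, `n_embd`, `data`, `fname`, `strength`) and on the convention that layer `il` starts at offset `n_embd * (il - 1)`, with layer 0 never stored; the file name below is hypothetical:

```cpp
// Illustrative sketch only: load one control vector file, then read the
// direction stored for a given layer from the flattened data buffer.
#include <cstddef>
#include <cstdio>
#include <vector>

#include "common.h" // assumed to declare the control vector helpers used above

// layers are 1-based; layer il occupies data[n_embd*(il-1) .. n_embd*il)
static const float * layer_direction(const llama_control_vector_data & cv, int il) {
    if (il < 1 || cv.n_embd <= 0) {
        return nullptr;
    }
    const size_t off = (size_t) cv.n_embd * (size_t) (il - 1);
    if (off + (size_t) cv.n_embd > cv.data.size()) {
        return nullptr; // this layer is not present in any loaded file
    }
    return cv.data.data() + off;
}

int main() {
    llama_control_vector_load_info info;
    info.fname    = "control-vector.gguf"; // hypothetical file name
    info.strength = 0.8f;

    const llama_control_vector_data cv = llama_control_vector_load({ info });
    if (cv.n_embd == -1) {
        fprintf(stderr, "no valid control vector files passed\n");
        return 1;
    }
    if (const float * dir = layer_direction(cv, 10)) {
        printf("layer 10, dim 0: %f\n", dir[0]);
    }
    return 0;
}
```
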


@@ -380,6 +380,8 @@ struct llama_chat_msg {
bool llama_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,


@@ -85,6 +85,7 @@ models = [
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
]


@@ -490,6 +490,9 @@ class Model:
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if res is None:
logger.warning("\n")


@@ -1,55 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
## Fetch latest llama.cpp from GitHub
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# llama.cpp CI uses the code from the current branch
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)


@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
# Also provides "common"
FetchContent_MakeAvailable(llama)
#FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be

flake.lock (generated)

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1718318537,
"narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
"lastModified": 1718895438,
"narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
"rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
"type": "github"
},
"original": {


@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
const dim3 block_nums_mmq(nsm, 1, 1);
ggml_cuda_pool & pool = ctx.pool();
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
if (args.ne01 % mmq_y == 0) {


@@ -126,19 +126,244 @@ You can use GBNF grammars:
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
Here is also a non-exhaustive list of **unsupported** features:
```bash
llama-cli \
-hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
-hff Phi-3-medium-128k-instruct-Q8_0.gguf \
-j '{
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1,
"maxLength": 100
},
"age": {
"type": "integer",
"minimum": 0,
"maximum": 150
}
},
"required": ["name", "age"],
"additionalProperties": false
},
"minItems": 10,
"maxItems": 100
}' \
-p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
```
- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- `string` formats `uri`, `email`
<details>
<summary>Show grammar</summary>
You can convert any schema in command-line with:
```bash
examples/json_schema_to_grammar.py name-age-schema.json
```
```
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
item ::= "{" space item-name-kv "," space item-age-kv "}" space
item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
item-age-kv ::= "\"age\"" space ":" space item-age
item-name ::= "\"" char{1,100} "\"" space
item-name-kv ::= "\"name\"" space ":" space item-name
root ::= "[" space item ("," space item){9,99} "]" space
space ::= | " " | "\n" [ \t]{0,20}
```
</details>
Here is also a list of known limitations (contributions welcome):
- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):
- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
- `uniqueItems`
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
### A word about additionalProperties
> [!WARNING]
> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also because it makes the resulting grammar slower).
> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`).
If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class:
```python
# pip install pydantic
import json
from typing import Annotated, List
from pydantic import BaseModel, Extra, Field
class QAPair(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
question: str
concise_answer: str
justification: str
class Summary(BaseModel):
class Config:
extra = 'forbid'
key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
print(json.dumps(Summary.model_json_schema(), indent=2))
```
<details>
<summary>Show JSON schema & grammar</summary>
```json
{
"$defs": {
"QAPair": {
"additionalProperties": false,
"properties": {
"question": {
"title": "Question",
"type": "string"
},
"concise_answer": {
"title": "Concise Answer",
"type": "string"
},
"justification": {
"title": "Justification",
"type": "string"
}
},
"required": [
"question",
"concise_answer",
"justification"
],
"title": "QAPair",
"type": "object"
}
},
"additionalProperties": false,
"properties": {
"key_facts": {
"items": {
"pattern": "^- .{5,}$",
"type": "string"
},
"title": "Key Facts",
"type": "array"
},
"question_answers": {
"items": {
"items": {
"$ref": "#/$defs/QAPair"
},
"minItems": 5,
"type": "array"
},
"title": "Question Answers",
"type": "array"
}
},
"required": [
"key_facts",
"question_answers"
],
"title": "Summary",
"type": "object"
}
```
```
QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space
QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
QAPair-justification-kv ::= "\"justification\"" space ":" space string
QAPair-question-kv ::= "\"question\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
dot ::= [^\x0A\x0D]
key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
key-facts-item-1 ::= dot
key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
question-answers-item-item ::= QAPair
question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
root ::= "{" space key-facts-kv "," space question-answers-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```
</details>
If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`.
Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called).
```js
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';
const Foo = z.object({
age: z.number().positive(),
email: z.string().email(),
}).strict();
console.log(zodToJsonSchema(Foo));
```
<details>
<summary>Show JSON schema & grammar</summary>
```json
{
"type": "object",
"properties": {
"age": {
"type": "number",
"exclusiveMinimum": 0
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"age",
"email"
],
"additionalProperties": false,
"$schema": "http://json-schema.org/draft-07/schema#"
}
```
```
age-kv ::= "\"age\"" space ":" space number
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
email-kv ::= "\"email\"" space ":" space string
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space age-kv "," space email-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```
</details>
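
If you prefer to drive the C++ converter mentioned above directly (instead of the Python CLI or the server), here is a minimal editorial sketch. It assumes a `json_schema_to_grammar()` helper declared in `common/json-schema-to-grammar.h` that takes an `nlohmann::ordered_json` schema and returns the GBNF grammar as a string:

```cpp
// Illustrative sketch only: convert a strict schema (additionalProperties: false)
// to a GBNF grammar string with the C++ converter from the common library.
#include <cstdio>
#include <string>

#include "json-schema-to-grammar.h" // assumed path: common/json-schema-to-grammar.h

int main() {
    const auto schema = nlohmann::ordered_json::parse(R"({
        "type": "object",
        "properties": {
            "name": { "type": "string" },
            "age":  { "type": "integer", "minimum": 0 }
        },
        "required": ["name", "age"],
        "additionalProperties": false
    })");

    const std::string grammar = json_schema_to_grammar(schema);
    printf("%s\n", grammar.c_str());
    return 0;
}
```
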


@@ -90,6 +90,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
};
// note: these values should be synchronized with ggml_rope


@@ -2057,6 +2057,7 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
MODEL_57B_A14B,
};
static const size_t kiB = 1024;
@@ -4288,6 +4289,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_8x22B: return "8x22B";
case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
case MODEL_57B_A14B: return "57B.A14B";
default: return "?B";
}
}
@@ -4609,6 +4611,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_A2_7B; break;
case 28: model.type = e_model::MODEL_57B_A14B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -5100,6 +5103,9 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "chatglm-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
} else if (
tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -5192,10 +5198,10 @@ static void llm_load_vocab(
if (gen_name.find("code") != std::string::npos) {
if (model.arch == LLM_ARCH_LLAMA
&& 32010 < vocab.id_to_token.size()
&& vocab.id_to_token[32007].text == "<PRE>"
&& vocab.id_to_token[32008].text == "<SUF>"
&& vocab.id_to_token[32009].text == "<MID>"
&& vocab.id_to_token[32010].text == "<EOT>") {
&& vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
&& vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
&& vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
&& vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
vocab.special_prefix_id = 32007;
vocab.special_suffix_id = 32008;
vocab.special_middle_id = 32009;
@@ -13909,6 +13915,12 @@ struct llm_tokenizer_bpe {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = {
"\\p{N}",
" ?[^(\\s|.,!?…。,、।۔،)]+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {