merge master

commit 482bdeaa16

14 changed files with 373 additions and 196 deletions
.github/workflows/build.yml (vendored, 6 changes)

@@ -47,7 +47,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test

@@ -105,7 +105,7 @@ jobs:
 sysctl -a
 # Metal is disabled due to intermittent failures with Github runners not having a GPU:
 # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test

@@ -222,7 +222,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build . --config Release -j $(nproc)

 - name: Test
@@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS ON)
 function (llama_option_depr TYPE OLD NEW)
 if (${OLD})
 message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-set(${NEW} ON)
+set(${NEW} ON PARENT_SCOPE)
 endif()
 endfunction()

@@ -96,7 +96,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
Makefile (2 changes)

@@ -45,6 +45,7 @@ BUILD_TARGETS = \
TEST_TARGETS = \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-chat-template \
tests/test-double-float \
tests/test-grad0 \
tests/test-grammar-integration \

@@ -1070,6 +1071,7 @@ clean:
rm -rvf src/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf common/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so
@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
 const std::vector<llama_chat_msg> & msgs,
 bool add_ass) {
 int alloc_size = 0;
+bool fallback = false; // indicate if we must fallback to default chatml
 std::vector<llama_chat_message> chat;
 for (auto & msg : msgs) {
 chat.push_back({msg.role.c_str(), msg.content.c_str()});

@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
 // run the first time to get the total output length
 int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

+// error: chat template is not supported
+if (res < 0) {
+if (ptr_tmpl != nullptr) {
+// if the custom "tmpl" is not supported, we throw an error
+// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+throw std::runtime_error("this custom template is not supported");
+} else {
+// If the built-in template is not supported, we default to chatml
+res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+fallback = true;
+}
+}

 // if it turns out that our buffer is too small, we resize it
 if ((size_t) res > buf.size()) {
 buf.resize(res);
-res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+res = llama_chat_apply_template(
+fallback ? nullptr : model,
+fallback ? "chatml" : ptr_tmpl,
+chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 }

 std::string formatted_chat(buf.data(), res);
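The two hunks above make the common-library wrapper fall back to the `chatml` template whenever the model's built-in template is rejected, and the post-resize retry now reuses that fallback choice instead of failing again. A minimal usage sketch of the wrapper, not part of the commit (the helper name and conversation contents are illustrative; an empty `tmpl` selects the model's built-in template):

```cpp
#include "common.h"   // llama_chat_msg and the wrapper declared in the header hunk further below
#include "llama.h"

#include <string>
#include <vector>

// Formats a conversation with the model's own chat template, silently
// falling back to chatml if that template is not recognized.
std::string format_conversation(const llama_model * model) {
    std::vector<llama_chat_msg> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Write a haiku about GGUF."    },
    };
    // add_ass = true appends the assistant prefix so generation can start immediately
    return llama_chat_apply_template(model, /* tmpl = */ "", msgs, /* add_ass = */ true);
}
```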
@@ -2804,125 +2821,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//

static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
int32_t n_tensors;

size_t n_bytes = 0;

uint32_t max_direction_layer = 0;

llama_control_vector_data result = { -1, {} };

// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
};
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!meta_ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
return result;
}

n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);

// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (layer > max_direction_layer) {
max_direction_layer = layer;
}
} catch (...) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
}

struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor_meta);
} else if (ggml_nelements(tensor_meta) != result.n_embd) {
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}

int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
return result;
}

// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf, i);

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(ctx);
return result;
}
int layer_idx = -1;

// do not store data for layer 0 (it's not used)
result.data.resize(result.n_embd * max_direction_layer);

for (uint32_t il = 1; il <= max_direction_layer; il++) {
const std::string name = "direction." + std::to_string(il);
const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());

float * dst = result.data.data() + result.n_embd * (il - 1);

if (tensor) {
const float * src = (const float *) tensor->data;
for (int j = 0; j < result.n_embd; j++) {
dst[j] = src[j] * load_info.strength;
}
} else {
for (int j = 0; j < result.n_embd; j++) {
dst[j] = 0.0f;
// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
layer_idx = std::stoi(name.substr(dotpos + 1));
} catch (...) {
layer_idx = -1;
}
}
if (layer_idx < 0) {
fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}

struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}

if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}

// extend if necessary - do not store data for layer 0 (it's not used)
result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

const float * src = (const float *) tensor->data;
float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
for (int j = 0; j < result.n_embd; j++) {
dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
}

}

if (result.n_embd == -1) {
fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}

gguf_free(ctx_gguf);
ggml_free(ctx);

return result;
}

@@ -2933,16 +2912,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 auto cur = llama_control_vector_load_one(info);

 if (cur.n_embd == -1) {
-return result;
+result.n_embd = -1;
+break;
 }
-if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-return result;
+if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+result.n_embd = -1;
+break;
 }

 if (result.n_embd == -1) {
 result = std::move(cur);
 } else {
+result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
 for (size_t i = 0; i < cur.data.size(); i++) {
 result.data[i] += cur.data[i];
 }

@@ -2950,7 +2932,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 }

 if (result.n_embd == -1) {
-fprintf(stderr, "%s: no vectors passed\n", __func__);
+fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+result.data.clear();
 }

 return result;
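The rewritten loader above replaces the old two-pass scheme (scan metadata, then read data) with a single pass that accumulates every `direction.N` tensor straight into a flat buffer, where layer `N` occupies the slice starting at `(N - 1) * n_embd` and repeated or multi-file directions are summed. A small self-contained sketch of that layout convention; the struct and helper names here are illustrative, not part of the commit:

```cpp
#include <cstddef>
#include <vector>

// illustrative stand-in for llama_control_vector_data: a flat buffer of
// n_layers * n_embd floats, with layer 1 stored first (layer 0 is never stored)
struct control_vector {
    int n_embd = -1;
    std::vector<float> data;
};

// hypothetical helper: accumulate one direction tensor into the flat buffer
void add_direction(control_vector & cv, int layer_idx, const float * src, float strength) {
    const size_t needed = static_cast<size_t>(cv.n_embd) * layer_idx;
    if (cv.data.size() < needed) {
        cv.data.resize(needed, 0.0f);  // extend if necessary; untouched layers stay zero
    }
    float * dst = cv.data.data() + static_cast<size_t>(cv.n_embd) * (layer_idx - 1);
    for (int j = 0; j < cv.n_embd; j++) {
        dst[j] += src[j] * strength;   // summing lets several files or duplicate layers combine
    }
}
```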
@@ -380,6 +380,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
 const std::string & tmpl,
 const std::vector<llama_chat_msg> & chat,
@@ -85,6 +85,7 @@ models = [
 {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
 {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
 {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
 ]


@@ -490,6 +490,9 @@ class Model:
 if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
 # ref: https://huggingface.co/THUDM/glm-4-9b-chat
 res = "chatglm-bpe"
+if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+# ref: https://huggingface.co/LumiOpen/Viking-7B
+res = "viking"

 if res is None:
 logger.warning("\n")
@@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-# llama
-# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-# GIT_TAG master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-# List C/C++ source files with relative paths to this CMakeLists.txt.
-llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-# List libraries link to the target library
-llama
-common
-android
-log)
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")

-include(FetchContent)
-FetchContent_Declare(
-llama
-GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-GIT_TAG master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+# llama
+# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+# GIT_TAG master
+#)

 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)

 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.

@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
flake.lock (generated, 6 changes)

@@ -20,11 +20,11 @@
 },
 "nixpkgs": {
 "locked": {
-"lastModified": 1718318537,
-"narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+"lastModified": 1718895438,
+"narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
 "owner": "NixOS",
 "repo": "nixpkgs",
-"rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+"rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
 "type": "github"
 },
 "original": {
@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a

 const dim3 block_nums_mmq(nsm, 1, 1);

-ggml_cuda_pool & pool = ctx.pool();
+ggml_cuda_pool & pool = ctx.pool(id);
 ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);

 if (args.ne01 % mmq_y == 0) {
@@ -126,19 +126,244 @@ You can use GBNF grammars:
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)

Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).

Here is also a non-exhaustive list of **unsupported** features:
```bash
llama-cli \
-hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
-hff Phi-3-medium-128k-instruct-Q8_0.gguf \
-j '{
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1,
"maxLength": 100
},
"age": {
"type": "integer",
"minimum": 0,
"maximum": 150
}
},
"required": ["name", "age"],
"additionalProperties": false
},
"minItems": 10,
"maxItems": 100
}' \
-p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
```

- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- `string` formats `uri`, `email`
<details>

<summary>Show grammar</summary>

You can convert any schema in command-line with:

```bash
examples/json_schema_to_grammar.py name-age-schema.json
```

```
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
item ::= "{" space item-name-kv "," space item-age-kv "}" space
item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
item-age-kv ::= "\"age\"" space ":" space item-age
item-name ::= "\"" char{1,100} "\"" space
item-name-kv ::= "\"name\"" space ":" space item-name
root ::= "[" space item ("," space item){9,99} "]" space
space ::= | " " | "\n" [ \t]{0,20}
```

</details>

Here is also a list of known limitations (contributions welcome):

- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)

And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):

- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
- `uniqueItems`
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)

### A word about additionalProperties

> [!WARNING]
> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar).
> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`).

If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class:

```python
# pip install pydantic
import json
from typing import Annotated, List
from pydantic import BaseModel, Extra, Field
class QAPair(BaseModel):
    class Config:
        extra = 'forbid' # triggers additionalProperties: false in the JSON schema
    question: str
    concise_answer: str
    justification: str

class Summary(BaseModel):
    class Config:
        extra = 'forbid'
    key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
    question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]

print(json.dumps(Summary.model_json_schema(), indent=2))
```

<details>
<summary>Show JSON schema & grammar</summary>

```json
{
"$defs": {
"QAPair": {
"additionalProperties": false,
"properties": {
"question": {
"title": "Question",
"type": "string"
},
"concise_answer": {
"title": "Concise Answer",
"type": "string"
},
"justification": {
"title": "Justification",
"type": "string"
}
},
"required": [
"question",
"concise_answer",
"justification"
],
"title": "QAPair",
"type": "object"
}
},
"additionalProperties": false,
"properties": {
"key_facts": {
"items": {
"pattern": "^- .{5,}$",
"type": "string"
},
"title": "Key Facts",
"type": "array"
},
"question_answers": {
"items": {
"items": {
"$ref": "#/$defs/QAPair"
},
"minItems": 5,
"type": "array"
},
"title": "Question Answers",
"type": "array"
}
},
"required": [
"key_facts",
"question_answers"
],
"title": "Summary",
"type": "object"
}
```

```
QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space
QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
QAPair-justification-kv ::= "\"justification\"" space ":" space string
QAPair-question-kv ::= "\"question\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
dot ::= [^\x0A\x0D]
key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
key-facts-item-1 ::= dot
key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
question-answers-item-item ::= QAPair
question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
root ::= "{" space key-facts-kv "," space question-answers-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```

</details>

If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`.

Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called).

```js
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';

const Foo = z.object({
age: z.number().positive(),
email: z.string().email(),
}).strict();

console.log(zodToJsonSchema(Foo));
```

<details>
<summary>Show JSON schema & grammar</summary>

```json
{
"type": "object",
"properties": {
"age": {
"type": "number",
"exclusiveMinimum": 0
},
"email": {
"type": "string",
"format": "email"
}
},
"required": [
"age",
"email"
],
"additionalProperties": false,
"$schema": "http://json-schema.org/draft-07/schema#"
}
```

```
age-kv ::= "\"age\"" space ":" space number
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
email-kv ::= "\"email\"" space ":" space string
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space age-kv "," space email-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```

</details>
@@ -90,6 +90,7 @@ extern "C" {
 LLAMA_VOCAB_PRE_TYPE_PORO = 15,
 LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
 };

 // note: these values should be synchronized with ggml_rope
@@ -2057,6 +2057,7 @@ enum e_model {
 MODEL_8x22B,
 MODEL_16x12B,
 MODEL_10B_128x3_66B,
+MODEL_57B_A14B,
 };

 static const size_t kiB = 1024;

@@ -4288,6 +4289,7 @@ static const char * llama_model_type_name(e_model type) {
 case MODEL_8x22B: return "8x22B";
 case MODEL_16x12B: return "16x12B";
 case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+case MODEL_57B_A14B: return "57B.A14B";
 default: return "?B";
 }
 }

@@ -4609,6 +4611,7 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 switch (hparams.n_layer) {
 case 24: model.type = e_model::MODEL_A2_7B; break;
+case 28: model.type = e_model::MODEL_57B_A14B; break;
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;

@@ -5100,6 +5103,9 @@ static void llm_load_vocab(
 } else if (
 tokenizer_pre == "chatglm-bpe") {
 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+} else if (
+tokenizer_pre == "viking") {
+vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
 } else {
 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 }

@@ -5192,10 +5198,10 @@ static void llm_load_vocab(
 if (gen_name.find("code") != std::string::npos) {
 if (model.arch == LLM_ARCH_LLAMA
 && 32010 < vocab.id_to_token.size()
-&& vocab.id_to_token[32007].text == "<PRE>"
-&& vocab.id_to_token[32008].text == "<SUF>"
-&& vocab.id_to_token[32009].text == "<MID>"
-&& vocab.id_to_token[32010].text == "<EOT>") {
+&& vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+&& vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+&& vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+&& vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
 vocab.special_prefix_id = 32007;
 vocab.special_suffix_id = 32008;
 vocab.special_middle_id = 32009;
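The last hunk above relaxes the CodeLlama infill-token detection from exact equality to a substring match, so token texts that carry extra characters around the `<PRE>` / `<SUF>` / `<MID>` / `<EOT>` markers are still recognized. A tiny standalone illustration; the stored token text shown here is hypothetical:

```cpp
#include <iostream>
#include <string>

int main() {
    // hypothetical vocab entry whose text carries a leading character around the marker
    const std::string text = " <PRE>";

    const bool old_check = (text == "<PRE>");                          // exact match fails
    const bool new_check = (text.find("<PRE>") != std::string::npos);  // substring match succeeds

    std::cout << old_check << " " << new_check << "\n";  // prints: 0 1
    return 0;
}
```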
@@ -13909,6 +13915,12 @@ struct llm_tokenizer_bpe {
 "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 };
 break;
+case LLAMA_VOCAB_PRE_TYPE_VIKING:
+regex_exprs = {
+"\\p{N}",
+" ?[^(\\s|.,!?…。,、।۔،)]+",
+};
+break;
 default:
 // default regex for BPE tokenization pre-processing
 regex_exprs = {