Merge branch 'master' into xsn/minicpm_template2

commit ad84e69dee

14 changed files with 629 additions and 193 deletions

.github/workflows/build.yml (vendored): 6 changes
@@ -47,7 +47,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test

@@ -105,7 +105,7 @@ jobs:
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

- name: Test

@@ -222,7 +222,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build . --config Release -j $(nproc)

- name: Test
@@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS ON)
function (llama_option_depr TYPE OLD NEW)
if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON)
set(${NEW} ON PARENT_SCOPE)
endif()
endfunction()

@@ -96,7 +96,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
Makefile: 2 changes

@@ -45,6 +45,7 @@ BUILD_TARGETS = \
TEST_TARGETS = \
tests/test-autorelease \
tests/test-backend-ops \
tests/test-chat-template \
tests/test-double-float \
tests/test-grad0 \
tests/test-grammar-integration \

@@ -1070,6 +1071,7 @@ clean:
rm -rvf src/*.o
rm -rvf tests/*.o
rm -rvf examples/*.o
rm -rvf common/*.o
rm -rvf *.a
rm -rvf *.dll
rm -rvf *.so
@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
const std::vector<llama_chat_msg> & msgs,
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat;
for (auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});

@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

// error: chat template is not supported
if (res < 0) {
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
} else {
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
}

// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
res = llama_chat_apply_template(
fallback ? nullptr : model,
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}

std::string formatted_chat(buf.data(), res);
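The hunk above relies on a size-negotiation convention: llama_chat_apply_template() returns the number of bytes the formatted chat needs (or a negative value for an unknown template), so the buffer may have to be resized and the call repeated, re-dispatching to the chatml fallback if that is what succeeded the first time. A minimal sketch of that calling convention, assuming only the C API signature visible in the hunk (illustrative, not part of the commit):

```cpp
// Sketch of the probe / resize / fallback pattern used by the wrapper above.
#include <cstdint>
#include <string>
#include <vector>
#include "llama.h"

std::string format_chat(const llama_model * model, const char * tmpl,
                        const std::vector<llama_chat_message> & chat, bool add_ass) {
    std::vector<char> buf(4096);                     // initial guess; the API reports the real size
    int32_t res = llama_chat_apply_template(model, tmpl, chat.data(), chat.size(),
                                            add_ass, buf.data(), buf.size());
    bool fallback = false;
    if (res < 0) {                                   // template not recognized
        res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(),
                                        add_ass, buf.data(), buf.size());
        fallback = true;
    }
    if ((size_t) res > buf.size()) {                 // first call only reported the required length
        buf.resize(res);
        res = llama_chat_apply_template(fallback ? nullptr : model,
                                        fallback ? "chatml" : tmpl,
                                        chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }
    return std::string(buf.data(), res);
}
```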
@@ -2804,125 +2821,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//

static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
int32_t n_tensors;

size_t n_bytes = 0;

uint32_t max_direction_layer = 0;

llama_control_vector_data result = { -1, {} };

// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
};
struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!meta_ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
return result;
}

n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);

// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (layer > max_direction_layer) {
max_direction_layer = layer;
}
} catch (...) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
}

struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor_meta);
} else if (ggml_nelements(tensor_meta) != result.n_embd) {
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ false,
/* .ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}

int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
return result;
}

// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);
for (int i = 0; i < n_tensors; i++) {
std::string name = gguf_get_tensor_name(ctx_gguf, i);

struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
ggml_free(ctx);
return result;
}
int layer_idx = -1;

// do not store data for layer 0 (it's not used)
result.data.resize(result.n_embd * max_direction_layer);

for (uint32_t il = 1; il <= max_direction_layer; il++) {
const std::string name = "direction." + std::to_string(il);
const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());

float * dst = result.data.data() + result.n_embd * (il - 1);

if (tensor) {
const float * src = (const float *) tensor->data;
for (int j = 0; j < result.n_embd; j++) {
dst[j] = src[j] * load_info.strength;
}
} else {
for (int j = 0; j < result.n_embd; j++) {
dst[j] = 0.0f;
// split on '.'
size_t dotpos = name.find('.');
if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
try {
layer_idx = std::stoi(name.substr(dotpos + 1));
} catch (...) {
layer_idx = -1;
}
}
if (layer_idx < 0) {
fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}

struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}

if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}

// extend if necessary - do not store data for layer 0 (it's not used)
result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);

const float * src = (const float *) tensor->data;
float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
for (int j = 0; j < result.n_embd; j++) {
dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
}

}

if (result.n_embd == -1) {
fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}

gguf_free(ctx_gguf);
ggml_free(ctx);

return result;
}
@@ -2933,16 +2912,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
auto cur = llama_control_vector_load_one(info);

if (cur.n_embd == -1) {
return result;
result.n_embd = -1;
break;
}
if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
return result;
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1;
break;
}

if (result.n_embd == -1) {
result = std::move(cur);
} else {
result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
for (size_t i = 0; i < cur.data.size(); i++) {
result.data[i] += cur.data[i];
}

@@ -2950,7 +2932,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
}

if (result.n_embd == -1) {
fprintf(stderr, "%s: no vectors passed\n", __func__);
fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
result.data.clear();
}

return result;
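The two hunks above change the loaders to accumulate rather than replace: each file contributes per-layer direction vectors scaled by its strength, vectors for the same layer are summed element-wise, and the layer-major buffer (layer 1 at offset 0) is extended on demand. A self-contained sketch of that combination rule, using plain std::vector stand-ins instead of the gguf/ggml types (illustrative only, not PR code; the `direction` record is hypothetical):

```cpp
// Stand-in for the accumulation done above: per-layer direction vectors are
// scaled by a per-file strength and summed into one layer-major buffer.
#include <algorithm>
#include <cstddef>
#include <vector>

struct direction { int layer; std::vector<float> v; };   // hypothetical input record

std::vector<float> combine(const std::vector<direction> & dirs, float strength, size_t n_embd) {
    std::vector<float> data;                              // layer 1 starts at offset 0
    for (const auto & d : dirs) {
        if (d.layer <= 0 || d.v.size() != n_embd) continue;   // layer 0 and size mismatches are rejected above
        data.resize(std::max(data.size(), n_embd * (size_t) d.layer), 0.0f);  // extend if necessary
        float * dst = data.data() + n_embd * (d.layer - 1);
        for (size_t j = 0; j < n_embd; j++) {
            dst[j] += d.v[j] * strength;                  // multiple directions for one layer add up
        }
    }
    return data;
}
```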
@@ -380,6 +380,8 @@ struct llama_chat_msg {
bool llama_chat_verify_template(const std::string & tmpl);

// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string llama_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<llama_chat_msg> & chat,
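The comment block above documents the error contract of the C++ wrapper: unknown built-in templates fall back to chatml, unsupported custom templates throw. A short caller-side sketch under those declarations; the roles used and the empty-template convention for "use the model's built-in template" are assumptions taken from the common.cpp hunk earlier, not new API:

```cpp
// Caller-side sketch for the wrapper declared above (illustrative, not part of the diff).
#include <stdexcept>
#include <string>
#include <vector>
#include "common.h"

std::string build_prompt(const llama_model * model, const std::string & tmpl /* "" = model default */) {
    std::vector<llama_chat_msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Write a haiku about GGUF."    },
    };
    try {
        // An empty tmpl selects the model's built-in template; if that template is
        // unknown, the wrapper silently falls back to chatml, as documented above.
        return llama_chat_apply_template(model, tmpl, chat, /* add_ass = */ true);
    } catch (const std::runtime_error &) {
        // Reached only when a non-empty custom tmpl is not supported.
        return std::string();
    }
}
```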
@@ -2340,6 +2340,46 @@ class GemmaModel(Model):
return [(self.map_tensor_name(name), data_torch)]


@Model.register("Gemma2ForCausalLM")
class Gemma2Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA2

def set_vocab(self):
self._set_vocab_llama_hf()
self.gguf_writer.add_add_space_prefix(False)

def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]

self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_key_length(hparams["head_dim"])
self.gguf_writer.add_value_length(hparams["head_dim"])
self.gguf_writer.add_file_type(self.ftype)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

# lm_head is not used in llama.cpp, while autoawq will include this tensor in model
# To prevent errors, skip loading lm_head.weight.
if name == "lm_head.weight":
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
return []

# ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
if name.endswith("norm.weight"):
data_torch = data_torch + 1

return [(self.map_tensor_name(name), data_torch)]


@Model.register("Starcoder2ForCausalLM")
class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
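The `data_torch + 1` adjustment mirrors the referenced Hugging Face implementation, where Gemma's RMSNorm multiplies the normalized activation by `(1 + weight)`; storing `weight + 1` in the GGUF lets llama.cpp apply a plain RMSNorm scale. A toy illustration of the equivalence (not converter code; names are made up for the example):

```cpp
// Toy illustration of the norm-weight offset applied by Gemma2Model.modify_tensors above.
// x_hat is the RMS-normalized activation; w_hf is the weight as stored by Hugging Face.
#include <cassert>

float gemma_norm_hf(float x_hat, float w_hf)   { return x_hat * (1.0f + w_hf); } // HF convention
float gemma_norm_gguf(float x_hat, float w_gg) { return x_hat * w_gg;          } // plain RMSNorm scale

int main() {
    const float x_hat = 0.5f, w_hf = 0.25f;
    // The converter stores w_gg = w_hf + 1, so both conventions give the same output.
    assert(gemma_norm_hf(x_hat, w_hf) == gemma_norm_gguf(x_hat, w_hf + 1.0f));
    return 0;
}
```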
@@ -1,55 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.

# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)

# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")

## Fetch latest llama.cpp from GitHub
#include(FetchContent)
#FetchContent_Declare(
#        llama
#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
#        GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)

# llama.cpp CI uses the code from the current branch
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)

# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)

# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
# build script scope).
project("llama-android")

include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
#include(FetchContent)
#FetchContent_Declare(
#        llama
#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
#        GIT_TAG master
#)

# Also provides "common"
FetchContent_MakeAvailable(llama)
#FetchContent_MakeAvailable(llama)

# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.

@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#

#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)

# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
flake.lock (generated): 6 changes

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1718318537,
"narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
"lastModified": 1718895438,
"narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
"rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
"type": "github"
},
"original": {
@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a

const dim3 block_nums_mmq(nsm, 1, 1);

ggml_cuda_pool & pool = ctx.pool();
ggml_cuda_pool & pool = ctx.pool(id);
ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);

if (args.ne01 % mmq_y == 0) {
@@ -150,6 +150,7 @@ class MODEL_ARCH(IntEnum):
INTERNLM2 = auto()
MINICPM = auto()
GEMMA = auto()
GEMMA2 = auto()
STARCODER2 = auto()
MAMBA = auto()
XVERSE = auto()

@@ -180,10 +181,13 @@ class MODEL_TENSOR(IntEnum):
ATTN_NORM = auto()
ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto()
ATTN_POST_NORM = auto()
ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto()
FFN_GATE_INP_SHEXP = auto()
FFN_NORM = auto()
FFN_PRE_NORM = auto()
FFN_POST_NORM = auto()
FFN_GATE = auto()
FFN_DOWN = auto()
FFN_UP = auto()

@@ -270,6 +274,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",

@@ -303,9 +308,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",

@@ -751,6 +759,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_NORM,
],
MODEL_ARCH.GEMMA2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_PRE_NORM,
MODEL_TENSOR.FFN_POST_NORM,
],
MODEL_ARCH.STARCODER2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -187,6 +187,10 @@ class TensorNameMap:
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
),

MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2
),

# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf

@@ -210,6 +214,16 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
),

# Pre feed-forward norm
MODEL_TENSOR.FFN_PRE_NORM: (
"model.layers.{bid}.pre_feedforward_layernorm", # gemma2
),

# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2
),

MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
@@ -126,19 +126,244 @@ You can use GBNF grammars:
- in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)

Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).

Here is also a non-exhaustive list of **unsupported** features:
```bash
llama-cli \
  -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
  -hff Phi-3-medium-128k-instruct-Q8_0.gguf \
  -j '{
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "name": {
                "type": "string",
                "minLength": 1,
                "maxLength": 100
            },
            "age": {
                "type": "integer",
                "minimum": 0,
                "maximum": 150
            }
        },
        "required": ["name", "age"],
        "additionalProperties": false
    },
    "minItems": 10,
    "maxItems": 100
  }' \
  -p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
```

- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- `string` formats `uri`, `email`
<details>

<summary>Show grammar</summary>

You can convert any schema in command-line with:

```bash
examples/json_schema_to_grammar.py name-age-schema.json
```

```
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
item ::= "{" space item-name-kv "," space item-age-kv "}" space
item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
item-age-kv ::= "\"age\"" space ":" space item-age
item-name ::= "\"" char{1,100} "\"" space
item-name-kv ::= "\"name\"" space ":" space item-name
root ::= "[" space item ("," space item){9,99} "]" space
space ::= | " " | "\n" [ \t]{0,20}
```

</details>

Here is also a list of known limitations (contributions welcome):

- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)

And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):

- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
- `uniqueItems`
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)

### A word about additionalProperties

> [!WARNING]
> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar).
> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`).

If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class:

```python
# pip install pydantic
import json
from typing import Annotated, List
from pydantic import BaseModel, Extra, Field
class QAPair(BaseModel):
    class Config:
        extra = 'forbid' # triggers additionalProperties: false in the JSON schema
    question: str
    concise_answer: str
    justification: str

class Summary(BaseModel):
    class Config:
        extra = 'forbid'
    key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
    question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]

print(json.dumps(Summary.model_json_schema(), indent=2))
```

<details>
<summary>Show JSON schema & grammar</summary>

```json
{
  "$defs": {
    "QAPair": {
      "additionalProperties": false,
      "properties": {
        "question": {
          "title": "Question",
          "type": "string"
        },
        "concise_answer": {
          "title": "Concise Answer",
          "type": "string"
        },
        "justification": {
          "title": "Justification",
          "type": "string"
        }
      },
      "required": [
        "question",
        "concise_answer",
        "justification"
      ],
      "title": "QAPair",
      "type": "object"
    }
  },
  "additionalProperties": false,
  "properties": {
    "key_facts": {
      "items": {
        "pattern": "^- .{5,}$",
        "type": "string"
      },
      "title": "Key Facts",
      "type": "array"
    },
    "question_answers": {
      "items": {
        "items": {
          "$ref": "#/$defs/QAPair"
        },
        "minItems": 5,
        "type": "array"
      },
      "title": "Question Answers",
      "type": "array"
    }
  },
  "required": [
    "key_facts",
    "question_answers"
  ],
  "title": "Summary",
  "type": "object"
}
```

```
QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space
QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
QAPair-justification-kv ::= "\"justification\"" space ":" space string
QAPair-question-kv ::= "\"question\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
dot ::= [^\x0A\x0D]
key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
key-facts-item-1 ::= dot
key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
question-answers-item-item ::= QAPair
question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
root ::= "{" space key-facts-kv "," space question-answers-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```

</details>

If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`.

Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called).

```js
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';

const Foo = z.object({
  age: z.number().positive(),
  email: z.string().email(),
}).strict();

console.log(zodToJsonSchema(Foo));
```

<details>
<summary>Show JSON schema & grammar</summary>

```json
{
  "type": "object",
  "properties": {
    "age": {
      "type": "number",
      "exclusiveMinimum": 0
    },
    "email": {
      "type": "string",
      "format": "email"
    }
  },
  "required": [
    "age",
    "email"
  ],
  "additionalProperties": false,
  "$schema": "http://json-schema.org/draft-07/schema#"
}
```

```
age-kv ::= "\"age\"" space ":" space number
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
email-kv ::= "\"email\"" space ":" space string
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space age-kv "," space email-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
string ::= "\"" char* "\"" space
```

</details>
src/llama.cpp: 201 changes
@@ -217,6 +217,7 @@ enum llm_arch {
LLM_ARCH_INTERNLM2,
LLM_ARCH_MINICPM,
LLM_ARCH_GEMMA,
LLM_ARCH_GEMMA2,
LLM_ARCH_STARCODER2,
LLM_ARCH_MAMBA,
LLM_ARCH_XVERSE,

@@ -257,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
@@ -478,10 +480,12 @@ enum llm_tensor {
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_NORM_2,
LLM_TENSOR_ATTN_OUT_NORM,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_FFN_GATE,
LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_FFN_UP,
@@ -1004,6 +1008,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_GEMMA2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_STARCODER2,
{
@@ -2038,6 +2060,9 @@ enum e_model {
MODEL_8x22B,
MODEL_16x12B,
MODEL_10B_128x3_66B,
MODEL_57B_A14B,
MODEL_9B,
MODEL_27B,
};

static const size_t kiB = 1024;

@@ -2214,6 +2239,7 @@ struct llama_layer {
struct ggml_tensor * attn_q_a_norm;
struct ggml_tensor * attn_kv_a_norm;
struct ggml_tensor * attn_sub_norm;
struct ggml_tensor * attn_post_norm;
struct ggml_tensor * ffn_sub_norm;

// attention

@@ -2237,6 +2263,7 @@ struct llama_layer {
// normalization
struct ggml_tensor * ffn_norm;
struct ggml_tensor * ffn_norm_b;
struct ggml_tensor * ffn_post_norm;
struct ggml_tensor * layer_out_norm;
struct ggml_tensor * layer_out_norm_b;
struct ggml_tensor * ffn_norm_exps;
@@ -4267,6 +4294,9 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_8x22B: return "8x22B";
case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
case MODEL_57B_A14B: return "57B.A14B";
case MODEL_9B: return "9B";
case MODEL_27B: return "27B";
default: return "?B";
}
}

@@ -4588,6 +4618,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 24: model.type = e_model::MODEL_A2_7B; break;
case 28: model.type = e_model::MODEL_57B_A14B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -4668,6 +4699,16 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_GEMMA2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

switch (hparams.n_layer) {
case 42: model.type = e_model::MODEL_9B; break;
case 46: model.type = e_model::MODEL_27B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -6509,6 +6550,40 @@ static bool llm_load_tensors(
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
}
} break;
case LLM_ARCH_GEMMA2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

// output
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

const int64_t n_ff = hparams.n_ff;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

for (uint32_t i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});

layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
}
} break;
case LLM_ARCH_STARCODER2:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10920,6 +10995,125 @@ struct llm_build_context {
return gf;
}

struct ggml_cgraph * build_gemma2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

const int64_t n_embd_head_k = hparams.n_embd_head_k;

struct ggml_tensor * cur;
struct ggml_tensor * inpL;

inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);

// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();

// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

for (int il = 0; il < n_layer; ++il) {
// norm
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);

// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);

struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);

struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);

Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);

Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
cb(Qcur, "Qcur_scaled", il);

Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);

cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}

cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].attn_post_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_post_norm", il);

if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}

struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
cb(sa_out, "sa_out", il);

cur = llm_build_norm(ctx0, sa_out, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);

// feed-forward network
{
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il);
}

cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].ffn_post_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "ffn_post_norm", -1);

cur = ggml_add(ctx0, cur, sa_out);
cur = lctx.cvec.apply_to(ctx0, cur, il);
cb(cur, "l_out", il);

// input for next layer
inpL = cur;
}

cur = inpL;

cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);

// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);

ggml_build_forward_expand(gf, cur);

return gf;
}


struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
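build_gemma2() above differs from the existing build_gemma() mainly in its "sandwich" normalization: both the attention and the FFN sub-blocks are wrapped in a pre-norm and a post-norm before the residual add, using the four per-layer norm tensors loaded earlier. A compact restatement of that wiring with placeholder math, so the data flow is visible without the ggml plumbing (illustrative only; the helper functions are stand-ins, not ggml calls):

```cpp
// Residual wiring of one Gemma2 block, paraphrased from build_gemma2() above.
// rms_norm/attn/ffn are trivial stand-ins so the example compiles; only the
// order of operations is meant to match the graph built in the hunk.
#include <vector>
using vec = std::vector<float>;

static vec rms_norm(vec x, const vec & /*w*/) { return x; }  // stand-in
static vec attn(vec x)                        { return x; }  // stand-in
static vec ffn(vec x)                         { return x; }  // stand-in
static vec add(vec a, const vec & b) { for (size_t i = 0; i < a.size(); i++) a[i] += b[i]; return a; }

vec gemma2_block(const vec & x,
                 const vec & attn_norm, const vec & attn_post_norm,
                 const vec & ffn_norm,  const vec & ffn_post_norm) {
    // attention: pre-norm -> attention -> post-norm -> residual add ("sa_out")
    vec sa_out = add(rms_norm(attn(rms_norm(x, attn_norm)), attn_post_norm), x);
    // feed-forward: pre-norm -> FFN -> post-norm -> residual add
    return add(rms_norm(ffn(rms_norm(sa_out, ffn_norm)), ffn_post_norm), sa_out);
}
```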
@@ -12300,6 +12494,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_gemma();
} break;
case LLM_ARCH_GEMMA2:
{
result = llm.build_gemma2();
} break;
case LLM_ARCH_STARCODER2:
{
result = llm.build_starcoder2();

@@ -17594,6 +17792,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_GPTNEOX:
return LLAMA_ROPE_TYPE_NEOX;

@@ -19486,7 +19685,7 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) {
ss << "<s>assistant\n";
}
} else if (tmpl == "gemma" || tmpl_contains("<start_of_turn>")) {
} else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {