Merge branch 'master' into master

Andrei, 2024-06-27 09:08:44 -07:00 (committed via GitHub)
commit 6913a17bf5
9 changed files with 112 additions and 180 deletions


@@ -2804,125 +2804,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };
 
-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }
 
-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
-
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
-            }
-        }
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
+
+        int layer_idx = -1;
+
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
+            }
+        }
+
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
+        }
     }
 
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
@@ -2933,16 +2895,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
         auto cur = llama_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
         }
-        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
         }
 
         if (result.n_embd == -1) {
             result = std::move(cur);
         } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
             for (size_t i = 0; i < cur.data.size(); i++) {
                 result.data[i] += cur.data[i];
             }
@@ -2950,7 +2915,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     }
 
     if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no vectors passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
     }
 
     return result;
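Taken together, the loader and the merge loop above keep a control vector as one flat buffer: n_embd floats per layer, layer 1 stored at offset 0, layer 0 never stored, and multiple directions for the same layer (within one file or across files) summed in place. A minimal sketch of that indexing follows; the names are illustrative and not part of the patch or the llama.cpp API:

    // Hedged sketch of the flat layout used above: n_embd floats per layer,
    // layer 1 at offset 0. Illustrative names only.
    #include <cstddef>
    #include <vector>

    struct control_vector {
        int n_embd = -1;          // entries per layer; -1 means "not loaded"
        std::vector<float> data;  // n_embd * n_layers floats, layer 1 first
    };

    // Pointer to the direction for 1-based layer index il, or nullptr if the
    // buffer was never extended far enough to cover that layer.
    static const float * control_vector_layer(const control_vector & cv, int il) {
        if (il < 1 || cv.n_embd <= 0) {
            return nullptr;
        }
        const size_t offset = static_cast<size_t>(cv.n_embd) * (il - 1);
        if (offset + static_cast<size_t>(cv.n_embd) > cv.data.size()) {
            return nullptr;
        }
        return cv.data.data() + offset;
    }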


@@ -85,6 +85,7 @@ models = [
     {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
 ]


@@ -487,6 +487,9 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
 
         if res is None:
             logger.warning("\n")


@ -1,55 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
## Fetch latest llama.cpp from GitHub
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# llama.cpp CI uses the code from the current branch
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)


@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be

flake.lock (generated file, 6 lines changed)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1718318537,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "lastModified": 1718895438,
+        "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
         "type": "github"
       },
       "original": {

@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     const dim3 block_nums_mmq(nsm, 1, 1);
 
-    ggml_cuda_pool & pool = ctx.pool();
+    ggml_cuda_pool & pool = ctx.pool(id);
     ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
 
     if (args.ne01 % mmq_y == 0) {
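The one-argument change above takes the fixup buffer from the pool of an explicitly named device (the id already in scope) rather than from the context's default pool. Reduced to a hedged, stand-alone sketch of the general pattern, which is not the ggml-cuda implementation:

    // Keep one allocator per device and select it by the device doing the
    // work, not by the context's default device. Illustrative types only.
    #include <array>
    #include <cassert>
    #include <vector>

    struct device_pool {
        std::vector<char> scratch;  // stand-in for a real per-device memory pool
    };

    struct backend_context {
        int default_device = 0;
        std::array<device_pool, 16> pools;  // one pool per device

        device_pool & pool() { return pools[default_device]; }  // old behaviour
        device_pool & pool(int id) {                             // new behaviour
            assert(id >= 0 && static_cast<size_t>(id) < pools.size());
            return pools[id];
        }
    };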


@@ -88,6 +88,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX       = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG      = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO       = 15,
+        LLAMA_VOCAB_PRE_TYPE_VIKING     = 16,
     };
 
     // note: these values should be synchronized with ggml_rope


@@ -2060,6 +2060,7 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_57B_A14B,
     MODEL_9B,
     MODEL_27B,
 };
@@ -4293,6 +4294,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B:      return "57B.A14B";
         default:                  return "?B";
     }
 }
@@ -4614,6 +4616,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             switch (hparams.n_layer) {
                 case 24: model.type = e_model::MODEL_A2_7B; break;
+                case 28: model.type = e_model::MODEL_57B_A14B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -5103,6 +5106,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "poro-chat") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5188,10 +5194,10 @@ static void llm_load_vocab(
         if (gen_name.find("code") != std::string::npos) {
             if (model.arch == LLM_ARCH_LLAMA
               && 32010 < vocab.id_to_token.size()
-              && vocab.id_to_token[32007].text == "<PRE>"
-              && vocab.id_to_token[32008].text == "<SUF>"
-              && vocab.id_to_token[32009].text == "<MID>"
-              && vocab.id_to_token[32010].text == "<EOT>") {
+              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;
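The relaxed check above matches the CodeLlama infill markers by substring instead of exact equality, so vocab entries that wrap the marker in extra pieces (for example a leading "▁" coming from SentencePiece) are still recognized. A small self-contained illustration, using a hypothetical helper rather than code from the patch:

    // Substring match accepts decorated token text where == would fail.
    #include <cstdio>
    #include <string>

    static bool has_marker(const std::string & token_text, const std::string & marker) {
        return token_text.find(marker) != std::string::npos;
    }

    int main() {
        std::printf("%d\n", has_marker("<PRE>",  "<PRE>"));  // 1: exact text, == also works
        std::printf("%d\n", has_marker("▁<PRE>", "<PRE>"));  // 1: == would have rejected this
        std::printf("%d\n", has_marker("<SUF>",  "<PRE>"));  // 0: unrelated token
        return 0;
    }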
@@ -13896,6 +13902,12 @@ struct llm_tokenizer_bpe {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
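The first Viking rule, "\p{N}", isolates every decimal digit into its own pre-token before the BPE merges run, the same behaviour the Poro preset uses. An ASCII-only sketch of that effect; llama.cpp applies the real rule with its own unicode-aware regex engine, not a helper like this:

    // Hypothetical ASCII-only helper mimicking the "\p{N}" split rule.
    #include <cctype>
    #include <string>
    #include <vector>

    static std::vector<std::string> split_digits(const std::string & text) {
        std::vector<std::string> out;
        std::string cur;
        for (unsigned char c : text) {
            if (std::isdigit(c)) {
                if (!cur.empty()) { out.push_back(cur); cur.clear(); }
                out.push_back(std::string(1, c));  // every digit becomes its own piece
            } else {
                cur.push_back(static_cast<char>(c));
            }
        }
        if (!cur.empty()) { out.push_back(cur); }
        return out;
    }

    // split_digits("year 2024") == {"year ", "2", "0", "2", "4"}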