Merge branch 'ggerganov:master' into sgemm_iq4_nl

Commit 1f6e1b0086 by Eve, 2024-06-21 15:27:30 +00:00, committed by GitHub.
9 changed files with 184 additions and 103 deletions

common/common.cpp

@@ -541,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
         else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
         else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
         else { invalid_param = true; }
         return true;
     }
@@ -1869,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "backend" });
     options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
     }

examples/embedding/embedding.cpp

@@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
     return lines;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
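Every token now requests an output (the logits flag is always true), since sequence pooling needs embeddings for the whole sequence rather than just the final token. A minimal call-site sketch, assuming the llama_batch_init/llama_batch_add helpers from common.h (the token vectors and n_batch are hypothetical):

    // pack two prompts into one batch as two separate sequences
    llama_batch batch = llama_batch_init(n_batch, 0, /*n_seq_max*/ 2);
    batch_add_seq(batch, tokens_a, 0); // sequence 0
    batch_add_seq(batch, tokens_b, 1); // sequence 1
    // llama_decode(ctx, batch) then yields one pooled embedding per sequence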
@@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
 
         float * out = output + batch.seq_id[i][0] * n_embd;
         //TODO: I would also add a parameter here to enable normalization or not.
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);

examples/gritlm/gritlm.cpp

@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);
 
     // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);
 
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {
     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
 
-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);
 
     // ### Embedding/Representation ###
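With the new llama_set_embeddings() setter, GritLM no longer needs a context created in embedding mode; one context switches between the two phases at runtime. A sketch of the toggle, assuming ctx was created with default context params:

    // representation pass: bidirectional attention, embeddings on
    llama_set_embeddings(ctx, true);
    llama_set_causal_attn(ctx, false);
    // ... encode() documents and queries, read pooled embeddings ...

    // generation pass: causal attention, embeddings off
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
    // ... generate() autoregressively on the same ctx ...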

examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift

@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {
         messageLog += "\(text)"
 
-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
-        }
-
-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
-
-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }
+
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+
+            await llamaContext.clear()
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
+        }
     }
 
     func bench() async {

examples/retrieval/retrieval.cpp

@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);

ggml-vulkan.cpp

@@ -1745,31 +1745,37 @@ void ggml_vk_instance_init() {
         // Default to using all dedicated GPUs
         for (size_t i = 0; i < devices.size(); i++) {
-            vk::PhysicalDeviceProperties props = devices[i].getProperties();
+            vk::PhysicalDeviceProperties2 new_props;
+            vk::PhysicalDeviceDriverProperties new_driver;
+            vk::PhysicalDeviceIDProperties new_id;
+            new_props.pNext = &new_driver;
+            new_driver.pNext = &new_id;
+            devices[i].getProperties2(&new_props);
 
-            if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
                 // Check if there are two physical devices corresponding to the same GPU
                 auto old_device = std::find_if(
                     vk_instance.device_indices.begin(),
                     vk_instance.device_indices.end(),
-                    [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+                    [&devices, &new_id](const size_t k){
+                        vk::PhysicalDeviceProperties2 old_props;
+                        vk::PhysicalDeviceIDProperties old_id;
+                        old_props.pNext = &old_id;
+                        devices[k].getProperties2(&old_props);
+                        return std::equal(std::begin(old_id.deviceUUID), std::end(old_id.deviceUUID), std::begin(new_id.deviceUUID));
+                    }
                 );
 
                 if (old_device == vk_instance.device_indices.end()) {
                     vk_instance.device_indices.push_back(i);
                 } else {
                     // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
                     // This can cause error when splitting layers aross the devices, need to keep only 1
-                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same device id");
+                    VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same deviceUUID");
 
-                    vk::PhysicalDeviceProperties2 old_prop;
+                    vk::PhysicalDeviceProperties2 old_props;
                     vk::PhysicalDeviceDriverProperties old_driver;
-                    old_prop.pNext = &old_driver;
-                    devices[*old_device].getProperties2(&old_prop);
-
-                    vk::PhysicalDeviceProperties2 new_prop;
-                    vk::PhysicalDeviceDriverProperties new_driver;
-                    new_prop.pNext = &new_driver;
-                    devices[i].getProperties2(&new_prop);
+                    old_props.pNext = &old_driver;
+                    devices[*old_device].getProperties2(&old_props);
 
                     std::map<vk::DriverId, int> driver_priorities {};
                     int old_priority = std::numeric_limits<int>::max();
@@ -1777,7 +1783,7 @@ void ggml_vk_instance_init() {
                     // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
                     // Smaller number -> higher priority
-                    switch (old_prop.properties.vendorID) {
+                    switch (old_props.properties.vendorID) {
                         case VK_VENDOR_ID_AMD:
                             driver_priorities[vk::DriverId::eMesaRadv] = 1;
                             driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
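The deduplication key changes from properties.deviceID to deviceUUID, queried through the Vulkan getProperties2/pNext extension chain. A standalone sketch of that idiom (vulkan.hpp; device stands for any vk::PhysicalDevice):

    vk::PhysicalDeviceProperties2      props2;       // base properties struct
    vk::PhysicalDeviceDriverProperties driver_props; // driverID, driverName, ...
    vk::PhysicalDeviceIDProperties     id_props;     // deviceUUID, driverUUID, ...
    props2.pNext       = &driver_props;              // chain the extension structs
    driver_props.pNext = &id_props;
    device.getProperties2(&props2);                  // one call fills the whole chain
    // id_props.deviceUUID identifies the physical GPU itself, so the same GPU
    // exposed by two different drivers now compares equal and is deduplicated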

llama.cpp (169 changed lines)

@@ -2293,6 +2293,8 @@ struct llama_vocab {
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
+    int max_token_len = 0; // used for optimizing longest token search
+
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
@@ -4939,6 +4941,7 @@ static void llm_load_vocab(
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
+        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
 
         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
@@ -5249,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
 
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q           = %d\n", __func__, hparams.n_lora_q);
@@ -7649,6 +7654,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }
 
+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
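In the MEAN branch, ggml_mul_mat multiplies the transposed [n_embd, n_tokens] activations by an [n_tokens, n_seq] averaging matrix (inp_mean), so each output column is the mean embedding of one sequence. A plain C++ sketch of the same computation, with hypothetical flat buffers:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // out[s*n_embd + e] = mean of embd[t*n_embd + e] over tokens t of sequence s
    void mean_pool(const float * embd, const int * seq_of_row,
                   int n_tokens, int n_embd, int n_seq, float * out) {
        std::vector<int> count(n_seq, 0);
        std::fill(out, out + (std::size_t) n_seq * n_embd, 0.0f);
        for (int t = 0; t < n_tokens; ++t) {
            const int s = seq_of_row[t];
            count[s]++;
            for (int e = 0; e < n_embd; ++e) {
                out[(std::size_t) s * n_embd + e] += embd[(std::size_t) t * n_embd + e];
            }
        }
        for (int s = 0; s < n_seq; ++s) {
            for (int e = 0; e < n_embd; ++e) {
                out[(std::size_t) s * n_embd + e] /= (float) std::max(count[s], 1);
            }
        }
    }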
@@ -8629,8 +8678,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls  = build_inp_cls();
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8805,28 +8852,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);
 
-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -11911,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();
 
     return result;
@@ -12000,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             // (!a || b) is a logical implication (a -> b)
             // !hparams.causal_attn -> !cparams.causal_attn
             (hparams.causal_attn || !cparams.causal_attn) &&
-            "causal attention with embedding models is not supported"
+            "causal attention is not supported by this model"
         );
 
         if (lctx.inp_KQ_mask) {
@@ -12132,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos    pos    = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;
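A worked example of the LAST-pooling index selection above: for each sequence, the batch row holding its highest position wins, and ggml_get_rows later picks those rows as the sequence embeddings. Standalone sketch with a hypothetical two-sequence batch:

    #include <cstdio>
    #include <vector>

    int main() {
        // rows 0..2 hold sequence 0 (positions 0..2), rows 3..4 hold sequence 1
        const std::vector<int> seq = {0, 0, 0, 1, 1};
        const std::vector<int> pos = {0, 1, 2, 0, 1};
        const int n_seq = 2;
        std::vector<int> last_pos(n_seq, -1), last_row(n_seq, -1);
        for (int i = 0; i < (int) seq.size(); ++i) {
            if (pos[i] >= last_pos[seq[i]]) {
                last_pos[seq[i]] = pos[i];
                last_row[seq[i]] = i;
            }
        }
        for (int s = 0; s < n_seq; ++s) {
            printf("seq %d -> row %d\n", s, last_row[s]); // seq 0 -> row 2, seq 1 -> row 4
        }
    }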
@@ -12193,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
-    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
@@ -12324,11 +12385,13 @@ static int llama_decode_internal(
     std::vector<std::vector<llama_seq_id>> seq_id;
 
     // count outputs
-    if (batch_all.logits) {
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+    } else if (lctx.logits_all) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -12459,30 +12522,13 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
         } else if (cparams.embeddings) {
-            // the embeddings could be in the second to last tensor, or any of the previous tensors
-            int i_embd = gf->n_nodes - 2;
-            for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
-                i_embd = gf->n_nodes - i;
-                if (i_embd < 0) { break; }
-                embd = gf->nodes[i_embd];
-            }
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res  = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
             }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12551,11 +12597,10 @@ static int llama_decode_internal(
                         ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                     }
                 } break;
-            case LLAMA_POOLING_TYPE_CLS:
             case LLAMA_POOLING_TYPE_MEAN:
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
                 {
-                    GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
                     // extract sequence embeddings
                     auto & embd_seq_out = lctx.embd_seq;
                     embd_seq_out.clear();
@@ -13448,7 +13493,7 @@ private:
 struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
         const auto & token_map = vocab.token_to_id;
 
         // normalize and split by whitespace
@@ -13457,7 +13502,7 @@ struct llm_tokenizer_wpm {
         // bos token prepended already
 
         // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
             // skip empty words
             if (word.size() == 0) {
                 continue;
@@ -13474,7 +13519,7 @@ struct llm_tokenizer_wpm {
             for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
-                for (int j = n; j > i; j--) {
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                     auto it = token_map.find(word1.substr(i, j - i));
                     if (it != token_map.end()) {
                         output.push_back(it->second);
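The new loop bound caps candidate substrings at the longest token actually present in the vocabulary, turning the per-word scan from worst-case O(n^2) lookups into O(n * max_token_len). Standalone sketch of the bounded greedy longest-match (toy vocabulary; names are hypothetical):

    #include <algorithm>
    #include <string>
    #include <unordered_map>

    // length of the longest vocabulary token starting at word[i], or 0
    int longest_match_len(const std::unordered_map<std::string, int> & vocab,
                          const std::string & word, int i, int max_token_len) {
        const int n = (int) word.size();
        for (int j = std::min(n, i + max_token_len); j > i; j--) { // try longest first
            if (vocab.count(word.substr(i, j - i))) {
                return j - i;
            }
        }
        return 0;
    }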
@@ -13497,7 +13542,8 @@ struct llm_tokenizer_wpm {
         }
     }
 
-    std::vector<std::string> preprocess(const std::string & text) {
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");
@@ -13792,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_cls_id);
                 }
 
+                llm_tokenizer_wpm tokenizer(vocab);
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13799,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_wpm tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -18112,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
 }
 
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }

llama.h

@@ -174,6 +174,7 @@ extern "C" {
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
     };
 
     enum llama_split_mode {
@@ -293,7 +294,6 @@ extern "C" {
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -786,6 +786,10 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 
+    // Set whether the model is in embeddings mode or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
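Minimal usage sketch for the new setter (assumes an initialized ctx and a populated batch; error handling omitted):

    llama_set_embeddings(ctx, true);   // decode produces embeddings, no logits
    if (llama_decode(ctx, batch) == 0) {
        // pooled per-sequence embedding (requires pooling_type != NONE)
        const float * emb = llama_get_embeddings_seq(ctx, 0);
        // ... use emb[0 .. n_embd-1] ...
    }
    llama_set_embeddings(ctx, false);  // back to logits for generation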

unicode.cpp

@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     std::vector<uint32_t> result;
+    result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
         result.push_back(unicode_cpt_from_utf8(utf8, offset));