find result_norm/result_embd tensors properly; update output allocation logic
commit 1756c4b5b6
parent 010571490f
3 changed files with 17 additions and 11 deletions
@@ -31,8 +31,8 @@ static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tok
     }
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {
-    int n_tokens = tokens.size();
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id, enum llama_pooling_type pooling_type) {
+    size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
         bool logit = needs_logit(pooling_type, i, n_tokens);
         llama_batch_add(batch, tokens[i], i, { seq_id }, logit);
@@ -87,9 +87,9 @@ static bool needs_logit(enum llama_pooling_type pooling_type, int pos, int n_tok
     }
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id, enum llama_pooling_type pooling_type) {
-    int n_tokens = tokens.size();
-    for (size_t i = 0; i < tokens.size(); i++) {
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id, enum llama_pooling_type pooling_type) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
         bool logit = needs_logit(pooling_type, i, n_tokens);
         llama_batch_add(batch, tokens[i], i, { seq_id }, logit);
     }
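For orientation, a hedged usage sketch of the updated helper (not part of the commit): it assumes the batch_add_seq() shown above is in scope, llama_batch_init()/llama_batch_free() from llama.h, llama_batch_add() from the examples' common helpers, and pre-tokenized prompts; the wrapper name embed_prompts and all sizes are illustrative only.

#include <cstdint>
#include <vector>
#include "llama.h"
#include "common.h"

// Hedged sketch: one sequence per prompt, seq_id now carried as llama_seq_id.
static void embed_prompts(const std::vector<std::vector<int32_t>> & inputs,
                          int32_t n_batch, enum llama_pooling_type pooling_type) {
    llama_batch batch = llama_batch_init(n_batch, 0, (int32_t) inputs.size());
    for (llama_seq_id s = 0; s < (llama_seq_id) inputs.size(); ++s) {
        batch_add_seq(batch, inputs[s], s, pooling_type);
    }
    // ... llama_decode() the batch and read the pooled embeddings back ...
    llama_batch_free(batch);
}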
llama.cpp (18 changed lines)
@@ -7436,11 +7436,17 @@ struct llm_build_context {
     }
 
     struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
-        struct ggml_tensor * inp = gf->nodes[gf->n_nodes - 1];
-        if (strcmp(inp->name, "result_embd") != 0) {
-            inp = gf->nodes[gf->n_nodes - 2];
-            GGML_ASSERT(strcmp(inp->name, "result_norm") == 0 && "embeddings tensor not found");
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
         }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
 
         struct ggml_tensor * cur;
 
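The new lookup in append_pooling() is just a backward scan over the graph nodes. As a hedged illustration (not part of the commit), the same search can be factored into a standalone helper; it assumes the gf->nodes/gf->n_nodes fields used in the hunk above, and the helper name is hypothetical.

#include <cstring>
#include <initializer_list>
#include "ggml.h"

// Hedged sketch: return the last graph node whose name matches one of the
// given candidates, or nullptr if none is present.
static struct ggml_tensor * find_last_named_node(struct ggml_cgraph * gf,
                                                 std::initializer_list<const char *> names) {
    for (int i = gf->n_nodes - 1; i >= 0; --i) {
        for (const char * name : names) {
            if (strcmp(gf->nodes[i]->name, name) == 0) {
                return gf->nodes[i];
            }
        }
    }
    return nullptr;
}

// usage, mirroring the new code path:
//   struct ggml_tensor * inp = find_last_named_node(gf, { "result_norm", "result_embd" });
//   GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");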
@@ -12029,8 +12035,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.causal_attn;
-    const bool has_embd   = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings;
 
     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
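To make the allocation change concrete, a hedged arithmetic sketch (not from the commit; all sizes are made up): with the new flags, an embeddings-enabled context allocates no logits buffer at all, and the embedding buffer no longer depends on causal attention or the pooling type.

#include <cstddef>
#include <cstdio>

// Hedged sketch with hypothetical model sizes; mirrors the sizing rules above.
int main() {
    const bool   embeddings    = true;   // cparams.embeddings
    const size_t n_vocab       = 32000;  // hypothetical
    const size_t n_embd        = 4096;   // hypothetical
    const size_t n_outputs_max = 8;      // hypothetical

    const bool has_logits = !embeddings; // new rule: logits only when embeddings are off
    const bool has_embd   =  embeddings; // new rule: embeddings whenever requested

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;

    std::printf("logits: %zu floats, embd: %zu floats\n", logits_size, embd_size); // 0 and 32768
    return 0;
}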