llama : streamline embeddings from "non-embedding" models (#8087)
parent bcefa03bc0
commit d12f781074
4 changed files with 36 additions and 10 deletions
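In short: the three pooling-input branches in llama_set_inputs() are now gated on cparams.embeddings as well, llama_decode_internal() folds the pooled-embedding special case into a single embd_pooled flag (pooled batches ignore batch.logits and output every token), and a new attention_type context parameter replaces the unconditional cparams.causal_attn = hparams.causal_attn assignment. Usage sketches follow the relevant hunks below.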
@@ -13840,7 +13840,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(lctx.inp_mean);
@@ -13872,7 +13872,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(lctx.inp_cls);
@@ -13893,7 +13893,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
         const int64_t n_tokens = batch.n_tokens;

         GGML_ASSERT(lctx.inp_cls);
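The three hunks above add a cparams.embeddings gate to the mean, CLS, and last-token pooling inputs, so these tensors are only populated when the caller actually asked for embeddings. A minimal caller-side sketch, assuming the public llama.h API of this era; the model path and one-token prompt are placeholders, and error handling is omitted:

#include "llama.h"

int main(void) {
    llama_backend_init();

    // "model.gguf" is a placeholder path, not part of this commit
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    struct llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                    // required by the gated branches above
    cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // one mean-pooled vector per sequence

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // decode a one-token stand-in prompt as sequence 0
    llama_token tok = llama_token_bos(model);
    llama_decode(ctx, llama_batch_get_one(&tok, 1, 0, 0));

    // with pooling enabled, a single embedding of size llama_n_embd(model)
    // is produced for the whole sequence
    const float * embd = llama_get_embeddings_seq(ctx, 0);
    (void) embd;

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}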
@@ -14181,14 +14181,15 @@ static int llama_decode_internal(
     std::vector<llama_seq_id *> seq_id_arr;
     std::vector<std::vector<llama_seq_id>> seq_id;

+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
     // count outputs
-    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
-        n_outputs = n_tokens_all;
-    } else if (batch_all.logits) {
+    if (batch_all.logits && !embd_pooled) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all) {
+    } else if (lctx.logits_all || embd_pooled) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -14234,7 +14235,7 @@ static int llama_decode_internal(
         {
             int32_t n_outputs_new = 0;

-            if (u_batch.logits) {
+            if (u_batch.logits && !embd_pooled) {
                 for (uint32_t i = 0; i < n_tokens; i++) {
                     n_outputs_new += u_batch.logits[i] != 0;
                 }
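The two decode hunks above fold the pooled-embedding special case into one embd_pooled flag: pooled batches ignore the per-token batch.logits flags and emit an output for every token. A self-contained restatement of the resulting counting rule, with illustrative names rather than the internal ones:

#include <stdbool.h>
#include <stdint.h>

// illustrative mirror of the output counting in llama_decode_internal
static uint32_t count_outputs(
        bool embeddings,        // cparams.embeddings
        bool pooled,            // cparams.pooling_type != LLAMA_POOLING_TYPE_NONE
        const int8_t * logits,  // batch.logits, may be NULL
        bool logits_all,
        uint32_t n_tokens) {
    // pooled embedding: ignore batch.logits and output all tokens
    const bool embd_pooled = embeddings && pooled;

    uint32_t n_outputs = 0;
    if (logits != NULL && !embd_pooled) {
        for (uint32_t i = 0; i < n_tokens; ++i) {
            n_outputs += logits[i] != 0;
        }
    } else if (logits_all || embd_pooled) {
        n_outputs = n_tokens;
    } else {
        n_outputs = 1; // keep last output only
    }
    return n_outputs;
}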
@@ -18533,6 +18534,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type                =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
+        /*.attention_type              =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
@@ -18785,7 +18787,6 @@ struct llama_context * llama_new_context_with_model(
     }

     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
-    cparams.causal_attn = hparams.causal_attn;

     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
         if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -18795,6 +18796,12 @@ struct llama_context * llama_new_context_with_model(
         }
     }

+    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+        cparams.causal_attn = hparams.causal_attn;
+    } else {
+        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
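The last three hunks remove the unconditional cparams.causal_attn = hparams.causal_attn and let the caller override it through the new attention_type field. A short sketch; LLAMA_ATTENTION_TYPE_NON_CAUSAL is assumed to be the third member of the enum introduced with this change, since only UNSPECIFIED and CAUSAL appear in the hunks above:

struct llama_context_params cparams = llama_context_default_params();
cparams.embeddings     = true;
cparams.pooling_type   = LLAMA_POOLING_TYPE_MEAN;

// force bidirectional attention when extracting embeddings from a model
// whose hparams say causal; assumed enum member, see the lead-in above
cparams.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL;

// leaving attention_type at LLAMA_ATTENTION_TYPE_UNSPECIFIED keeps the
// model's own hparams.causal_attn, per the final hunk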