llama : streamline embeddings from "non-embedding" models (#8087)

Douglas Hanley 2024-07-05 02:05:56 -05:00 committed by GitHub
parent bcefa03bc0
commit d12f781074
4 changed files with 36 additions and 10 deletions


@@ -13840,7 +13840,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(lctx.inp_mean);
@@ -13872,7 +13872,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(lctx.inp_cls);
@@ -13893,7 +13893,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(lctx.inp_cls);
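
The three hunks above gate the MEAN, CLS and LAST pooling inputs on cparams.embeddings, so llama_set_inputs only prepares inp_mean / inp_cls when the caller actually requested embeddings. A minimal caller-side sketch, not part of this commit (the model path, batch construction and error handling are placeholders):

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                    // request embedding output
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // exercises the inp_mean branch above

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... tokenize a prompt, build a llama_batch, call llama_decode(ctx, batch) ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }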
@@ -14181,14 +14181,15 @@ static int llama_decode_internal(
     std::vector<llama_seq_id *>            seq_id_arr;
     std::vector<std::vector<llama_seq_id>> seq_id;
 
+    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
     // count outputs
-    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
-        n_outputs = n_tokens_all;
-    } else if (batch_all.logits) {
+    if (batch_all.logits && !embd_pooled) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all) {
+    } else if (lctx.logits_all || embd_pooled) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -14234,7 +14235,7 @@ static int llama_decode_internal(
         {
             int32_t n_outputs_new = 0;
 
-            if (u_batch.logits) {
+            if (u_batch.logits && !embd_pooled) {
                 for (uint32_t i = 0; i < n_tokens; i++) {
                     n_outputs_new += u_batch.logits[i] != 0;
                 }
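
With embd_pooled set, batch.logits is ignored during output counting and every token becomes an output, so after decoding the pooled result is read per sequence rather than per token. A short sketch, assuming model, ctx and batch from a setup like the one above and seq_id 0 as the only sequence in the batch:

    const int n_embd = llama_n_embd(model);

    if (llama_decode(ctx, batch) == 0) {
        // one pooled vector of n_embd floats per sequence
        const float * embd = llama_get_embeddings_seq(ctx, 0);
        if (embd != NULL) {
            // use embd[0 .. n_embd-1], e.g. L2-normalize it before computing similarities
        }
    }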
@@ -18533,6 +18534,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
+        /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
@@ -18785,7 +18787,6 @@ struct llama_context * llama_new_context_with_model(
     }
 
     cparams.yarn_attn_factor *= hparams.rope_attn_factor;
-    cparams.causal_attn = hparams.causal_attn;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
         if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -18795,6 +18796,12 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
+    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
+        cparams.causal_attn = hparams.causal_attn;
+    } else {
+        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
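
This final hunk replaces the unconditional cparams.causal_attn = hparams.causal_attn (removed in the previous hunk) with an override driven by the new attention_type parameter: leaving it at LLAMA_ATTENTION_TYPE_UNSPECIFIED keeps the model's own causality flag, anything else forces it. A hedged sketch of the intended use, assuming LLAMA_ATTENTION_TYPE_NON_CAUSAL is the companion enum value added to llama.h alongside the two values visible in this diff:

    // force a causal model to run with non-causal attention for embedding extraction
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings     = true;
    cparams.pooling_type   = LLAMA_POOLING_TYPE_LAST;
    cparams.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; // assumed enum value; if unset, hparams.causal_attn applies

    llama_context * ctx = llama_new_context_with_model(model, cparams);

Leaving attention_type at its default from llama_context_default_params() preserves the previous behaviour.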