llama : allow pooled embeddings on any model (#7477)

* create append_pooling operation; allow to specify attention_type; add last token pooling; update examples * find result_norm/result_embd tensors properly; update output allocation logic * only use embd output for pooling_type NONE * get rid of old causal_attn accessor * take out attention_type; add in llama_set_embeddings * bypass logits when doing non-NONE pooling
2024-06-21 00:38:22 -05:00 · 2024-06-21 00:38:22 -05:00 · 80ea089d77
commit 80ea089d77
parent 0e64591e82
6 changed files with 130 additions and 70 deletions
--- a/llama.h
+++ b/llama.h
@ -174,6 +174,7 @@ extern "C" {
        LLAMA_POOLING_TYPE_NONE = 0,
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
+        LLAMA_POOLING_TYPE_LAST = 3,
    };

    enum llama_split_mode {
@ -293,7 +294,6 @@ extern "C" {

        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
-                                                        // (ignored if no pooling layer)

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
@ -786,6 +786,10 @@ extern "C" {
    // Get the number of threads used for prompt and batch processing (multiple token).
    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);

+    // Set whether the model is in embeddings model or not
+    // If true, embeddings will be returned but logits will not
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);