From 2a24f714975c029ea3cda80af0a2b43f7effd0d8 Mon Sep 17 00:00:00 2001
From: Matt Grosso
Date: Wed, 17 Apr 2024 17:47:25 -0700
Subject: [PATCH] llama_get_embeddings_mean_pooled

---
 llama.cpp | 22 ++++++++++++++++++++++
 llama.h   | 29 +++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index f4f4063cf..f0cad60ee 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -16609,6 +16609,28 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
     return it->second.data();
 }
 
+void llama_get_embeddings_mean_pooled(struct llama_context * ctx, int32_t skip_tokens, int32_t batch_tokens, float * dest) {
+    GGML_ASSERT(dest);
+    GGML_ASSERT(batch_tokens > 0);
+    GGML_ASSERT(skip_tokens >= 0);
+    GGML_ASSERT(skip_tokens < batch_tokens);
+    float inv_tokens_to_pool = 1.0f / (batch_tokens - skip_tokens);
+    GGML_ASSERT(inv_tokens_to_pool > 0.0f);
+    GGML_ASSERT(inv_tokens_to_pool <= 1.0f);
+    float * all_token_embeddings = ctx->embd.data();
+    const llama_model * mdl = llama_get_model(ctx);
+    int32_t n_embd = llama_n_embd(mdl); // length of each embedding
+    for (int32_t i = skip_tokens; i < batch_tokens; i++) {
+        float * token_embedding = all_token_embeddings + i * n_embd;
+        for (int32_t j = 0; j < n_embd; j++) {
+            dest[j] += token_embedding[j];
+        }
+    }
+    for (int32_t i = 0; i < n_embd; i++) {
+        dest[i] *= inv_tokens_to_pool;
+    }
+}
+
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
     GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].text.c_str();
diff --git a/llama.h b/llama.h
index b5da686f7..2f2e31206 100644
--- a/llama.h
+++ b/llama.h
@@ -773,6 +773,35 @@ extern "C" {
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
+    // Get the mean pooled embedding for a subset of the tokens from the encoding.
+    //
+    // The result is not normalized; see llama_embd_normalize for that.
+    //
+    // The mean pooling here is done outside of the device (on the host), so it
+    // works for model types that currently refuse to build a pooling layer on
+    // the device. Currently two large llama embedding models, GritLM and
+    // e5-mistral, are supported; notably, both are initialized via build_llama,
+    // which does not build a pooling layer (inp_mean). Both models rely on
+    // prompts whose tokens contribute to the attention but may or may not be
+    // included in the mean pooling, depending on the application.
+    //
+    // TODO: 1. support inp_mean in llama models when mean pooling is specified,
+    // so the mean can be calculated on the device, and
+    // TODO: 2. also have the context own the destination pooled embedding
+    // memory, to be more consistent with the other APIs, while continuing to
+    // allow the application to control which tokens are skipped.
+    //
+    // skip_tokens: The number of tokens to skip from the beginning of the batch
+    // batch_tokens: The number of tokens in the batch
+    // dest: The destination array to store the mean pooled embedding
+    //
+    // 'dest' must point to a zero-initialized array of length n_embd; the
+    // per-token embeddings are accumulated into it before scaling
+    // 'batch_tokens' - 'skip_tokens' is the number of tokens to pool
+    // [skip_tokens, batch_tokens) is the range of tokens to pool
+    //
+    LLAMA_API void llama_get_embeddings_mean_pooled(struct llama_context * ctx, int32_t skip_tokens, int32_t batch_tokens, float * dest);
+
     //
     // Vocab
     //
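
A caller-side usage sketch follows; it is not part of the patch. It assumes the llama.h API roughly as it stood around this revision (context params with embeddings and pooling_type fields, llama_batch_init/llama_batch_free, and the tokenize/decode signatures of that time); the model path, prompt text, and n_skip value are placeholders an application would supply.

// usage_sketch.cpp -- illustrative only, not part of the patch
#include "llama.h"

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("embedding-model.gguf", mparams); // placeholder path
    if (!model) { return 1; }

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                     // extract embeddings instead of logits
    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;  // no device-side pooling; pool on the host
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // tokenize an instruction prefix plus the text to embed (placeholder prompt)
    const char * text = "Instruction: retrieve relevant passages.\nText: mean pooling averages token embeddings.";
    std::vector<llama_token> tokens(512);
    const int32_t n_tokens = llama_tokenize(model, text, (int32_t) strlen(text),
                                            tokens.data(), (int32_t) tokens.size(),
                                            /*add_special*/ true, /*parse_special*/ false);
    if (n_tokens <= 0) { return 1; }

    // build a batch that requests an output (per-token embedding) for every token,
    // since the pooling function reads the context's full per-token embedding buffer
    llama_batch batch = llama_batch_init(n_tokens, 0, 1);
    for (int32_t i = 0; i < n_tokens; i++) {
        batch.token   [i]    = tokens[i];
        batch.pos     [i]    = i;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i]    = true;
    }
    batch.n_tokens = n_tokens;

    if (llama_decode(ctx, batch) != 0) { return 1; }

    // skip the instruction prefix; placeholder value -- in practice the application
    // would tokenize the prefix separately to know how many leading tokens to exclude
    const int32_t n_skip = 8;
    const int32_t n_embd = llama_n_embd(model);
    std::vector<float> pooled(n_embd, 0.0f);         // zero-initialized: the call accumulates into it
    llama_get_embeddings_mean_pooled(ctx, n_skip, n_tokens, pooled.data());

    printf("pooled[0..2] = %f %f %f\n", pooled[0], pooled[1], pooled[2]);

    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

Note that the destination vector is zero-initialized before the call, matching the documented requirement: the function sums the selected token embeddings into dest and then scales by 1 / (batch_tokens - skip_tokens).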