From c702e5593086c42b3fb52ad68e04e37ffe29f61f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 1 Oct 2024 23:57:13 -0400
Subject: [PATCH] Add llama_token_inp_embd function to embed input tokens

---
 include/llama.h | 13 +++++++++++
 src/llama.cpp   | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe2..1a6f1d74d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -960,6 +960,19 @@ extern "C" {
                             bool   remove_special,
                             bool   unparse_special);
+
+    /// @details Get the input embeddings for a sequence of tokens
+    /// @param tokens The tokens to embed
+    /// @param n_tokens The number of tokens
+    /// @param embeddings The output buffer; it must be large enough to hold
+    ///                   n_tokens * n_embd floats, where n_embd is the model embedding size (llama_n_embd)
+    /// @return Returns 0 on success, a negative number on failure
+    LLAMA_API int32_t llama_token_inp_embd(
+        struct llama_context * ctx,
+                 llama_token * tokens,
+                     int32_t   n_tokens,
+                       float * embeddings);
+
 
     //
     // Chat templates
     //
diff --git a/src/llama.cpp b/src/llama.cpp
index c466cd88b..b9cde30b3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21334,6 +21334,63 @@ int32_t llama_detokenize(
     return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
 }
+int32_t llama_token_inp_embd(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, float * embeddings) {
+    int32_t n_embd = llama_n_embd(&ctx->model);
+    const struct llama_hparams & hparams = ctx->model.hparams;
+    llama_ubatch batch = {};
+    batch.token    = tokens;
+    batch.n_tokens = n_tokens;
+    llm_build_cb cb = [&](struct ggml_tensor *, const char *, int) { };
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, ctx->cparams.n_threads);
+    if (ctx->threadpool) {
+        ggml_backend_cpu_set_threadpool(ctx->backend_cpu, ctx->threadpool);
+    }
+
+    ggml_init_params params = ggml_init_params{
+        /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true
+    };
+
+    ggml_context * ctx0 = ggml_init(params);
+    if (!ctx0) {
+        return -1;
+    }
+
+    ggml_tensor * output = llm_build_inp_embd(
+        ctx0,
+        *ctx,
+        hparams,
+        batch,
+        ctx->model.tok_embd,
+        cb
+    );
+
+    ggml_backend_buffer_type_t buffer_type = ggml_backend_get_default_buffer_type(ctx->backend_cpu);
+    ggml_gallocr_t graph_allocator = ggml_gallocr_new(buffer_type);
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    ggml_set_output(output);
+    ggml_build_forward_expand(gf, output);
+
+    if (!ggml_gallocr_reserve(graph_allocator, gf) || !ggml_gallocr_alloc_graph(graph_allocator, gf)) {
+        ggml_gallocr_free(graph_allocator);
+        ggml_free(ctx0);
+        return -1;
+    }
+
+    ggml_backend_tensor_set(ctx->inp_tokens, tokens, 0, n_tokens * sizeof(int32_t));
+
+    ggml_backend_graph_compute(ctx->backend_cpu, gf);
+
+    ggml_backend_tensor_get(output, embeddings, 0, n_tokens * n_embd * sizeof(float));
+
+    ggml_gallocr_free(graph_allocator);
+    ggml_free(ctx0);
+
+    return 0;
+}
+
 
 //
 // chat templates
 //
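
Example usage, as a minimal sketch: this assumes the patch above is applied; the model
path ("model.gguf"), the prompt, and the fixed token buffer size are placeholders, and
error handling is kept minimal. The buffer passed as embeddings must hold
n_tokens * llama_n_embd(model) floats.

    #include "llama.h"

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        llama_backend_init();

        // load the model and create a context (path is a placeholder)
        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
        if (model == nullptr) {
            return 1;
        }
        llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
        if (ctx == nullptr) {
            llama_free_model(model);
            return 1;
        }

        // tokenize a prompt
        const char * text = "Hello world";
        std::vector<llama_token> tokens(64);
        const int32_t n_tokens = llama_tokenize(model, text, (int32_t) strlen(text),
                tokens.data(), (int32_t) tokens.size(), /*add_special=*/ true, /*parse_special=*/ false);
        if (n_tokens < 0) {
            return 1;
        }
        tokens.resize(n_tokens);

        // the output buffer must hold n_tokens * n_embd floats
        const int32_t n_embd = llama_n_embd(model);
        std::vector<float> embd((size_t) n_tokens * n_embd);

        if (llama_token_inp_embd(ctx, tokens.data(), n_tokens, embd.data()) < 0) {
            fprintf(stderr, "llama_token_inp_embd failed\n");
        } else {
            printf("embedded %d tokens, n_embd = %d, embd[0] = %f\n", n_tokens, n_embd, embd[0]);
        }

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Since the function only builds and runs the token-embedding lookup on the CPU backend, it
does not require a llama_decode call and does not touch the KV cache.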