From c702e5593086c42b3fb52ad68e04e37ffe29f61f Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 1 Oct 2024 23:57:13 -0400
Subject: [PATCH] Add llama_token_inp_embd function to embed input tokens

---
 include/llama.h | 13 +++++++++++
 src/llama.cpp   | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe2..1a6f1d74d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -960,6 +960,19 @@ extern "C" {
                             bool   remove_special,
                             bool   unparse_special);
+
+    /// @details Get the input embeddings for a sequence of tokens
+    /// @param tokens The tokens to embed
+    /// @param n_tokens The number of tokens
+    /// @param embeddings The output buffer; it must be large enough to hold
+    ///                   n_tokens * n_embd floats, where n_embd is the model embedding size (llama_n_embd)
+    /// @return Returns 0 on success, a negative number on failure
+    LLAMA_API int32_t llama_token_inp_embd(
+        struct llama_context * ctx,
+                 llama_token * tokens,
+                     int32_t   n_tokens,
+                       float * embeddings);
+
 
     //
     // Chat templates
     //
diff --git a/src/llama.cpp b/src/llama.cpp
index c466cd88b..b9cde30b3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21334,6 +21334,63 @@ int32_t llama_detokenize(
     return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
 }
+int32_t llama_token_inp_embd(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, float * embeddings) {
+    int32_t n_embd = llama_n_embd(&ctx->model);
+    const struct llama_hparams & hparams = ctx->model.hparams;
+    llama_ubatch batch = {};
+    batch.token    = tokens;
+    batch.n_tokens = n_tokens;
+    llm_build_cb cb = [&](struct ggml_tensor *, const char *, int) { };
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, ctx->cparams.n_threads);
+    if (ctx->threadpool) {
+        ggml_backend_cpu_set_threadpool(ctx->backend_cpu, ctx->threadpool);
+    }
+
+    ggml_init_params params = ggml_init_params{
+        /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true
+    };
+
+    ggml_context * ctx0 = ggml_init(params);
+    if (!ctx0) {
+        return -1;
+    }
+
+    ggml_tensor * output = llm_build_inp_embd(
+        ctx0,
+        *ctx,
+        hparams,
+        batch,
+        ctx->model.tok_embd,
+        cb
+    );
+
+    ggml_backend_buffer_type_t buffer_type = ggml_backend_get_default_buffer_type(ctx->backend_cpu);
+    ggml_gallocr_t graph_allocator = ggml_gallocr_new(buffer_type);
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    ggml_set_output(output);
+    ggml_build_forward_expand(gf, output);
+
+    if (!ggml_gallocr_reserve(graph_allocator, gf) || !ggml_gallocr_alloc_graph(graph_allocator, gf)) {
+        ggml_gallocr_free(graph_allocator);
+        ggml_free(ctx0);
+        return -1;
+    }
+
+    ggml_backend_tensor_set(ctx->inp_tokens, tokens, 0, n_tokens * sizeof(int32_t));
+
+    ggml_backend_graph_compute(ctx->backend_cpu, gf);
+
+    ggml_backend_tensor_get(output, embeddings, 0, n_tokens * n_embd * sizeof(float));
+
+    ggml_gallocr_free(graph_allocator);
+    ggml_free(ctx0);
+
+    return 0;
+}
+
 
 //
 // chat templates
 //
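
Example usage, as a minimal sketch: this assumes the patch above is applied; the model
path ("model.gguf"), the prompt, and the fixed token buffer size are placeholders, and
error handling is kept minimal. The buffer passed as embeddings must hold
n_tokens * llama_n_embd(model) floats.

    #include "llama.h"

    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        llama_backend_init();

        // load the model and create a context (path is a placeholder)
        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params());
        if (model == nullptr) {
            return 1;
        }
        llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
        if (ctx == nullptr) {
            llama_free_model(model);
            return 1;
        }

        // tokenize a prompt
        const char * text = "Hello world";
        std::vector<llama_token> tokens(64);
        const int32_t n_tokens = llama_tokenize(model, text, (int32_t) strlen(text),
                tokens.data(), (int32_t) tokens.size(), /*add_special=*/ true, /*parse_special=*/ false);
        if (n_tokens < 0) {
            return 1;
        }
        tokens.resize(n_tokens);

        // the output buffer must hold n_tokens * n_embd floats
        const int32_t n_embd = llama_n_embd(model);
        std::vector<float> embd((size_t) n_tokens * n_embd);

        if (llama_token_inp_embd(ctx, tokens.data(), n_tokens, embd.data()) < 0) {
            fprintf(stderr, "llama_token_inp_embd failed\n");
        } else {
            printf("embedded %d tokens, n_embd = %d, embd[0] = %f\n", n_tokens, n_embd, embd[0]);
        }

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Since the function only builds and runs the token-embedding lookup on the CPU backend, it
does not require a llama_decode call and does not touch the KV cache.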