diff --git a/examples/server-embd.py b/examples/server-embd.py
index 7ed7a17d1..e092eda58 100644
--- a/examples/server-embd.py
+++ b/examples/server-embd.py
@@ -2,7 +2,7 @@ import asyncio
 import requests
 import numpy as np
 
-n = 8
+n = 1
 
 result = []
 
@@ -14,6 +14,9 @@ async def main():
     responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
         url= f"{model_url}/embedding",
         json= {"content": str(0)*32}
+        #json= {"content": str(0)*1024}
+        #json= {"content": str(i)*32}
+        #json= {"content": str(i%2)*32}
     ) for i in range(n)])
 
     for response in responses:
diff --git a/llama.cpp b/llama.cpp
index 6245af221..cc506fd7d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2002,7 +2002,6 @@ struct llama_context {
     struct ggml_tensor * inp_KQ_pos;  // F32 [n_ctx]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
     struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
-    struct ggml_tensor * inp_cls;     // I32 [n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -6099,7 +6098,6 @@ struct llm_build_context {
 
         struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
         struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
-        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -6243,12 +6241,20 @@ struct llm_build_context {
         cur = inpL;
 
         // pooling layer
-        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
-            cur = ggml_get_rows(ctx0, cur, inp_cls);
-        } else {
-            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_NONE:
+            case LLAMA_POOLING_TYPE_CLS:
+                {
+                    // nop
+                } break;
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_UNSPECIFIED:
+                {
+                    GGML_ASSERT(false && "Invalid pooling type");
+                } break;
         }
 
         cb(cur, "result_embd", -1);
@@ -8103,22 +8109,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             data[seq_id*n_tokens + i] = div[seq_id];
         }
     }
-
-    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
-        const int64_t n_tokens = batch.n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
-
-        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
-
-        for (int i = 0; i < n_tokens; ++i) {
-            const llama_seq_id seq_id = batch.seq_id[i][0];
-            const llama_pos pos = batch.pos[i];
-            if (pos == 0) {
-                data[seq_id] = i;
-            }
-        }
-    }
 }
 
 static void llama_graph_compute(
@@ -8379,17 +8369,32 @@ static int llama_decode_internal(
                 if (batch.logits[i] == 0) {
                     continue;
                 }
-                switch (hparams.pooling_type) {
+
+                switch (cparams.pooling_type) {
                     case LLAMA_POOLING_TYPE_CLS:
-                        ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*batch.seq_id[i][0])*sizeof(float), n_embd*sizeof(float));
-                        break;
-                    case LLAMA_POOLING_TYPE_MEAN:
+                        {
+                            // find the token with the same seq_id and pos == 0 and use its embeddings
+                            int i_src = -1;
+                            for (int j = 0; j < (int) n_tokens; j++) {
+                                if (batch.seq_id[i][0] == batch.seq_id[j][0] && batch.pos[j] == 0) {
+                                    i_src = j;
+                                    break;
+                                }
+                            }
+
+                            GGML_ASSERT(i_src >= 0);
+
+                            ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i_src)*sizeof(float), n_embd*sizeof(float));
+                        } break;
                     case LLAMA_POOLING_TYPE_NONE:
-                        ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
-                        break;
+                    case LLAMA_POOLING_TYPE_MEAN:
+                        {
+                            ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
+                        } break;
                     default:
-                        GGML_ASSERT(false && "unknown pooling type");
-                        break;
+                        {
+                            GGML_ASSERT(false && "unknown pooling type");
+                        } break;
                 }
             }
         }
@@ -12279,7 +12284,7 @@ struct llama_context * llama_new_context_with_model(
     // graph inputs
     {
         ggml_init_params init_params = {
-            /* .mem_size   */ ggml_tensor_overhead()*8,
+            /* .mem_size   */ ggml_tensor_overhead()*7,
             /* .mem_buffer */ nullptr,
             /* .no_alloc   */ true,
         };
@@ -12292,7 +12297,6 @@ struct llama_context * llama_new_context_with_model(
         ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
         ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
         ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
-        ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
 
         ggml_set_name(ctx->inp_tokens, "inp_tokens");
         ggml_set_name(ctx->inp_embd,   "inp_embd");
@@ -12301,7 +12305,6 @@ struct llama_context * llama_new_context_with_model(
         ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos");
        ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
         ggml_set_name(ctx->inp_mean,    "inp_mean");
-        ggml_set_name(ctx->inp_cls,     "inp_cls");
 
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
         LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
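
Note on the change: CLS pooling no longer reserves a dedicated `inp_cls` graph input (hence the drop from `ggml_tensor_overhead()*8` to `*7`) and no longer gathers rows inside the graph via `ggml_get_rows`; the graph leaves the per-token embeddings untouched for CLS, and `llama_decode_internal` resolves the CLS row on the host when copying results out. Below is a minimal standalone sketch of that host-side selection, not the real llama.cpp API: the hypothetical `toy_batch` stands in for `llama_batch`, and a plain `memcpy` stands in for the asynchronous `ggml_backend_tensor_get_async` copy.

```cpp
// Standalone sketch of the CLS extraction path introduced by this patch.
// toy_batch is a simplified, hypothetical stand-in for llama_batch.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

struct toy_batch {
    int32_t              n_tokens;
    std::vector<int32_t> seq_id; // sequence id of each token (first id only)
    std::vector<int32_t> pos;    // position of each token within its sequence
    std::vector<int8_t>  logits; // non-zero if this token requests an output
};

// embd holds n_tokens rows of n_embd floats (one row per token).
// For every token that requests output, copy the row of the pos == 0
// (CLS) token of the same sequence into out.
static void extract_cls_embeddings(const toy_batch & batch, const float * embd, int n_embd, float * out) {
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        if (batch.logits[i] == 0) {
            continue;
        }

        // find the token with the same seq_id and pos == 0 and use its embeddings
        int32_t i_src = -1;
        for (int32_t j = 0; j < batch.n_tokens; ++j) {
            if (batch.seq_id[i] == batch.seq_id[j] && batch.pos[j] == 0) {
                i_src = j;
                break;
            }
        }

        assert(i_src >= 0 && "no pos == 0 token found for this sequence");

        memcpy(out + (size_t) n_embd*i, embd + (size_t) n_embd*i_src, n_embd*sizeof(float));
    }
}
```

The sketch relies on the same assumption as the patch: the token at `pos == 0` of each sequence is the CLS token (true for BERT-style inputs), and it asserts if a sequence in the batch has no `pos == 0` entry.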