Andrei 2024-11-04 00:29:20 +08:00 committed by GitHub
commit b80781bf1e
4 changed files with 94 additions and 6 deletions


@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_EMBD_SCALE "clip.embeddings_scale"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -548,6 +549,7 @@ struct clip_ctx {
float image_mean[3];
float image_std[3];
float embeddings_scale = 1.0f;
bool use_gelu = false;
int32_t ftype = 1;
@@ -786,11 +788,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
// paligemma missing second linear layer
if (model.mm_2_w) {
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -1019,6 +1023,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
}
if (ctx->embeddings_scale != 1.0f) {
embeddings = ggml_scale(ctx0, embeddings, ctx->embeddings_scale);
}
// build the graph
ggml_build_forward_expand(gf, embeddings);
@@ -1320,6 +1328,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->image_std[i] = std_data[i];
}
try {
new_clip->embeddings_scale = get_f32(ctx, KEY_EMBD_SCALE);
} catch (const std::exception& /*e*/) {
new_clip->embeddings_scale = 1.0f;
}
if (verbosity >= 2) {
LOG_INF("\n%s: vision model hparams\n", __func__);
LOG_INF("image_size %d\n", hparams.image_size);
@@ -2596,6 +2610,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_model_peg_0_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
// paligemma missing second linear layer
if (ctx->vision_model.mm_2_b == nullptr) {
return ctx->vision_model.mm_0_b->ne[0];
}
return ctx->vision_model.mm_2_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
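For context, the new clip.embeddings_scale value is read from GGUF metadata and defaults to 1.0f when absent (see the try/catch added in clip_model_load above). Below is a minimal, hypothetical sketch of how a conversion step could write that key with ggml's GGUF writer; the header location, function name, and the rest of the conversion are assumptions, not part of this commit:

#include "ggml.h"  // gguf_* writer API, assumed to be declared here in this tree

// Store the projector scale under the key that clip_model_load() reads via KEY_EMBD_SCALE.
// Loaders that do not find the key keep the default scale of 1.0f.
static void write_embeddings_scale(const char * fname, float scale) {
    struct gguf_context * gctx = gguf_init_empty();
    gguf_set_val_f32(gctx, "clip.embeddings_scale", scale);
    // ... remaining mmproj metadata and tensors would be added here ...
    gguf_write_to_file(gctx, fname, /*only_meta =*/ true);
    gguf_free(gctx);
}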


@@ -13449,7 +13449,7 @@ static void ggml_compute_forward_get_rows_f32(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
GGML_ASSERT(i01 >= 0 && i01 < ne01);
GGML_ASSERT(i01 >= 0 && i01 <= ne01);
ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),


@@ -960,6 +960,19 @@ extern "C" {
bool remove_special,
bool unparse_special);
// @details Get the input embeddings for a sequence of tokens
// @param tokens The tokens to embed
// @param n_tokens The number of tokens
// @param embeddings Output buffer; must be large enough to hold n_tokens * n_embd floats
// @return Returns 0 on success, a negative number on failure
LLAMA_API int32_t llama_token_inp_embd(
struct llama_context * ctx,
llama_token * tokens,
int32_t n_tokens,
float * embeddings);
//
// Chat templates
//
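A caller-side sketch of the new API (illustrative only, not part of this commit): tokenize elsewhere, size the output buffer from llama_n_embd, then call llama_token_inp_embd. The helper name and error handling are assumptions.

#include "llama.h"
#include <cstdio>
#include <vector>

// embed an already-tokenized prompt; returns 0 on success, -1 on failure
static int embed_tokens(llama_context * ctx, const llama_model * model,
                        std::vector<llama_token> & tokens) {
    const int32_t n_embd = llama_n_embd(model);
    // the output buffer must hold n_tokens * n_embd floats
    std::vector<float> embd((size_t) tokens.size() * n_embd);
    if (llama_token_inp_embd(ctx, tokens.data(), (int32_t) tokens.size(), embd.data()) < 0) {
        fprintf(stderr, "llama_token_inp_embd failed\n");
        return -1;
    }
    // embd[i*n_embd ... (i+1)*n_embd) holds the input embedding of tokens[i]
    printf("token 0, dim 0: %f\n", embd[0]);
    return 0;
}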


@@ -21524,6 +21524,63 @@ int32_t llama_detokenize(
return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
int32_t llama_token_inp_embd(struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, float * embeddings) {
int32_t n_embd = llama_n_embd(&ctx->model);
const struct llama_hparams & hparams = ctx->model.hparams;
llama_ubatch batch = {};
batch.token = tokens;
batch.n_tokens = n_tokens;
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
ggml_backend_cpu_set_n_threads(ctx->backend_cpu, ctx->cparams.n_threads);
if (ctx->threadpool) {
ggml_backend_cpu_set_threadpool(ctx->backend_cpu, ctx->threadpool);
}
ggml_init_params params = {
    /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
    /*.mem_buffer =*/ nullptr,
    /*.no_alloc   =*/ true,
};
ggml_context * ctx0 = ggml_init(params);
if (!ctx0) {
return -1;
}
// build a minimal graph node that looks up the input embeddings for the given tokens
ggml_tensor * output = llm_build_inp_embd(
ctx0,
*ctx,
hparams,
batch,
ctx->model.tok_embd,
cb
);
ggml_backend_buffer_type_t buffer_type = ggml_backend_get_default_buffer_type(ctx->backend_cpu);
ggml_gallocr_t graph_allocator = ggml_gallocr_new(buffer_type);
ggml_cgraph * gf = ggml_new_graph(ctx0);
ggml_set_output(output);
ggml_build_forward_expand(gf, output);
if (!ggml_gallocr_reserve(graph_allocator, gf) || !ggml_gallocr_alloc_graph(graph_allocator, gf)) {
ggml_gallocr_free(graph_allocator);
ggml_free(ctx0);
return -1;
}
// upload the token ids, run the embedding lookup, and read the result back
ggml_backend_tensor_set(ctx->inp_tokens, tokens, 0, n_tokens * sizeof(int32_t));
ggml_backend_graph_compute(ctx->backend_cpu, gf);
ggml_backend_tensor_get(output, embeddings, 0, n_tokens * n_embd * sizeof(float));
ggml_gallocr_free(graph_allocator);
ggml_free(ctx0);
return 0;
}
//
// chat templates
//