Merge branch 'master' into compilade/refactor-kv-cache

2024-06-30 15:31:25 -04:00 · 2024-06-30 15:31:25 -04:00 · 10c3c419e9
commit 10c3c419e9
parent 33425a7e1e 9ef0780062
518 changed files with 78202 additions and 66427 deletions
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@ -1,4 +1,4 @@
-set(TARGET embedding)
+set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@ -9,13 +9,53 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
 ```

 ### Windows:

 ```powershell
-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
 ```

 The above command will output space-separated float values.
+
+## extra parameters
+### --embd-normalize $integer$
+| $integer$ | description         | formula |
+|-----------|---------------------|---------|
+| $-1$      | none                |
+| $0$       | max absolute int16  | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
+| $1$       | taxicab             | $\Large{x_i \over\sum \lvert x_i\rvert}$
+| $2$       | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
+| $>2$      | p-norm              | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
+
+### --embd-output-format $'string'$
+| $'string'$ | description                  |  |
+|------------|------------------------------|--|
+| ''         | plain text (legacy format)   | (default)
+| 'array'    | single embeddings            | $[[x_1,...,x_n]]$
+|            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
+| 'json'     | openai style                 |
+| 'json+'    | add cosine similarity matrix |
+
+### --embd-separator $"string"$
+| $"string"$   | |
+|--------------|-|
+| "\n"         | (default)
+| "<#embSep#>" | for example
+| "<#sep#>"    | another example
+
+## examples
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>$null
+```
+
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -7,23 +7,30 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
    std::vector<std::string> lines;
-    std::stringstream ss(s);
-    while (std::getline(ss, line)) {
-        lines.push_back(line);
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
    }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
    return lines;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
    }
 }

-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_past_clear(ctx);

@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

        // try to get sequence embeddings - supported only when pooling_type is not NONE
        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

        float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
    }
 }

@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
    }

    // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);

    // max batch size
    const uint64_t n_batch = params.n_batch;
@ -169,7 +170,7 @@ int main(int argc, char ** argv) {
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            llama_batch_clear(batch);
            p += s;
            s = 0;
@ -182,29 +183,78 @@ int main(int argc, char ** argv) {

    // final batch
    float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

-    // print the first part of the embeddings or for a single prompt, the full embedding
-    fprintf(stdout, "\n");
-    for (int j = 0; j < n_prompts; j++) {
-        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-        }
+    if (params.embd_out.empty()) {
+        // print the first part of the embeddings or for a single prompt, the full embedding
        fprintf(stdout, "\n");
-    }
-
-    // print cosine similarity matrix
-    if (n_prompts > 1) {
-        fprintf(stdout, "\n");
-        printf("cosine similarity matrix:\n\n");
-        for (int i = 0; i < n_prompts; i++) {
-            for (int j = 0; j < n_prompts; j++) {
-                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                fprintf(stdout, "%6.2f ", sim);
+        for (int j = 0; j < n_prompts; j++) {
+            fprintf(stdout, "embedding %d: ", j);
+            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
+                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                }
            }
            fprintf(stdout, "\n");
        }
+
+        // print cosine similarity matrix
+        if (n_prompts > 1) {
+            fprintf(stdout, "\n");
+            printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
+            for (int i = 0; i < n_prompts; i++) {
+                for (int j = 0; j < n_prompts; j++) {
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f ", sim);
+                }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
+                fprintf(stdout, "\n");
+            }
+        }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, "    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n    }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n  ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, "    [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n  ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
    }

    // clean up