Merge remote-tracking branch 'upstream/master'

2023-10-17 16:31:33 -04:00 · 2023-10-17 16:31:33 -04:00 · fa0f22f14f
commit fa0f22f14f
parent aa2268f4cd cb33f43a2a
16 changed files with 454 additions and 181 deletions
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -209,7 +209,7 @@ llama_print_timings(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@ -49,9 +49,9 @@ inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    return eval_tokens(ctx_llama, tokens, 1, n_past);
 }

// eval_string: copies `str` into a std::string, tokenizes it with ::llama_tokenize, and passes the
// resulting tokens to eval_tokens together with n_batch and n_past.
// This change adds the `add_bos` parameter and forwards it to ::llama_tokenize in place of the
// previously hard-coded `true`.
// NOTE(review): the value returned by eval_tokens is discarded, so this function returns true
// unconditionally — confirm that callers do not rely on it to detect evaluation failures.
-inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
+inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.seed            = params.seed;

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

@ -106,7 +107,8 @@ int main(int argc, char ** argv) {
    }

    // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+
    if (n_img_embd != n_llama_embd) {
        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);

@ -125,14 +127,14 @@ int main(int argc, char ** argv) {

    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;

-    // GG: are we sure that the should be a trailing whitespace at the end of this string?
-    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
+    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
-    eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
-    eval_string(ctx_llama, "\nASSISTANT:",        params.n_batch, &n_past);
+    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);

    // generate the response

+    printf("\n");
+    printf("prompt: '%s'\n", params.prompt.c_str());
    printf("\n");

    for (int i = 0; i < max_tgt_len; i++) {
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -238,7 +238,7 @@ int main(int argc, char ** argv) {

    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
    } else {
        LOG("use session tokens\n");
        embd_inp = session_tokens;
@ -260,10 +260,10 @@ int main(int argc, char ** argv) {
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));

        original_prompt_len = original_inp.size();
@ -320,8 +320,8 @@ int main(int argc, char ** argv) {
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false,   true);

    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
@ -383,6 +383,12 @@ int main(int argc, char ** argv) {
        if (!params.antiprompt.empty()) {
            for (const auto & antiprompt : params.antiprompt) {
                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+                if (params.verbose_prompt) {
+                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+                    for (int i = 0; i < (int) tmp.size(); i++) {
+                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    }
+                }
            }
        }

@ -392,10 +398,22 @@ int main(int argc, char ** argv) {

        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
        }

        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+            if (params.verbose_prompt) {
+                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+                for (int i = 0; i < (int) tmp.size(); i++) {
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                }
+            }
        }
    }
    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
@ -717,7 +735,7 @@ int main(int argc, char ** argv) {
                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
                        // tokenize and inject first reverse prompt
-                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                        is_antiprompt = true;
                    }
@ -744,8 +762,7 @@ int main(int argc, char ** argv) {
                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
+                    printf("%s", params.input_prefix.c_str());
                }

                // color user input only
@ -767,7 +784,6 @@ int main(int argc, char ** argv) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }

@ -782,10 +798,14 @@ int main(int argc, char ** argv) {
                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
                    }

-                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
+                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));

+                    embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                    embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

                    // instruct mode: insert response suffix
                    if (params.instruct) {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -8,10 +8,7 @@

 int main(int argc, char ** argv) {
    gpt_params params;
-    llama_sampling_params & sparams = params.sampling_params;
-    params.seed = 42;
-    params.n_threads = 4;
-    sparams.repeat_last_n = 64;
+
    params.prompt = "The quick brown fox";

    if (!gpt_params_parse(argc, argv, params)) {
@ -25,56 +22,49 @@ int main(int argc, char ** argv) {
    }

    auto n_past = 0;
-    auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
+
+    std::string result0;
+    std::string result1;

    // init
    llama_model * model;
    llama_context * ctx;

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
-    if (model == nullptr) {
-        return 1;
-    }
-    if (ctx == nullptr) {
-        llama_free_model(model);
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }
+
+    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);
-    auto n_prompt_tokens = tokens.size();
-    if (n_prompt_tokens < 1) {
-        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
-        llama_free(ctx);
-        llama_free_model(model);
-        return 1;
-    }

    // evaluate prompt
-    llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
+    llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
+    n_past += tokens.size();

-    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
-    n_past += n_prompt_tokens;
-
-    const size_t state_size = llama_get_state_size(ctx);
-    uint8_t * state_mem = new uint8_t[state_size];
-
-    // Save state (rng, logits, embedding and kv_cache) to file
+    // save state (rng, logits, embedding and kv_cache) to file
    {
-        FILE *fp_write = fopen("dump_state.bin", "wb");
-        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
-        fwrite(state_mem, 1, state_size, fp_write);
-        fclose(fp_write);
+        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
+
+        {
+            FILE *fp_write = fopen("dump_state.bin", "wb");
+            llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
+            fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
+            fclose(fp_write);
+        }
    }

    // save state (last tokens)
-    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
    const auto n_past_saved = n_past;

    // first run
-    printf("\n%s", params.prompt.c_str());
+    printf("\nfirst run: %s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
        auto * logits = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(model);
+
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -83,9 +73,10 @@ int main(int argc, char ** argv) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx, next_token);
-        last_n_tokens_data.push_back(next_token);

        printf("%s", next_token_str.c_str());
+        result0 += next_token_str;
+
        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@ -103,32 +94,28 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    // Load state (rng, logits, embedding and kv_cache) from file
-    {
-        FILE *fp_read = fopen("dump_state.bin", "rb");
-        if (state_size != llama_get_state_size(ctx2)) {
-            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
-            llama_free(ctx2);
-            llama_free_model(model);
-            return 1;
-        }
+    printf("\nsecond run: %s", params.prompt.c_str());

-        const size_t ret = fread(state_mem, 1, state_size, fp_read);
-        if (ret != state_size) {
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+
+        const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        if (ret != state_mem.size()) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
            return 1;
        }

-        llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
+        llama_set_state_data(ctx2, state_mem.data());
+
        fclose(fp_read);
    }

-    delete[] state_mem;
-
    // restore state (last tokens)
-    last_n_tokens_data = last_n_tokens_data_saved;
    n_past = n_past_saved;

    // second run
@ -143,10 +130,11 @@ int main(int argc, char ** argv) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx2, &candidates_p);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);
-        last_n_tokens_data.push_back(next_token);

        printf("%s", next_token_str.c_str());
-        if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
+        result1 += next_token_str;
+
+        if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
@ -155,10 +143,17 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }

-    printf("\n\n");
+    printf("\n");

    llama_free(ctx2);
    llama_free_model(model);

+    if (result0 != result1) {
+        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
+        return 1;
+    }
+
+    fprintf(stderr, "\n%s : success\n", __func__);
+
    return 0;
 }
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -114,8 +114,6 @@ node index.js

    *Options:*

-     `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
-
    `temperature`: Adjust the randomness of the generated text (default: 0.8).

    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
@ -124,7 +122,7 @@ node index.js

    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).

-    `n_keep`: Specify the number of tokens from the prompt to retain when context size is exceeded and tokens need to be discarded.
+    `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
@ -162,42 +160,6 @@ node index.js

    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

-    *Result JSON:*
-
-    Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
-
-    `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
-
-    `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
-
-    `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
-
-    `model`: The path to the model loaded with `-m`
-
-    `prompt`: The provided `prompt`
-
-    `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
-
-    `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
-
-    `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
-
-    `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
-
-    `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
-
-    `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
-
-    `tokens_evaluated`: Number of tokens evaluated in total from the prompt
-
-    `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
-
-    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
-
-    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
-
-    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-
 -   **POST** `/tokenize`: Tokenize a given text.

    *Options:*
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -253,13 +253,14 @@ static void init_model(struct my_llama_model * model) {
    set_param_model(model);

    // measure data size
-    struct ggml_allocr * alloc = NULL;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    alloc_model(alloc, model);
+    size_t size = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
+    }

    // allocate data
-    model->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
-    ggml_allocr_free(alloc);
+    struct ggml_allocr * alloc = NULL;
+    model->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
    alloc_model(alloc, model);
    ggml_allocr_free(alloc);
@ -1094,11 +1095,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);

    // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
+                            tensor_alignment;
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));

    // allocate input tensors