Merge branch 'ggerganov:master' into master
commit 8bd24b2e5c
50 changed files with 4125 additions and 1237 deletions
@@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
     exit 1
 fi
 
-MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
 PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
 USER_NAME="${USER_NAME:-User}"
 AI_NAME="${AI_NAME:-ChatLLaMa}"
@@ -61,9 +61,9 @@ fi
 
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
     echo 'Prompt cache does not exist, building...'
-    # Default batch_size to 8 here for better user feedback during initial prompt processing
+    # Default batch_size to 64 here for better user feedback during initial prompt processing
     ./main 2>>"$LOG" \
-        --batch_size 8 \
+        --batch_size 64 \
         "${OPTS[@]}" \
         --prompt-cache "$PROMPT_CACHE_FILE" \
         --file "$CUR_PROMPT_FILE" \
@@ -132,7 +132,7 @@ while read -e line; do
     # HACK get num tokens from debug message
     # TODO get both messages in one go
     if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-       ! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+       ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
         echo >&2 "Couldn't get number of tokens from ./main output!"
         exit 1
     fi

@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
 --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
 ```
 
-The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
+The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
 
 Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
 If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

@@ -313,7 +313,7 @@ class ModelParams:
     gguf_writer.add_feed_forward_length(self.get_n_ff())
 
 def tensor_name(key, bid=None, suffix=".weight"):
-    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
+    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
 
 class Layer:
     def __init__(self, params, lora_params, bid):

@@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
 
         assert_shape_1d(layer.attention_norm, hparams.n_embd);
         assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
-        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd);
-        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd);
+        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd_gqa());
+        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
         assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
         assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
         assert_shape_2d(layer.w1,             hparams.n_embd, hparams.n_ff);
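A note on the shape change above: with grouped-query attention (GQA) the K and V projections produce fewer heads than Q, so `wk`/`wv` get narrower while `wq` keeps its full width. The sketch below illustrates the arithmetic; it assumes the usual llama.cpp definitions of `n_gqa()` and `n_embd_gqa()` (not shown in this diff) and uses invented example hyperparameters.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the real hparams struct in llama.cpp; only the
// fields needed for the shape arithmetic are reproduced here.
struct hparams_sketch {
    uint32_t n_embd    = 4096;
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 8;  // GQA: 4 query heads share each KV head

    uint32_t n_gqa()      const { return n_head / n_head_kv; }
    uint32_t n_embd_gqa() const { return n_embd / n_gqa(); }
};

int main() {
    hparams_sketch hp;
    // wq keeps an n_embd x n_embd shape, while wk/wv shrink to
    // n_embd x n_embd_gqa(): 4096 / (32 / 8) = 1024 columns instead of 4096.
    assert(hp.n_embd_gqa() == 1024);
    return 0;
}
```

For models without GQA, `n_head_kv == n_head`, so `n_embd_gqa() == n_embd` and the new asserts reduce to the old ones.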
@@ -2,7 +2,7 @@
 
 This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
 
-The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
 
 
 Step 1: Open jeopardy.sh and modify the following:

@@ -28,6 +28,16 @@ configure_file(${_common_path}/../build-info.h
 target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
                                          ${CMAKE_CURRENT_BINARY_DIR})
 
+# If the common project was part of "main-cmake-pkg" the transient
+# defines would automatically be attached.  Because the common func-
+# tionality is separate, but dependent upon the defines, it must be
+# explicitly extracted from the "llama" target.
+#
+get_target_property(_llama_transient_defines llama
+                    INTERFACE_COMPILE_DEFINITIONS)
+
+target_compile_definitions(common PRIVATE "${_llama_transient_defines}")
+
 add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
 target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)

@@ -543,6 +543,9 @@ int main(int argc, char ** argv) {
             if (i > 0) {
                 embd.erase(embd.begin(), embd.begin() + i);
             }
+
+            // remove any "future" tokens that we might have inherited from the session from the KV cache
+            llama_kv_cache_tokens_rm(ctx, n_past, -1);
         }
 
         // evaluate tokens in batches
@@ -667,7 +670,7 @@ int main(int argc, char ** argv) {
             }
             fflush(stdout);
         }
-        // reset color to default if we there is no pending user input
+        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }
@@ -694,10 +697,8 @@ int main(int argc, char ** argv) {
                 if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
                     if (params.interactive) {
                         is_interacting = true;
-                        console::set_display(console::user_input);
                     }
                     is_antiprompt = true;
-                    fflush(stdout);
                     break;
                 }
             }
@@ -721,8 +722,6 @@ int main(int argc, char ** argv) {
 
             is_interacting = true;
             printf("\n");
-            console::set_display(console::user_input);
-            fflush(stdout);
         } else if (params.instruct) {
             is_interacting = true;
         }
@@ -747,6 +746,9 @@ int main(int argc, char ** argv) {
                 printf("%s", buffer.c_str());
             }
 
+            // color user input only
+            console::set_display(console::user_input);
+
             std::string line;
             bool another_line = true;
             do {

@@ -10,6 +10,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <ctime>
 
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
@@ -70,6 +71,26 @@ struct client {
     std::vector<llama_token> tokens_prev;
 };
 
+static void print_date_time() {
+    std::time_t current_time = std::time(nullptr);
+    std::tm* local_time = std::localtime(&current_time);
+    char buffer[80];
+    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
+
+    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+}
+
+// Define a split string function to ...
+static std::vector<std::string> split_string(const std::string& input, char delimiter) {
+    std::vector<std::string> tokens;
+    std::istringstream stream(input);
+    std::string token;
+    while (std::getline(stream, token, delimiter)) {
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
 int main(int argc, char ** argv) {
     srand(1234);
 
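The new `split_string` helper is self-contained enough to try in isolation. A minimal sketch of its behavior follows, with an invented input string; note that it relies on `std::istringstream`, so `<sstream>` must already be reachable in the real file (this change only adds `<ctime>`).

```cpp
#include <cstdio>
#include <sstream>   // std::istringstream, required by split_string
#include <string>
#include <vector>

// Copied verbatim from the hunk above.
static std::vector<std::string> split_string(const std::string& input, char delimiter) {
    std::vector<std::string> tokens;
    std::istringstream stream(input);
    std::string token;
    while (std::getline(stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}

int main() {
    // Mirrors the later use in this file: one prompt per line of the external prompt file.
    const std::string file_contents = "What is GGUF?\nName the llama species.";
    for (const std::string & p : split_string(file_contents, '\n')) {
        std::printf("prompt: %s\n", p.c_str());  // prints each prompt on its own line
    }
    return 0;
}
```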
@@ -104,6 +125,23 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
+    // load the prompts from an external file if there are any
+    if (params.prompt.empty()) {
+        printf("\n\033[32mNo new questions so proceed with built-in defaults.\033[0m\n");
+    } else {
+        // Output each line of the input params.prompts vector and copy to k_prompts
+        int index = 0;
+        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+
+        std::vector<std::string> prompts = split_string(params.prompt, '\n');
+        for (const auto& prompt : prompts) {
+            k_prompts.resize(index + 1);
+            k_prompts[index] = prompt;
+            index++;
+            printf("%3d prompt: %s\n", index, prompt.c_str());
+        }
+    }
+
     fprintf(stderr, "\n\n");
     fflush(stderr);
 
@@ -233,7 +271,7 @@ int main(int argc, char ** argv) {
             client.n_decoded = 0;
             client.i_batch   = batch.n_tokens - 1;
 
-            LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+            LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
 
             g_seq_id += 1;
 
@@ -332,12 +370,12 @@ int main(int argc, char ** argv) {
                 }
 
                 // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
+                llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
 
                 const auto t_main_end = ggml_time_us();
 
-                LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\nResponse: %s\n\n",
-                        client.id, client.seq_id, client.n_prompt, client.n_decoded,
+                LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
+                        client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                         (t_main_end - client.t_start_prompt) / 1e6,
                         (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                         n_cache_miss,
@@ -357,13 +395,21 @@ int main(int argc, char ** argv) {
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("\n\n");
+    print_date_time();
+
+    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    if (params.prompt_file.empty()) {
+        params.prompt_file = "used built-in defaults";
+    }
+    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
 
     LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
     LOG_TEE("Cache misses:        %6d\n", n_cache_miss);
 
-    LOG_TEE("\n\n");
+    LOG_TEE("\n");
 
     llama_print_timings(ctx);

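The summary lines above all use the same conversion: `ggml_time_us()` timestamps are in microseconds, so tokens divided by elapsed microseconds, times 1e6, gives tokens per second. A worked example with invented numbers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical stamps: 512 prompt tokens processed over 2,000,000 us (2 s).
    const int64_t t_main_start = 0, t_main_end = 2000000;
    const int n_total_prompt = 512;
    // Same arithmetic as the LOG_TEE lines: tokens / elapsed_us * 1e6 = t/s.
    const double speed = (double) n_total_prompt / (t_main_end - t_main_start) * 1e6;
    std::printf("speed: %5.2f t/s\n", speed); // prints 256.00
    return 0;
}
```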
@@ -448,7 +448,7 @@ struct llama_server_context
         n_past = common_part(embd, prompt_tokens);
 
         // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
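Several hunks in this merge replace an `n_ctx`/`params.n_ctx` upper bound with `-1` in the KV-cache removal calls. A hedged reading, since the API itself is not shown here: a negative `p1` is treated as "to the end of the sequence", so the removal covers every cached position at or after `p0` regardless of context size. A minimal sketch of the cache-reuse pattern these call sites implement:

```cpp
#include "llama.h"  // llama.cpp API of this period (assumption)

// Sketch, not from this diff: after matching n_past tokens of a cached
// prompt, drop everything at positions >= n_past for the given sequence so
// no stale "future" tokens from a longer previous prompt survive.
static void drop_cached_suffix(struct llama_context * ctx, llama_seq_id seq_id, int n_past) {
    llama_kv_cache_seq_rm(ctx, seq_id, /*p0=*/n_past, /*p1=*/-1);
}
```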
@@ -504,9 +504,11 @@ struct llama_server_context
             });
         }
 
+        bool tg = true;
         while (n_past < embd.size())
         {
             int n_eval = (int)embd.size() - n_past;
+            tg = n_eval == 1;
             if (n_eval > params.n_batch)
             {
                 n_eval = params.n_batch;
@@ -532,98 +534,20 @@ struct llama_server_context
             return result;
         }
 
-        // out of user input, sample next token
-        const float temp = params.temp;
-        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
-        const float top_p = params.top_p;
-        const float tfs_z = params.tfs_z;
-        const float typical_p = params.typical_p;
-        const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-        const float repeat_penalty = params.repeat_penalty;
-        const float alpha_presence = params.presence_penalty;
-        const float alpha_frequency = params.frequency_penalty;
-        const int mirostat = params.mirostat;
-        const float mirostat_tau = params.mirostat_tau;
-        const float mirostat_eta = params.mirostat_eta;
-        const bool penalize_nl = params.penalize_nl;
-        const int32_t n_probs = params.n_probs;
-
         {
             auto *logits = llama_get_logits(ctx);
             auto n_vocab = llama_n_vocab(model);
 
             // Apply params.logit_bias map
             for (const auto &it : params.logit_bias)
             {
                 logits[it.first] += it.second;
             }
 
+            // out of user input, sample next token
             std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
-            {
-                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-            }
+            candidates.reserve(llama_n_vocab(model));
 
-            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
+            result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates);
 
-            // Apply penalties
-            float nl_logit = logits[llama_token_nl(ctx)];
-            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-            llama_sample_repetition_penalty(ctx, &candidates_p,
-                                            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                                            last_n_repeat, repeat_penalty);
-            llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                                                          last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                                                          last_n_repeat, alpha_frequency, alpha_presence);
-            if (!penalize_nl)
-            {
-                logits[llama_token_nl(ctx)] = nl_logit;
-            }
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-            if (grammar != nullptr) {
-                llama_sample_grammar(ctx, &candidates_p, grammar);
-            }
-
-            if (temp <= 0)
-            {
-                // Greedy sampling
-                result.tok = llama_sample_token_greedy(ctx, &candidates_p);
-                if (n_probs > 0)
-                {
-                    llama_sample_softmax(ctx, &candidates_p);
-                }
-            }
-            else
-            {
-                if (mirostat == 1)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    const int mirostat_m = 100;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                }
-                else if (mirostat == 2)
-                {
-                    static float mirostat_mu = 2.0f * mirostat_tau;
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                }
-                else
-                {
-                    // Temperature sampling
-                    size_t min_keep = std::max(1, n_probs);
-                    llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
-                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
-                    llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
-                    llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
-                    llama_sample_temp(ctx, &candidates_p, temp);
-                    result.tok = llama_sample_token(ctx, &candidates_p);
-                }
-            }
-
-            if (grammar != nullptr) {
-                llama_grammar_accept_token(ctx, grammar, result.tok);
-            }
+            const int32_t n_probs = params.n_probs;
+            if (params.temp <= 0 && n_probs > 0)
+            {
+                // For llama_sample_token_greedy we need to sort candidates
+                llama_sample_softmax(ctx, &candidates_p);
+            }
 
             for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
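The large deletion above works because the per-request sampling pipeline moved into the common `llama_sample_token()` helper, which the server now calls with its params, grammar, and recent-token history. For orientation, here is a rough sketch of the dispatch order the deleted block implemented (and which the helper is expected to preserve); the function name and the by-reference `mirostat_mu` are illustrative, not the helper's actual source.

```cpp
#include <algorithm>
#include "common.h"  // gpt_params (assumption: built inside the llama.cpp tree)
#include "llama.h"

// Greedy when temp <= 0; otherwise mirostat v1/v2, or the filter stack.
static llama_token sample_dispatch(llama_context * ctx, llama_token_data_array * cands,
                                   const gpt_params & params, float & mirostat_mu) {
    const float temp = params.temp;
    if (temp <= 0) {
        return llama_sample_token_greedy(ctx, cands);
    }
    if (params.mirostat == 1) {
        llama_sample_temp(ctx, cands, temp);
        return llama_sample_token_mirostat(ctx, cands, params.mirostat_tau,
                                           params.mirostat_eta, /*m=*/100, &mirostat_mu);
    }
    if (params.mirostat == 2) {
        llama_sample_temp(ctx, cands, temp);
        return llama_sample_token_mirostat_v2(ctx, cands, params.mirostat_tau,
                                              params.mirostat_eta, &mirostat_mu);
    }
    // temperature sampling: top-k -> tail-free -> typical -> top-p -> temp
    const size_t min_keep = std::max(1, params.n_probs);
    llama_sample_top_k    (ctx, cands, params.top_k,     min_keep);
    llama_sample_tail_free(ctx, cands, params.tfs_z,     min_keep);
    llama_sample_typical  (ctx, cands, params.typical_p, min_keep);
    llama_sample_top_p    (ctx, cands, params.top_p,     min_keep);
    llama_sample_temp     (ctx, cands, temp);
    return llama_sample_token(ctx, cands);
}
```

Repetition/frequency/presence penalties and grammar constraints, visible in the deleted lines, run before this dispatch inside the helper.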
@@ -633,7 +557,9 @@ struct llama_server_context
 
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(result.tok);
-            num_tokens_predicted++;
+            if (tg) {
+                num_tokens_predicted++;
+            }
         }
 
         // add it to the context
@@ -1011,7 +937,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
             params.use_mmap = false;
         }
         else if (arg == "--lora-scaled")
@@ -1027,7 +953,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
+            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
             params.use_mmap = false;
         }
         else if (arg == "--lora-base")
@@ -1124,8 +1050,6 @@ static json format_timings(llama_server_context &llama)
 {
     const auto timings = llama_get_timings(llama.ctx);
 
-    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
-
     return json{
         {"prompt_n", timings.n_p_eval},
         {"prompt_ms", timings.t_p_eval_ms},

@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
                 LOG("out of drafted tokens\n");
             }
 
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
             llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
             ++n_past_dft;
 
@@ -257,7 +257,7 @@ int main(int argc, char ** argv) {
             }
 
             // evaluate the drafted token on the draft model
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
             llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
             ++n_past_cur;
 
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
         }
 
         // evaluate the target model on the drafted tokens
-        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
+        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
         llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
         ++n_past_tgt;
 
@@ -364,7 +364,7 @@ class ModelParams:
     gguf_writer.add_feed_forward_length(self.get_n_ff())
 
 def tensor_name(key, bid=None):
-    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
+    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
 
 class Layer:
     def __init__(self, params, bid):