Merge branch 'master' into compilade/refactor-kv-cache

2024-07-04 17:33:52 -04:00 · 2024-07-04 17:33:52 -04:00 · 9b38f8bf65
commit 9b38f8bf65
parent 10c3c419e9 d7fd29fff1
102 changed files with 4745 additions and 1675 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -757,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cache_type_v = argv[++i];
        return true;
    }
-    if (arg == "--multiline-input") {
+    if (arg == "-mli" || arg == "--multiline-input") {
        params.multiline_input = true;
        return true;
    }
@@ -2070,7 +2070,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != -1) {
+            tmp.push_back(bos);
+        }
+        tmp.push_back(eos);
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == -1) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_past_clear(lctx);
        llama_synchronize(lctx);