whisper : tokenizer fix + re-enable tokenizer test for LLaMa (#3096)

* Fix for #2721
* Re-enable tokenizer test for LLaMa
* Add `console.cpp` dependency
* Fix dependency to `common`
* Fixing wrong fix.
* Make console usage platform specific
  Work on compiler warnings.
* Adapting makefile
* Remove trailing whitespace
* Adapting the other parts of the makefile
* Fix typo.
2023-09-13 15:19:44 +02:00 · 2023-09-13 15:19:44 +02:00 · 71ca2fad7d
commit 71ca2fad7d
parent 1b6c650d16
6 changed files with 142 additions and 118 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -3121,10 +3121,9 @@ struct llm_tokenizer_spm {
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
            sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs += len;
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
@@ -6218,7 +6217,7 @@ int llama_tokenize_with_model(
    auto res = llama_tokenize_internal(model->vocab, text, add_bos);

    if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
    }