common : don't crash if newline token is not found

2024-02-13 12:20:33 +02:00 · 2024-02-13 12:20:33 +02:00 · 93aed7595b
commit 93aed7595b
parent 037259be68
1 changed file with 6 additions and 1 deletion
--- a/llama.cpp
+++ b/llama.cpp
@@ -3314,7 +3314,12 @@ static void llm_load_vocab(

    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
-        vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        try {
+            vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            vocab.linefeed_id = vocab.special_pad_id;
+        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else {