From fe8d4df76b2dc738a1494fd7ad430e23df7a5cab Mon Sep 17 00:00:00 2001
From: mgroeber9110
Date: Wed, 29 Jan 2025 20:27:37 +0100
Subject: [PATCH] Correctly identify LF token for GPT-2 style BPE tokenizer

---
 src/llama-vocab.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0782d3a41..345abe77a 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1687,7 +1687,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             linefeed_id = ids[0];
         } else {
-            const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+            const std::vector<int> ids = tokenize("\n", false);
 
             //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
             if (ids.empty()) {