From 7cac72a80b5314e2bacab7971acd337f41863088 Mon Sep 17 00:00:00 2001
From: Layl Bongers <3094382+LaylBongers@users.noreply.github.com>
Date: Fri, 12 Apr 2024 16:28:54 +0200
Subject: [PATCH] Do not use special tokens when matching in RWKV tokenizer

---
 src/llama-vocab.cpp | 5 +++++
 src/llama.cpp       | 8 ++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 30db1a042..0c9e57215 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1170,6 +1170,11 @@ struct llm_tokenizer_rwkv {
        while (position < text.size()) {
            // Iterate through possible tokens backwards, starting with the largest
            for (int32_t i = (int32_t)tokens.size() - 1; i >= 0; i--) {
+               // Skip tokens that aren't normal type; we can't match on those
+               if (vocab.id_to_token[i].attr != LLAMA_TOKEN_ATTR_NORMAL) {
+                   continue;
+               }
+
                uint32_t token_size = tokens[i].size();

                // If there's not enough left for this token
diff --git a/src/llama.cpp b/src/llama.cpp
index b19b1cce4..50a7d5ff3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5931,8 +5931,8 @@ static void llm_load_vocab(
            vocab.type = LLAMA_VOCAB_TYPE_RWKV;

            // default special tokens
-           vocab.special_bos_id = 0;
-           vocab.special_eos_id = 0;
+           vocab.special_bos_id = -1;
+           vocab.special_eos_id = -1;
            vocab.special_unk_id = -1;
            vocab.special_sep_id = -1;
            vocab.special_pad_id = -1;
@@ -8223,6 +8223,10 @@ static bool llm_load_tensors(
                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                }
            } break;
+       case LLM_ARCH_RWKV:
+           {
+               model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+           } break;
        default:
            throw std::runtime_error("unknown architecture");
    }
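
A note on the tokenizer change: the RWKV world tokenizer finds tokens by greedy longest match against the raw input bytes, and special tokens (BOS/EOS, control, padding) have no meaningful surface text to match, so leaving them in the candidate set risks spurious matches. Below is a minimal, self-contained sketch of that greedy loop with the skip in place; the `Token` struct, `is_special` flag, and `tokenize` helper are simplified stand-ins for illustration, not llama.cpp's actual types.

```cpp
#include <cstdio>
#include <string>
#include <vector>

struct Token {
    std::string text;  // literal bytes this token matches in the input
    bool is_special;   // special tokens carry no real surface form
};

// Greedy longest match: at each position take the longest token that matches,
// iterating backwards on the assumption that longer entries sit at higher indices.
static std::vector<int> tokenize(const std::string & text, const std::vector<Token> & tokens) {
    std::vector<int> out;
    size_t position = 0;
    while (position < text.size()) {
        bool matched = false;
        // Iterate through possible tokens backwards, starting with the largest
        for (int i = (int)tokens.size() - 1; i >= 0; i--) {
            // The fix: special tokens are never matched against raw text
            if (tokens[i].is_special) {
                continue;
            }
            const std::string & tok = tokens[i].text;
            if (text.compare(position, tok.size(), tok) == 0) {
                out.push_back(i);
                position += tok.size();
                matched = true;
                break;
            }
        }
        if (!matched) {
            position++; // unmatchable byte; the real tokenizer handles this differently
        }
    }
    return out;
}

int main() {
    // Index 0 stands in for a special token: with the skip above, its stored
    // text can never shadow a genuine match in the input.
    const std::vector<Token> vocab = {
        {"<eos>", true}, {"a", false}, {"b", false}, {"ab", false},
    };
    for (int id : tokenize("abab", vocab)) {
        std::printf("%d ", id); // prints "3 3": the longest match "ab", twice
    }
    std::printf("\n");
    return 0;
}
```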
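
On the vocab defaults: in llama.cpp, a special token id of -1 conventionally means "this model has no such token", and the patch makes RWKV vocabs declare neither BOS nor EOS by default. The old default of 0 pointed at a real vocab entry, so any caller that prepends BOS whenever the id is set would silently inject token 0. A hedged sketch of the kind of guard that consumes these ids (the helper name is hypothetical):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Hypothetical caller-side guard: with special_bos_id defaulting to -1,
// RWKV tokenization no longer prepends vocab entry 0 as a fake BOS.
static void maybe_prepend_bos(std::vector<llama_token> & out, llama_token special_bos_id) {
    if (special_bos_id != -1) {
        out.insert(out.begin(), special_bos_id);
    }
}
```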