diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 63710676b..164ac105f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1655,9 +1655,9 @@ class GPT2Model(Model):
             self.gguf_writer.add_tensor(new_name, data)
 
             # note: GPT2 output is tied to (same as) wte in original model
-            if new_name == "token_embd.weight":
-                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-                self.gguf_writer.add_tensor("output.weight", data)
+            # if new_name == "token_embd.weight":
+            #     print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            #     self.gguf_writer.add_tensor("output.weight", data)
 
 
 @Model.register("PhiForCausalLM")
diff --git a/llama.cpp b/llama.cpp
index 83dd55efa..dd8865fc8 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11146,8 +11146,11 @@ struct llm_tokenizer_bpe {
             const std::string str = std::string(symbol.text, symbol.n);
             const auto token = vocab.token_to_id.find(str);
-
-            if (token == vocab.token_to_id.end()) {
+            // U+000A -> Ċ -> [UNK] (line feed, use [SEP] instead)
+            // U+0020 -> Ġ -> [UNK] (space, use U+3000 instead)
+            if (str == "Ċ" || str == "Ġ") {
+                output.push_back(vocab.token_to_id.at("[UNK]"));
+            } else if (token == vocab.token_to_id.end()) {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
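
For clarity, below is a minimal standalone sketch of the lookup order the llama.cpp hunk establishes: first special-case the byte-level BPE symbols `Ċ` (U+000A) and `Ġ` (U+0020) to `[UNK]`, then try a direct vocab hit, then fall back to per-byte tokens. The helper name `append_token` and the plain `std::map` are illustrative stand-ins, not the actual llama.cpp types, which keep this logic inline inside `llm_tokenizer_bpe`.

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <vector>

using llama_token = int32_t;

// Hypothetical helper mirroring the patched fallback order, assuming the
// vocab has an explicit "[UNK]" entry (as in the BERT-style vocab targeted
// by this patch). Byte-level BPE renders U+000A as "Ċ" and U+0020 as "Ġ";
// when the vocab has no entry for them, map them to [UNK] up front.
static void append_token(const std::map<std::string, llama_token> & token_to_id,
                         const std::string & str,
                         std::vector<llama_token> & output) {
    if (str == "Ċ" || str == "Ġ") {
        output.push_back(token_to_id.at("[UNK]")); // special-case LF / space
        return;
    }
    const auto token = token_to_id.find(str);
    if (token != token_to_id.end()) {
        output.push_back(token->second); // direct vocab hit
        return;
    }
    // otherwise decompose into single bytes, as the surrounding loop does
    for (const char c : str) {
        const auto byte_token = token_to_id.find(std::string(1, c));
        if (byte_token != token_to_id.end()) {
            output.push_back(byte_token->second);
        }
    }
}
```

Note that `.at("[UNK]")` throws if the vocab lacks that token, so the special case only makes sense for vocabs that are guaranteed to define it; the `else if` chaining in the patch preserves the original end-of-map byte fallback for every other unmatched symbol.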