fix for use

Miwa / Ensan 2024-04-13 15:48:36 +09:00
parent ab9a3240a9
commit 8657b8fc94
2 changed files with 8 additions and 5 deletions


@@ -1655,9 +1655,9 @@ class GPT2Model(Model):
             self.gguf_writer.add_tensor(new_name, data)
             # note: GPT2 output is tied to (same as) wte in original model
-            if new_name == "token_embd.weight":
-                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-                self.gguf_writer.add_tensor("output.weight", data)
+            # if new_name == "token_embd.weight":
+            #     print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            #     self.gguf_writer.add_tensor("output.weight", data)
 @Model.register("PhiForCausalLM")
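
The hunk above stops exporting a duplicate output.weight tensor; as the remaining note says, GPT-2's LM head is the same tensor as wte in the original model. A minimal sketch of that tying, assuming the Hugging Face "gpt2" checkpoint and the transformers library (illustration only, not part of this commit):

# Illustration (assumption: the Hugging Face "gpt2" checkpoint is available).
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")

# The LM head reuses the embedding matrix, so both names point at the same
# storage; writing "output.weight" separately would only duplicate the data.
tied = model.lm_head.weight.data_ptr() == model.transformer.wte.weight.data_ptr()
print("lm_head tied to wte:", tied)  # expected: True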


@@ -11146,8 +11146,11 @@ struct llm_tokenizer_bpe {
             const std::string str = std::string(symbol.text, symbol.n);
             const auto token = vocab.token_to_id.find(str);
-            if (token == vocab.token_to_id.end()) {
+            // U+000A -> Ċ -> [UNK] (line feed, use [SEP] instead)
+            // U+0020 -> Ġ -> [UNK] (space, use U+3000 instead)
+            if (str == "Ċ" || str == "Ġ") {
+                output.push_back(vocab.token_to_id.at("[UNK]"));
+            } else if (token == vocab.token_to_id.end()) {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
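
For context on the tokenizer change: GPT-2-style byte-level BPE remaps raw bytes to printable characters before vocabulary lookup, which is why a bare space shows up as Ġ and a line feed as Ċ. A small sketch of that standard byte-to-unicode mapping (the usual GPT-2 convention, reproduced here for illustration; not code from this repository):

def bytes_to_unicode():
    # GPT-2's reversible byte -> printable-character map: bytes outside the
    # printable ranges (including space 0x20 and line feed 0x0A) are shifted up by 256.
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("¡"), ord("¬") + 1))
          + list(range(ord("®"), ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, map(chr, cs)))

mapping = bytes_to_unicode()
print(mapping[0x20], mapping[0x0A])  # prints: Ġ Ċ

So when a pre-tokenized symbol is exactly Ġ or Ċ and this vocabulary has no such token, the new branch substitutes [UNK] instead of falling through to the per-byte lookup below.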