fix for use
This commit is contained in:
parent
ab9a3240a9
commit
8657b8fc94
2 changed files with 8 additions and 5 deletions
|
@ -1655,9 +1655,9 @@ class GPT2Model(Model):
|
||||||
self.gguf_writer.add_tensor(new_name, data)
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
# note: GPT2 output is tied to (same as) wte in original model
|
# note: GPT2 output is tied to (same as) wte in original model
|
||||||
if new_name == "token_embd.weight":
|
# if new_name == "token_embd.weight":
|
||||||
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
# print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||||
self.gguf_writer.add_tensor("output.weight", data)
|
# self.gguf_writer.add_tensor("output.weight", data)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("PhiForCausalLM")
|
@Model.register("PhiForCausalLM")
|
||||||
|
|
|
@ -11146,8 +11146,11 @@ struct llm_tokenizer_bpe {
|
||||||
|
|
||||||
const std::string str = std::string(symbol.text, symbol.n);
|
const std::string str = std::string(symbol.text, symbol.n);
|
||||||
const auto token = vocab.token_to_id.find(str);
|
const auto token = vocab.token_to_id.find(str);
|
||||||
|
// U+000A -> Ċ -> [UNK] (line feed, use [SEP] instead)
|
||||||
if (token == vocab.token_to_id.end()) {
|
// U+0020 -> Ġ -> [UNK] (space, use U+3000 instead)
|
||||||
|
if (str == "Ċ" || str == "Ġ") {
|
||||||
|
output.push_back(vocab.token_to_id.at("[UNK]"));
|
||||||
|
} else if (token == vocab.token_to_id.end()) {
|
||||||
for (auto j = str.begin(); j != str.end(); ++j) {
|
for (auto j = str.begin(); j != str.end(); ++j) {
|
||||||
std::string byte_str(1, *j);
|
std::string byte_str(1, *j);
|
||||||
auto token_multibyte = vocab.token_to_id.find(byte_str);
|
auto token_multibyte = vocab.token_to_id.find(byte_str);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue