convert : use utf8 encoding
This commit is contained in:
parent
8843a98c2b
commit
5a91d63d8b
1 changed file with 2 additions and 2 deletions
|
@@ -128,7 +128,7 @@ for model in models:
|
||||||
print(f"chkhsh: {chkhsh}")
|
print(f"chkhsh: {chkhsh}")
|
||||||
|
|
||||||
# print the "pre_tokenizer" content from the tokenizer.json
|
# print the "pre_tokenizer" content from the tokenizer.json
|
||||||
with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
|
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
|
||||||
cfg = json.load(f)
|
cfg = json.load(f)
|
||||||
pre_tokenizer = cfg["pre_tokenizer"]
|
pre_tokenizer = cfg["pre_tokenizer"]
|
||||||
print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
|
print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
|
||||||
|
@@ -249,7 +249,7 @@ for model in models:
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||||
|
|
||||||
with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
|
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
|
||||||
for text in tests:
|
for text in tests:
|
||||||
f.write(f"{text}")
|
f.write(f"{text}")
|
||||||
f.write("\n__ggml_vocab_test__\n")
|
f.write("\n__ggml_vocab_test__\n")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue