sentencepiece bpe compatible tokenizer (#252)
* potential out of bounds read
* fix quantize
* style
* Update convert-pth-to-ggml.py
* mild cleanup
* don't need the space-prefixing here right now since main.cpp already does it
* new file magic + version header field
* readme notice
* missing newlines

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
This commit is contained in:
parent
5cb63e2493
commit
074bea2eb1
7 changed files with 180 additions and 44 deletions
|
@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
|
|||
|
||||
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
values = [
|
||||
0x67676d6c, # magic: ggml in hex
|
||||
0x67676d66, # magic: ggmf in hex
|
||||
1, # file version
|
||||
*[hparams[key] for key in keys],
|
||||
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
||||
ftype
|
||||
|
@@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
|
|||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def process_and_write_variables(fout, model, ftype):
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue