convert : various script cleanups/fixes + merges and special token handling (#2842)
* convert: Fix permute calls and method/func definitions
* Cleanups for gguf-py
* Minor types cleanups.
* Initial implementation of handling merges and special tokens
* convert: Handle special tokens and merges in vocab only mode
  convert: Vocab only mode no longer requires loading model tensors
* gguf: Refactor tensor name mapping
* convert: Fix type hint for special_token_types in SpecialVocab
* Use common special vocab handling in various conversion scripts
* First pass at implementing suggested changes
* Second pass
* gguf: SpecialVocab: Fix issue with special token content not in a dict
  gguf: SpecialVocab: Allow skipping handling of merges
* convert-falcon-hf-to-gguf: Support --vocab-only option, bail out if no tokenizer.json
* convert-gptneox-hf-to-gguf and convert: Only handle merges for BPE tokenizer
* gguf: SpecialVocab: Actually set load_merges in object
* Uniform args parsing and vocab only mode for convert examples
* convert.py: Set gpt2 as tokenizer model when using BPE
* Squish last type warning in gguf.py - yay!
parent ad9ddcff6e
commit dc07dc492e
10 changed files with 728 additions and 748 deletions
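The central piece of this change set is the shared SpecialVocab helper in gguf-py, so the individual converters no longer hand-roll special-token and merges handling. Below is a minimal sketch of how a converter might drive it, based only on the names in the commit message; the exact gguf-py signatures, the output filename, and the "llama" architecture string are assumptions, not taken from this diff.

from pathlib import Path

import gguf

# Hypothetical model directory and output file, for illustration only.
model_dir = Path("models/example-hf-model")
writer = gguf.GGUFWriter("example-vocab.gguf", "llama")

# Per this change, BPE vocabs are written with the "gpt2" tokenizer model;
# SentencePiece-based models keep "llama".
writer.add_tokenizer_model("gpt2")

# Merges only exist for BPE tokenizers, which is why convert.py and the
# gptneox converter gate load_merges on the tokenizer type.
special_vocab = gguf.SpecialVocab(model_dir, load_merges=True)
special_vocab.add_to_gguf(writer)  # emits merges plus bos/eos/unk/sep/pad token ids

Because this path never touches model tensors, the same code serves --vocab-only mode in convert.py and in the falcon/gptneox example converters.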
@@ -4,7 +4,7 @@ import os
 import re
 import struct
 import sys
-from typing import Any, Dict, Sequence, TextIO
+from typing import Any, Dict, Sequence, BinaryIO
 
 import numpy as np
 import torch
@@ -46,7 +46,7 @@ def translate_tensor_name(t: str) -> str:
         sys.exit(1)
 
 
-def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
+def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None:
     fout.write(b"ggla"[::-1])  # magic (ggml lora)
     fout.write(struct.pack("i", 1))  # file version
     fout.write(struct.pack("i", params["r"]))
@@ -60,7 +60,7 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
 
 
 def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype
+    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
 ) -> None:
     sname = name.encode("utf-8")
     fout.write(
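The hunks above (apparently from the LoRA converter, given the "ggla" magic) are type-hint corrections rather than behaviour changes: the writer emits raw bytes via struct.pack, so the stream must be annotated as BinaryIO (a file opened with "wb") rather than TextIO, and np.dtype is parameterized as np.dtype[Any] to satisfy stricter numpy stubs. A self-contained sketch of the fragment shown, using a hypothetical output path; the real script writes more header fields than this:

import struct
from typing import Any, BinaryIO, Dict


def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None:
    # Binary stream required: this writes raw bytes, not text.
    fout.write(b"ggla"[::-1])                  # magic (ggml lora)
    fout.write(struct.pack("i", 1))            # file version
    fout.write(struct.pack("i", params["r"]))  # LoRA rank (only part of the real header)


with open("adapter.tmp.bin", "wb") as fout:    # hypothetical output file
    write_file_header(fout, {"r": 8})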