Revert to commit 0614c33

This commit is contained in:
teleprint-me 2024-01-09 11:52:41 -05:00
parent c6af89e5ce
commit 29abd8d46c
No known key found for this signature in database
GPG key ID: B0D11345E65C4D48

View file

@ -48,9 +48,7 @@ except ModuleNotFoundError as e:
if "NO_LOCAL_GGUF" not in os.environ: if "NO_LOCAL_GGUF" not in os.environ:
# Use absolute path to the gguf-py directory # Use absolute path to the gguf-py directory
gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py") gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
print( print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed
gguf_py_dir
) # NOTE: Remove this once path is verified after changes are completed
if gguf_py_dir not in sys.path: if gguf_py_dir not in sys.path:
sys.path.insert(1, gguf_py_dir) sys.path.insert(1, gguf_py_dir)
@ -79,7 +77,6 @@ DEFAULT_CONCURRENCY = 8
# data types # data types
# #
# TODO: Clean up and refactor data types # TODO: Clean up and refactor data types
@dataclass(frozen=True) @dataclass(frozen=True)
class DataType: class DataType:
@ -96,16 +93,10 @@ class UnquantizedDataType(DataType):
pass pass
DT_F16 = UnquantizedDataType( DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
"F16", dtype=np.dtype(np.float16), valid_conversions=["F32", "Q8_0"] DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
) DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
DT_F32 = UnquantizedDataType( DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
"F32", dtype=np.dtype(np.float32), valid_conversions=["F16", "Q8_0"]
)
DT_I32 = UnquantizedDataType("I32", dtype=np.dtype(np.int16), valid_conversions=[])
DT_BF16 = UnquantizedDataType(
"BF16", dtype=np.dtype(np.uint16), valid_conversions=["F32", "F16", "Q8_0"]
)
@dataclass(frozen=True) @dataclass(frozen=True)
@ -115,12 +106,10 @@ class QuantizedDataType(DataType):
ggml_type: gguf.GGMLQuantizationType ggml_type: gguf.GGMLQuantizationType
def quantize(self, arr: NDArray) -> NDArray: def quantize(self, arr: NDArray) -> NDArray:
raise NotImplementedError(f"Quantization for {self.name} not implemented") raise NotImplementedError(f'Quantization for {self.name} not implemented')
def elements_to_bytes(self, n_elements: int) -> int: def elements_to_bytes(self, n_elements: int) -> int:
assert ( assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
n_elements % self.block_size == 0
), f"Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}"
return self.quantized_dtype.itemsize * (n_elements // self.block_size) return self.quantized_dtype.itemsize * (n_elements // self.block_size)
@ -128,47 +117,38 @@ class QuantizedDataType(DataType):
class Q8_0QuantizedDataType(QuantizedDataType): class Q8_0QuantizedDataType(QuantizedDataType):
# Mini Q8_0 quantization in Python! # Mini Q8_0 quantization in Python!
def quantize(self, arr: NDArray) -> NDArray: def quantize(self, arr: NDArray) -> NDArray:
assert ( assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
arr.size % self.block_size == 0 and arr.size != 0 assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
), f"Bad array size {arr.size}"
assert arr.dtype == np.float32, f"Bad array type {arr.dtype}"
n_blocks = arr.size // self.block_size n_blocks = arr.size // self.block_size
blocks = arr.reshape((n_blocks, self.block_size)) blocks = arr.reshape((n_blocks, self.block_size))
# Much faster implementation of block quantization contributed by @Cebtenzzre # Much faster implementation of block quantization contributed by @Cebtenzzre
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
d = abs(blocks).max(axis=1) / np.float32(127) d = abs(blocks).max(axis = 1) / np.float32(127)
with np.errstate(divide="ignore"): with np.errstate(divide = 'ignore'):
qs = (blocks / d[:, None]).round() qs = (blocks / d[:, None]).round()
qs[d == 0] = 0 qs[d == 0] = 0
yield from zip(d, qs) yield from zip(d, qs)
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
return np.fromiter(
quantize_blocks_q8_0(blocks), count=n_blocks, dtype=self.quantized_dtype
)
DT_Q8_0 = Q8_0QuantizedDataType( DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
"Q8_0", dtype = np.dtype(np.float32), valid_conversions = [],
dtype=np.dtype(np.float32), ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
valid_conversions=[], quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
ggml_type=gguf.GGMLQuantizationType.Q8_0,
block_size=32,
quantized_dtype=np.dtype([("d", "<f2"), ("qs", "i1", (32,))]),
)
# Quantized types skipped here because they may also map to np.float32 # Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {} NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32): for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE: if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
raise ValueError(f"Invalid duplicate data type {dt}") raise ValueError(f'Invalid duplicate data type {dt}')
NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
SAFETENSORS_DATA_TYPES: dict[str, DataType] = { SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
"BF16": DT_BF16, 'BF16': DT_BF16,
"F16": DT_F16, 'F16': DT_F16,
"F32": DT_F32, 'F32': DT_F32,
"I32": DT_I32, 'I32': DT_I32,
} }
# TODO: match this with `llama_ftype` # TODO: match this with `llama_ftype`
@ -177,8 +157,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
class GGMLFileType(enum.IntEnum): class GGMLFileType(enum.IntEnum):
AllF32 = 0 AllF32 = 0
MostlyF16 = 1 # except 1d tensors MostlyF16 = 1 # except 1d tensors
MostlyQ8_0 = 7 # except 1d tensors MostlyQ8_0 = 7 # except 1d tensors
def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
@ -190,8 +170,8 @@ class GGMLFileType(enum.IntEnum):
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
GGMLFileType.AllF32: DT_F32, GGMLFileType.AllF32 : DT_F32,
GGMLFileType.MostlyF16: DT_F16, GGMLFileType.MostlyF16 : DT_F16,
GGMLFileType.MostlyQ8_0: DT_Q8_0, GGMLFileType.MostlyQ8_0: DT_Q8_0,
} }
@ -586,13 +566,8 @@ class HfVocab:
token_text = reverse_vocab[token_id].encode("utf-8") token_text = reverse_vocab[token_id].encode("utf-8")
# Yield token text, score, and type # Yield token text, score, and type
yield ( yield token_text, self.get_token_score(token_id), self.get_token_type(
token_text, token_id, self.special_ids # Reuse already stored special IDs
self.get_token_score(token_id),
self.get_token_type(
token_id,
self.special_ids, # Reuse already stored special IDs
),
) )
def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
@ -642,43 +617,28 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv: if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv n_head = n_head_kv
return ( return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) .swapaxes(1, 2)
.swapaxes(1, 2) .reshape(weights.shape))
.reshape(weights.shape)
)
class Tensor(metaclass=ABCMeta): class Tensor(metaclass=ABCMeta):
data_type: DataType data_type: DataType
@abstractmethod @abstractmethod
def astype(self, data_type: DataType) -> Tensor: def astype(self, data_type: DataType) -> Tensor: ...
...
@abstractmethod @abstractmethod
def permute(self, n_head: int, n_head_kv: int) -> Tensor: def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
...
@abstractmethod @abstractmethod
def permute_part( def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
self, n_part: int, n_head: int, n_head_kv: int
) -> UnquantizedTensor:
...
@abstractmethod @abstractmethod
def part(self, n_part: int) -> UnquantizedTensor: def part(self, n_part: int) -> UnquantizedTensor: ...
...
@abstractmethod @abstractmethod
def to_ggml(self) -> GGMLCompatibleTensor: def to_ggml(self) -> GGMLCompatibleTensor: ...
...
def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
assert ( assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
bf16_arr.dtype == np.uint16
), f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
fp32_arr = bf16_arr.astype(np.uint32) << 16 fp32_arr = bf16_arr.astype(np.uint32) << 16
return fp32_arr.view(np.float32) return fp32_arr.view(np.float32)
@ -698,13 +658,9 @@ class UnquantizedTensor(Tensor):
def to_ggml(self) -> UnquantizedTensor: def to_ggml(self) -> UnquantizedTensor:
return self return self
def permute_part( def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
self, n_part: int, n_head: int, n_head_kv: int
) -> UnquantizedTensor:
r = self.ndarray.shape[0] // 3 r = self.ndarray.shape[0] // 3
return UnquantizedTensor( return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv)
)
def part(self, n_part: int) -> UnquantizedTensor: def part(self, n_part: int) -> UnquantizedTensor:
r = self.ndarray.shape[0] // 3 r = self.ndarray.shape[0] // 3
@ -714,9 +670,7 @@ class UnquantizedTensor(Tensor):
return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv)) return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
def load_unquantized( def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False
) -> NDArray:
tensor = lazy_tensor.load() tensor = lazy_tensor.load()
assert isinstance(tensor, UnquantizedTensor) assert isinstance(tensor, UnquantizedTensor)
@ -727,9 +681,7 @@ def load_unquantized(
if convert: if convert:
tensor.ndarray = tensor.ndarray.astype(expected_dtype) tensor.ndarray = tensor.ndarray.astype(expected_dtype)
else: else:
raise ValueError( raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
f"expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}"
)
return tensor.ndarray return tensor.ndarray
@ -747,9 +699,8 @@ class LazyTensor:
def load(self) -> Tensor: def load(self) -> Tensor:
ret = self._load() ret = self._load()
# Should be okay if it maps to the same numpy type? # Should be okay if it maps to the same numpy type?
assert ret.data_type == self.data_type or ( assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
self.data_type.dtype == ret.data_type.dtype (self.data_type, ret.data_type, self.description)
), (self.data_type, ret.data_type, self.description)
return ret return ret
def astype(self, data_type: DataType) -> LazyTensor: def astype(self, data_type: DataType) -> LazyTensor:
@ -757,29 +708,21 @@ class LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return self.load().astype(data_type) return self.load().astype(data_type)
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
return LazyTensor(
load, self.shape, data_type, f"convert({data_type}) {self.description}"
)
def validate_conversion_to(self, data_type: DataType) -> None: def validate_conversion_to(self, data_type: DataType) -> None:
if ( if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
data_type != self.data_type raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
and data_type.name not in self.data_type.valid_conversions
):
raise ValueError(
f"Cannot validate conversion from {self.data_type} to {data_type}."
)
LazyModel: TypeAlias = "dict[str, LazyTensor]" LazyModel: TypeAlias = 'dict[str, LazyTensor]'
@dataclass @dataclass
class ModelPlus: class ModelPlus:
model: LazyModel model: LazyModel
paths: list[Path] # Where this was read from. paths: list[Path] # Where this was read from.
format: Literal["ggml", "torch", "safetensors", "none"] format: Literal['ggml', 'torch', 'safetensors', 'none']
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
@ -797,11 +740,9 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
if len(lazy_tensors[0].shape) == 1: if len(lazy_tensors[0].shape) == 1:
# the tensor is just duplicated in every file # the tensor is just duplicated in every file
return lazy_tensors[0] return lazy_tensors[0]
if ( if name.startswith('tok_embeddings.') or \
name.startswith("tok_embeddings.") name.endswith('.attention.wo.weight') or \
or name.endswith(".attention.wo.weight") name.endswith('.feed_forward.w2.weight'):
or name.endswith(".feed_forward.w2.weight")
):
# split by columns # split by columns
axis = 1 axis = 1
else: else:
@ -814,16 +755,8 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
concatenated: NDArray = np.concatenate(ndarrays, axis=axis) concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
return UnquantizedTensor(concatenated) return UnquantizedTensor(concatenated)
description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
description = ( return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
"concatenated[["
+ "] | [".join(lt.description for lt in lazy_tensors)
+ "]]"
)
return LazyTensor(
load, concatenated_shape, lazy_tensors[0].data_type, description
)
return {name: convert(name) for name in names} return {name: convert(name) for name in names}
@ -853,38 +786,23 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return lazy_tensor.load().permute(n_head, n_head_kv) return lazy_tensor.load().permute(n_head, n_head_kv)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
return LazyTensor(
load,
lazy_tensor.shape,
lazy_tensor.data_type,
f"permute({n_head}, {n_head_kv}) " + lazy_tensor.description,
)
def permute_part_lazy( def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int
) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
s = lazy_tensor.shape.copy() s = lazy_tensor.shape.copy()
s[0] = s[0] // 3 s[0] = s[0] // 3
return LazyTensor( return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
load,
s,
lazy_tensor.data_type,
f"permute({n_head}, {n_head_kv}) " + lazy_tensor.description,
)
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor: def load() -> Tensor:
return lazy_tensor.load().part(n_part) return lazy_tensor.load().part(n_part)
s = lazy_tensor.shape.copy() s = lazy_tensor.shape.copy()
s[0] = s[0] // 3 s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, "part " + lazy_tensor.description) return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
# Functionality that simulates `torch.load` but where individual tensors are # Functionality that simulates `torch.load` but where individual tensors are
@ -914,11 +832,11 @@ class LazyUnpickler(pickle.Unpickler):
self.zip_file = zip_file self.zip_file = zip_file
def persistent_load(self, pid: Any) -> Any: def persistent_load(self, pid: Any) -> Any:
assert pid[0] == "storage" assert pid[0] == 'storage'
assert isinstance(pid[1], LazyStorageKind) assert isinstance(pid[1], LazyStorageKind)
data_type = pid[1].data_type data_type = pid[1].data_type
filename_stem = pid[2] filename_stem = pid[2]
filename = f"{self.data_base_path}/{filename_stem}" filename = f'{self.data_base_path}/{filename_stem}'
info = self.zip_file.getinfo(filename) info = self.zip_file.getinfo(filename)
def load(offset: int, elm_count: int) -> NDArray: def load(offset: int, elm_count: int) -> NDArray:
@ -929,31 +847,18 @@ class LazyUnpickler(pickle.Unpickler):
data = fp.read(size) data = fp.read(size)
assert len(data) == size assert len(data) == size
return np.frombuffer(data, dtype) return np.frombuffer(data, dtype)
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
description = f"storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}"
return LazyStorage(load=load, kind=pid[1], description=description) return LazyStorage(load=load, kind=pid[1], description=description)
@staticmethod @staticmethod
def lazy_rebuild_tensor_v2( def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
storage: Any, requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
storage_offset: Any,
size: Any,
stride: Any,
requires_grad: Any,
backward_hooks: Any,
metadata: Any = None,
) -> LazyTensor:
assert isinstance(storage, LazyStorage) assert isinstance(storage, LazyStorage)
def load() -> UnquantizedTensor: def load() -> UnquantizedTensor:
elm_count = stride[0] * size[0] elm_count = stride[0] * size[0]
return UnquantizedTensor( return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
storage.load(storage_offset, elm_count).reshape(size) description = f'pickled storage_offset={storage_offset} in {storage.description}'
)
description = (
f"pickled storage_offset={storage_offset} in {storage.description}"
)
return LazyTensor(load, list(size), storage.kind.data_type, description) return LazyTensor(load, list(size), storage.kind.data_type, description)
@staticmethod @staticmethod
@ -977,56 +882,47 @@ class LazyUnpickler(pickle.Unpickler):
} }
def find_class(self, module: str, name: str) -> Any: def find_class(self, module: str, name: str) -> Any:
if not module.startswith("torch"): if not module.startswith('torch'):
return super().find_class(module, name) return super().find_class(module, name)
return self.CLASSES[(module, name)] return self.CLASSES[(module, name)]
def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
zf = zipfile.ZipFile(outer_fp) zf = zipfile.ZipFile(outer_fp)
pickle_paths = [name for name in zf.namelist() if name.endswith(".pkl")] pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
assert len(pickle_paths) == 1, pickle_paths assert len(pickle_paths) == 1, pickle_paths
pickle_fp = zf.open(pickle_paths[0], "r") pickle_fp = zf.open(pickle_paths[0], 'r')
unpickler = LazyUnpickler( unpickler = LazyUnpickler(pickle_fp,
pickle_fp, data_base_path=pickle_paths[0][:-4], zip_file=zf data_base_path=pickle_paths[0][:-4],
) zip_file=zf)
model = unpickler.load() model = unpickler.load()
if "model" in model: if 'model' in model: model = model['model']
model = model["model"]
as_dict = dict(model.items()) as_dict = dict(model.items())
return ModelPlus(model=as_dict, paths=[path], format="torch", vocab=None) return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
(header_size,) = struct.unpack("<Q", fp.read(8)) header_size, = struct.unpack('<Q', fp.read(8))
header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size)) header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
# Use mmap for the actual data to avoid race conditions with the file offset. # Use mmap for the actual data to avoid race conditions with the file offset.
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)) mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
byte_buf = mapped[8 + header_size :] byte_buf = mapped[8 + header_size:]
def convert(info: dict[str, Any]) -> LazyTensor: def convert(info: dict[str, Any]) -> LazyTensor:
data_type = SAFETENSORS_DATA_TYPES[info["dtype"]] data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
numpy_dtype = data_type.dtype numpy_dtype = data_type.dtype
shape: list[int] = info["shape"] shape: list[int] = info['shape']
begin, end = info["data_offsets"] begin, end = info['data_offsets']
assert 0 <= begin <= end <= len(byte_buf) assert 0 <= begin <= end <= len(byte_buf)
assert end - begin == math.prod(shape) * numpy_dtype.itemsize assert end - begin == math.prod(shape) * numpy_dtype.itemsize
buf = byte_buf[begin:end] buf = byte_buf[begin:end]
def load() -> UnquantizedTensor: def load() -> UnquantizedTensor:
return UnquantizedTensor( return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
np.frombuffer(buf, dtype=numpy_dtype).reshape(shape) description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
)
description = (
f"safetensors begin={begin} end={end} type={data_type} path={path}"
)
return LazyTensor(load, shape, data_type, description) return LazyTensor(load, shape, data_type, description)
model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
model = { return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
name: convert(info) for (name, info) in header.items() if name != "__metadata__"
}
return ModelPlus(model=model, paths=[path], format="safetensors", vocab=None)
def must_read(fp: IO[bytes], length: int) -> bytes: def must_read(fp: IO[bytes], length: int) -> bytes:
@ -1038,34 +934,28 @@ def must_read(fp: IO[bytes], length: int) -> bytes:
@functools.lru_cache(maxsize=None) @functools.lru_cache(maxsize=None)
def lazy_load_file(path: Path) -> ModelPlus: def lazy_load_file(path: Path) -> ModelPlus:
fp = open(path, "rb") fp = open(path, 'rb')
first8 = fp.read(8) first8 = fp.read(8)
fp.seek(0) fp.seek(0)
if first8[:2] == b"PK": if first8[:2] == b'PK':
# A zip file, i.e. PyTorch format # A zip file, i.e. PyTorch format
return lazy_load_torch_file(fp, path) return lazy_load_torch_file(fp, path)
elif struct.unpack("<Q", first8)[0] < 16 * 1024 * 1024: elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
# Probably safetensors # Probably safetensors
return lazy_load_safetensors_file(fp, path) return lazy_load_safetensors_file(fp, path)
else: else:
raise ValueError(f"unknown format: {path}") raise ValueError(f"unknown format: {path}")
In = TypeVar("In") In = TypeVar('In')
Out = TypeVar("Out") Out = TypeVar('Out')
def bounded_parallel_map( def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
func: Callable[[In], Out], '''Parallel map, but with backpressure. If the caller doesn't call `next`
iterable: Iterable[In],
concurrency: int,
max_workers: int | None = None,
use_processpool_executor: bool = False,
) -> Iterable[Out]:
"""Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than fast enough, this will stop calling `func` at some point rather than
letting results pile up in memory. Specifically, there is a max of one letting results pile up in memory. Specifically, there is a max of one
output value buffered per thread.""" output value buffered per thread.'''
if concurrency < 2: if concurrency < 2:
yield from map(func, iterable) yield from map(func, iterable)
# Not reached. # Not reached.
@ -1075,7 +965,7 @@ def bounded_parallel_map(
executor_class = ProcessPoolExecutor executor_class = ProcessPoolExecutor
else: else:
executor_class = ThreadPoolExecutor executor_class = ThreadPoolExecutor
with executor_class(max_workers=max_workers) as executor: with executor_class(max_workers = max_workers) as executor:
futures: list[concurrent.futures.Future[Out]] = [] futures: list[concurrent.futures.Future[Out]] = []
done = False done = False
for _ in range(concurrency): for _ in range(concurrency):
@ -1342,31 +1232,23 @@ class OutputFile:
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[ wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"
].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32 return GGMLFileType.AllF32
if output_type_str == "f16" or ( if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
output_type_str is None and wq_type in (DT_F16, DT_BF16)
):
return GGMLFileType.MostlyF16 return GGMLFileType.MostlyF16
if output_type_str == "q8_0": if output_type_str == "q8_0":
return GGMLFileType.MostlyQ8_0 return GGMLFileType.MostlyQ8_0
name_to_type = { name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()
}
raise Exception(f"Unexpected combination of types: {name_to_type}") raise Exception(f"Unexpected combination of types: {name_to_type}")
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return { return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()}
for (name, tensor) in model.items()
}
def convert_model_names(model: LazyModel, params: Params) -> LazyModel: def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
@ -1379,43 +1261,21 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
for i in itertools.count(): for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model: if f"model.layers.{i}.self_attn.q_proj.weight" in model:
print(f"Permuting layer {i}") print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy( tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
model[f"model.layers.{i}.self_attn.q_proj.weight"], tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
params.n_head,
params.n_head,
)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(
model[f"model.layers.{i}.self_attn.k_proj.weight"],
params.n_head,
params.n_head_kv,
)
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model: elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}") print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy( tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
model[f"model.layers.{i}.self_attn.W_pack.weight"], tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
0, tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
params.n_head,
params.n_head,
)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(
model[f"model.layers.{i}.self_attn.W_pack.weight"],
1,
params.n_head,
params.n_head_kv,
)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy(
model[f"model.layers.{i}.self_attn.W_pack.weight"], 2
)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
else: else:
break break
out: LazyModel = {} out: LazyModel = {}
for name, lazy_tensor in model.items(): for name, lazy_tensor in model.items():
tensor_type, name_new = tmap.get_type_and_name( tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
name, try_suffixes=(".weight", ".bias")
) or (None, None)
if name_new is None: if name_new is None:
raise Exception(f"Unexpected tensor name: {name}") raise Exception(f"Unexpected tensor name: {name}")
@ -1423,26 +1283,24 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
print(f"skipping tensor {name_new}") print(f"skipping tensor {name_new}")
continue continue
print( print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}"
)
out[name_new] = lazy_tensor out[name_new] = lazy_tensor
return out return out
def nth_multifile_path(path: Path, n: int) -> Path | None: def nth_multifile_path(path: Path, n: int) -> Path | None:
"""Given any path belonging to a multi-file model (e.g. foo.bin.1), return '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model. the nth path in the model.
""" '''
# Support the following patterns: # Support the following patterns:
patterns: list[tuple[str, str]] = [ patterns: list[tuple[str, str]] = [
# - x.00.pth, x.01.pth, etc. # - x.00.pth, x.01.pth, etc.
(r"\.[0-9]{2}\.pth$", f".{n:02}.pth"), (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
(r"-[0-9]{5}-of-(.*)$", rf"-{n:05}-of-\1"), (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
# x.bin, x.bin.1, etc. # x.bin, x.bin.1, etc.
(r"(\.[0-9]+)?$", r"\1" if n == 0 else rf"\1.{n}"), (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
] ]
for regex, replacement in patterns: for regex, replacement in patterns:
if re.search(regex, path.name): if re.search(regex, path.name):
@ -1453,9 +1311,9 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
def find_multifile_paths(path: Path) -> list[Path]: def find_multifile_paths(path: Path) -> list[Path]:
"""Given any path belonging to a multi-file model (e.g. foo.bin.1), return '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the whole list of paths in the model. the whole list of paths in the model.
""" '''
ret: list[Path] = [] ret: list[Path] = []
for i in itertools.count(): for i in itertools.count():
nth_path = nth_multifile_path(path, i) nth_path = nth_multifile_path(path, i)
@ -1471,7 +1329,7 @@ def find_multifile_paths(path: Path) -> list[Path]:
def load_some_model(path: Path) -> ModelPlus: def load_some_model(path: Path) -> ModelPlus:
"""Load a model of any supported format.""" '''Load a model of any supported format.'''
# Be extra-friendly and accept either a file or a directory: # Be extra-friendly and accept either a file or a directory:
if path.is_dir(): if path.is_dir():
# Check if it's a set of safetensors files first # Check if it's a set of safetensors files first
@ -1479,19 +1337,12 @@ def load_some_model(path: Path) -> ModelPlus:
files = [file for glob in globs for file in path.glob(glob)] files = [file for glob in globs for file in path.glob(glob)]
if not files: if not files:
# Try the PyTorch patterns too, with lower priority # Try the PyTorch patterns too, with lower priority
globs = [ globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
"consolidated.00.pth",
"pytorch_model-00001-of-*.bin",
"*.pt",
"pytorch_model.bin",
]
files = [file for glob in globs for file in path.glob(glob)] files = [file for glob in globs for file in path.glob(glob)]
if not files: if not files:
raise Exception(f"Can't find model in directory {path}") raise Exception(f"Can't find model in directory {path}")
if len(files) > 1: if len(files) > 1:
raise Exception( raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
f"Found multiple models in {path}, not sure which to pick: {files}"
)
path = files[0] path = files[0]
paths = find_multifile_paths(path) paths = find_multifile_paths(path)