finish f16 hf bitnet e2e
parent 1f2e0ee012
commit 5e59660173
10 changed files with 440 additions and 11 deletions

@@ -1421,7 +1421,31 @@ class BitnetModel(Model):
        result = (weight * s).round().clamp(-1, 1) / s
        return result.type(dtype)

    def transform_to_i2(self, x):
        from gguf.lazy import LazyNumpyTensor
        x = LazyNumpyTensor.to_eager(x)
        x_num = np.prod(x.shape)
        x = np.reshape(x, x_num)
        scale = 1
        for i in range(x_num):
            if x[i] != 0:
                scale = x[i]
                break
        x = np.divide(x, scale)
        x = x.astype(np.uint8)
        x = np.reshape(x, [x.shape[0] // 4, 4])
        keep_bit = {0:192, 1:48, 2:12, 3:3}
        ans = np.zeros([x_num // 4], dtype=np.uint8)
        for i in range(4):
            x_bit_col = x[:, i]
            x_bit_shift = np.left_shift(x_bit_col, 6 - i * 2)
            x_bit_shift = np.bitwise_and(x_bit_shift, keep_bit[i])
            ans = np.bitwise_or(ans, x_bit_shift)
        scale = np.tile(scale, 8)
        return ans, scale

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # quant weight to i2 (in fp16)
        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
                          "o_proj.weight")):
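Not part of the commit, but as a reading aid: the sketch below replays the packing scheme that transform_to_i2 implements on a tiny made-up weight block. Each group of four ternary values becomes one byte; element i of a group lands in bits (7-2i)..(6-2i), which is exactly what the keep_bit masks and the 6 - i * 2 shifts above enforce (0b00 = zero, 0b01 = +scale, 0b11 = -scale).

import numpy as np

# made-up ternary weights: every entry is 0 or +/- one shared magnitude
w = np.array([0.0, 0.7, -0.7, 0.7, -0.7, 0.0, 0.0, 0.7], dtype=np.float32)

scale = w[np.nonzero(w)[0][0]]                         # first non-zero entry, as in transform_to_i2
codes = (w / scale).astype(np.int8).astype(np.uint8)   # +1 -> 0x01, -1 -> 0xFF, 0 -> 0x00
groups = codes.reshape(-1, 4)                          # one row per output byte

keep_bit = {0: 192, 1: 48, 2: 12, 3: 3}                # same per-position masks as the diff
packed = np.zeros(groups.shape[0], dtype=np.uint8)
for i in range(4):
    shifted = np.left_shift(groups[:, i], 6 - i * 2)   # slide the 2-bit code into position
    packed = np.bitwise_or(packed, np.bitwise_and(shifted, keep_bit[i]))

print([format(int(b), "08b") for b in packed])         # ['00011101', '11000001']

The converter itself casts the float codes straight to uint8 and relies on -1 wrapping around to 0xFF; the int8 intermediate step here only makes that wrap explicit.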
@@ -1429,6 +1453,103 @@ class BitnetModel(Model):
        return [(self.map_tensor_name(name), data_torch)]

    def write_tensors(self):
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue

            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
            if data_torch.dtype not in (torch.float16, torch.float32):
                data_torch = data_torch.to(torch.float32)

            # use the first number-like part of the tensor name as the block id
            bid = None
            for part in name.split("."):
                if part.isdecimal():
                    bid = int(part)
                    break

            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                data: np.ndarray = data  # type hint
                data_shape = data.shape
                n_dims = len(data.shape)
                data_dtype = data.dtype
                data_qtype: gguf.GGMLQuantizationType | None = None

                # when both are True, f32 should win
                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)

                # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                extra_f32 = any(cond for cond in (
                    extra_f32,
                    n_dims == 1,
                    new_name.endswith("_norm.weight"),
                ))

                # Some tensor types are always in float32
                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
                    gguf.MODEL_TENSOR.FFN_GATE_INP,
                    gguf.MODEL_TENSOR.POS_EMBD,
                    gguf.MODEL_TENSOR.TOKEN_TYPES,
                ))

                # if f16 desired, convert any float32 2-dim weight tensors to float16
                extra_f16 = any(cond for cond in (
                    extra_f16,
                    (name.endswith(".weight") and n_dims >= 2),
                ))

                suit_i2 = True
                if name.endswith('embed_tokens.weight') or name.endswith('norm.weight'):
                    suit_i2 = False

                i2_scale = None
                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                    if self.ftype == gguf.LlamaFileType.MOSTLY_I2 and suit_i2:
                        data, i2_scale = self.transform_to_i2(data)
                        assert data.dtype == np.uint8
                        assert i2_scale.dtype == np.float32
                        data_qtype = gguf.GGMLQuantizationType.I2

                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                        data = gguf.quantize_bf16(data)
                        assert data.dtype == np.int16
                        data_qtype = gguf.GGMLQuantizationType.BF16

                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
                        data = gguf.quantize_q8_0(data)
                        assert data.dtype == np.uint8
                        data_qtype = gguf.GGMLQuantizationType.Q8_0

                    else:  # default to float16 for quantized tensors
                        if data_dtype != np.float16:
                            data = data.astype(np.float16)
                        data_qtype = gguf.GGMLQuantizationType.F16

                if data_qtype is None:  # by default, convert to float32
                    if data_dtype != np.float32:
                        data = data.astype(np.float32)
                    data_qtype = gguf.GGMLQuantizationType.F32

                shape = data_shape
                # shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
                # reverse shape to make it similar to the internal ggml dimension order
                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"

                # n_dims is implicit in the shape
                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

                self.gguf_writer.add_tensor(new_name, data, raw_shape=shape, raw_dtype=data_qtype)
                if i2_scale is not None:
                    self.gguf_writer.add_tensor(new_name + "_scale", i2_scale, raw_dtype=gguf.GGMLQuantizationType.F32)


@Model.register("GrokForCausalLM")
class GrokModel(Model):
    model_arch = gguf.MODEL_ARCH.GROK
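Also not part of the diff: a sketch of what the write path above stores for an I2 tensor and how a consumer could undo it. write_tensors() emits the packed uint8 codes under the mapped tensor name plus a companion F32 tensor named <name>_scale holding eight copies of the scale (the np.tile(scale, 8) in transform_to_i2). Assuming the two arrays have already been read back from the GGUF file, unpacking is roughly:

import numpy as np

def unpack_i2(packed: np.ndarray, scale: float, shape: tuple) -> np.ndarray:
    """Inverse of the 2-bit packing: codes 0b00/0b01/0b11 -> 0, +scale, -scale.
    Assumes prod(shape) is a multiple of 4, as the packing requires."""
    n = int(np.prod(shape))
    out = np.empty(n, dtype=np.float32)
    for i in range(4):                                   # element i lives in bits (7-2i)..(6-2i)
        code = (packed >> (6 - i * 2)) & 0x3
        out[i::4] = np.where(code == 3, -1.0, code.astype(np.float32))
    return (out * scale).reshape(shape)

# hypothetical usage; tensor names and shape are illustrative:
# weights = unpack_i2(packed_codes, float(scale_tensor[0]), (4096, 4096))

Note that transform_to_i2 takes the first non-zero element, sign included, as the scale, so a leading negative weight merely swaps which of +scale/-scale the 0b01 and 0b11 codes denote; multiplying by the stored (signed) scale still reproduces the original values.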
@@ -2804,7 +2925,7 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "i2", "auto"], default="f16",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
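With the new choice wired in, a conversion would presumably be invoked along the lines of `python convert-hf-to-gguf.py /path/to/bitnet-model --outtype i2` (the script name and path are assumptions, not taken from this page). The --outtype help string above still describes only the float and Q8_0 options, so the new i2 mode is advertised only through the choices list.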
@@ -2864,6 +2985,7 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "i2" : gguf.LlamaFileType.MOSTLY_I2,
        "auto": gguf.LlamaFileType.GUESSED,
    }
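This map entry is what ties the CLI flag to the Bitnet write path: main() resolves args.outtype through ftype_map and hands the resulting file type to the model instance, where it becomes the self.ftype that write_tensors() compares against MOSTLY_I2. Both gguf.LlamaFileType.MOSTLY_I2 and gguf.GGMLQuantizationType.I2 are presumably defined in the gguf-py package by other files in this 10-file commit; those hunks are not shown here.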