llama : add support for BitnetForCausalLM (#7931)
* hf bitnet v1 * hf bitnet e2e v2 * finish bitnet e2e * finish f16 hf bitnet e2e * remove unused * finish bitnet i2 e2e * move i2s to quantize v1 * move i2 to quantize * clean code * clean code 2 * fix codestyle * fix code * fix * fix code * fix merge * remove unused * change table name * fix whitespace * delete redundant * i2_s to absmax * finish i2_s/i8_s vec_dot x86 simd * i2s->q22 * fix code * remove block scale * add dequantize * fix seq * update avx2 * remove q2_2 * remove q22_grid * fix whitespace * reuse llm_build_kv * fix bo --------- Co-authored-by: root <root@wangjinheng>
This commit is contained in:
parent
6a2f298bd7
commit
e112b610a1
4 changed files with 307 additions and 1 deletions
|
@ -1404,6 +1404,48 @@ class LlamaModel(Model):
|
|||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("BitnetForCausalLM")
class BitnetModel(Model):
    """Converter for checkpoints registered as ``BitnetForCausalLM``.

    Attention and feed-forward projection weights are reduced to their sign
    (-1, 0 or +1) and emitted together with a one-element scale tensor; every
    other tensor is passed through unchanged.
    """

    model_arch = gguf.MODEL_ARCH.BITNET

    def set_vocab(self):
        """Delegate vocabulary setup to ``_set_vocab_sentencepiece``."""
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
        """Write the base parameters, then record linear RoPE scaling with factor 1.0."""
        super().set_gguf_parameters()
        writer = self.gguf_writer
        writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        writer.add_rope_scaling_factor(1.0)

    def weight_quant(self, weight):
        """Split ``weight`` into a sign tensor and a per-tensor scale.

        Returns a ``(signs, scale)`` pair: ``signs`` is cast back to the dtype
        of the input and holds the sign of the rounded weights, ``scale`` is a
        one-element float32 tensor.
        """
        orig_dtype = weight.dtype
        w = weight.float()

        # Inverse of the mean absolute value; the clamp keeps the division finite.
        inv_mean = 1 / w.abs().mean().clamp(min=1e-5)

        # Round to {-1, 0, 1} in units of the mean magnitude, then map back.
        snapped = (w * inv_mean).round().clamp(-1, 1) / inv_mean

        # Largest magnitude left after rounding, kept as a 1-element tensor.
        scale = snapped.abs().max().unsqueeze(0)

        # Force near-zero entries to exactly zero before taking the sign.
        cleaned = torch.where(snapped.abs().less(1e-6), 0, snapped).type(orig_dtype)
        signs = torch.sign(cleaned)

        return signs, scale.type(torch.float32)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the output tensor(s) for one input tensor.

        Projection weights listed below are replaced by their sign tensor and
        followed by a companion ``.scale`` tensor; anything else is yielded
        as-is under its mapped name.
        """
        mapped = self.map_tensor_name(name)

        ternary_kinds = (
            gguf.MODEL_TENSOR.ATTN_Q,
            gguf.MODEL_TENSOR.ATTN_K,
            gguf.MODEL_TENSOR.ATTN_V,
            gguf.MODEL_TENSOR.ATTN_OUT,
            gguf.MODEL_TENSOR.FFN_UP,
            gguf.MODEL_TENSOR.FFN_DOWN,
            gguf.MODEL_TENSOR.FFN_GATE,
        )
        is_ternary = any(
            self.match_model_tensor_name(mapped, kind, bid) for kind in ternary_kinds
        )

        if not is_ternary:
            yield (mapped, data_torch)
            return

        # Reduce the weight to 1/0/-1 (in its original dtype) plus a float32 scale.
        signs, scale = self.weight_quant(data_torch)
        yield (mapped, signs)
        yield (mapped.removesuffix(".weight") + ".scale", scale)
|
||||
|
||||
|
||||
@Model.register("GrokForCausalLM")
|
||||
class GrokModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.GROK
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue