truncate intermediate fp32 if converting bf16 to bf16
parent 6a52bfe332
commit 46054d1aab

1 changed file with 2 additions and 2 deletions
@@ -295,8 +295,8 @@ class Model:
         if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
             if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                data = gguf.quantize_bf16(data)
-                assert data.dtype == np.int16
+                data = gguf.truncate_bf16(data) if old_dtype == torch.bfloat16 else gguf.quantize_bf16(data)
+                assert data.dtype in (np.int16, np.uint16)
                 data_qtype = gguf.GGMLQuantizationType.BF16

         elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
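Context for the change: a bf16 source tensor is upcast to fp32 during conversion, and that upcast only appends sixteen zero bits to each value. So when the output type is also bf16 and old_dtype is torch.bfloat16, simply truncating the low 16 bits of the intermediate fp32 recovers the original bf16 bit pattern exactly, including any NaN payloads, whereas re-quantizing would go through round-to-nearest-even and typically quiet NaNs. The widened assert accommodates both helpers' output dtypes (one apparently yields int16, the other uint16). Below is a minimal sketch of the idea; the helper bodies are illustrative assumptions modeled on what the names imply, not the actual gguf-py implementations, hence the _sketch suffixes.

import numpy as np

def quantize_bf16_sketch(n: np.ndarray) -> np.ndarray:
    # fp32 -> bf16 via round-to-nearest-even; NaNs are forced quiet first.
    n = n.astype(np.float32, copy=False).view(np.uint32)
    n = np.where((n & 0x7fffffff) > 0x7f800000,            # is NaN?
                 (n & 0xffff0000) | (64 << 16),            # set the quiet bit, drop low payload bits
                 n)
    n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16  # round to nearest even
    return n.astype(np.uint16)

def truncate_bf16_sketch(n: np.ndarray) -> np.ndarray:
    # fp32 -> bf16 by dropping the low 16 bits: the exact inverse of the
    # bf16 -> fp32 upcast, since that upcast only appends zero bits.
    return (n.astype(np.float32, copy=False).view(np.uint32) >> 16).astype(np.uint16)

# Round-trip check: bf16 bits -> fp32 -> bf16 bits.
bf16_bits = np.array([0x3f80, 0x4049, 0x7f81], dtype=np.uint16)  # 1.0, ~3.14, signaling NaN
f32 = (bf16_bits.astype(np.uint32) << 16).view(np.float32)       # exact upcast
assert np.array_equal(truncate_bf16_sketch(f32), bf16_bits)      # bit-exact round-trip
# quantize_bf16_sketch(f32) would instead return 0x7fc1 for the signaling
# NaN (quieted), which is why truncation is preferred for bf16 -> bf16.

Besides bit-exactness, truncation also skips the rounding arithmetic entirely, so the bf16-to-bf16 path gets cheaper as well.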