convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16 to allow quantizing them to Q4_K and Q6_K with llama-quantize.

* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0

Q4_0 is not completely symmetric (so not lossless for ternary models), but it should be good enough.
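A plausible end-to-end workflow this enables (the model path and output file names here are hypothetical; the --outtype values are the ones added in the diff below):

    python convert_hf_to_gguf.py --outtype tq1_0 path/to/model

followed by a llama-quantize pass over the resulting GGUF, which can then requantize the F16 token embeddings and output tensor to Q4_K and Q6_K.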
parent 3a0bf17d57
commit 895004f3f8
2 changed files with 24 additions and 2 deletions

convert_hf_to_gguf.py
@@ -301,6 +301,20 @@ class Model:
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
+                if data_qtype is False and any(
+                    self.match_model_tensor_name(new_name, key, bid)
+                    for key in (
+                        gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.OUTPUT,
+                    )
+                ):
+                    if self.ftype in (
+                        gguf.LlamaFileType.MOSTLY_TQ1_0,
+                        gguf.LlamaFileType.MOSTLY_TQ2_0,
+                    ):
+                        # TODO: use Q4_K and Q6_K
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
                 # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                 if isinstance(data_qtype, bool):
                     if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -311,6 +325,10 @@ class Model:
                         data_qtype = gguf.GGMLQuantizationType.BF16
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                         data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                     else:
                         raise ValueError(f"Unknown file type: {self.ftype.name}")
 
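Taken together, the two hunks above implement one rule. As a hypothetical distillation (a sketch, not code from this diff, assuming the gguf-py constants the diff relies on):

    import gguf

    # Effective tensor-type choice when converting with --outtype tq1_0/tq2_0.
    def ternary_qtype(ftype: gguf.LlamaFileType, is_embd_or_output: bool) -> gguf.GGMLQuantizationType:
        # Token embeddings and the output tensor stay in F16 for now;
        # the TODO above wants Q4_K and Q6_K here eventually.
        if is_embd_or_output:
            return gguf.GGMLQuantizationType.F16
        if ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
            return gguf.GGMLQuantizationType.TQ1_0
        return gguf.GGMLQuantizationType.TQ2_0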
@@ -3814,8 +3832,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -3902,6 +3920,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
src/llama.cpp
@@ -15717,6 +15717,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case GGML_TYPE_TQ1_0:
+            case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ2_S:
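The TODO here is about Q4_0's value grid: its 16 levels run from -8·d to +7·d, with the per-block scale d taken from the largest-magnitude value, so at most one of ±1 lands exactly on the grid. That is the asymmetry the commit message calls "not lossless for ternary models". A minimal Python sketch of the effect, modeled on the reference Q4_0 rounding (illustrative only, not ggml code):

    # One Q4_0-style block round-trip: quantize to levels -8..7, then dequantize.
    def q4_0_roundtrip(block: list[float]) -> list[float]:
        max_val = max(block, key=abs)  # signed value with the largest magnitude
        d = max_val / -8               # per-block scale: the extreme value maps to level -8
        if d == 0.0:
            return [0.0] * len(block)
        quants = [min(15, int(x / d + 8.5)) for x in block]  # stored 4-bit codes 0..15
        return [(q - 8) * d for q in quants]

    print(q4_0_roundtrip([1.0, 0.0, -1.0]))  # [1.0, 0.0, -0.875]: the -1 is off by 12.5%

So +1 and 0 survive exactly here while -1 does not; close enough, per the commit message, until a symmetric type replaces the fallback.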