Updated file conversion and simplified activation logic

chooper1 2023-09-12 22:44:51 -07:00
parent 11f72245ff
commit 7954f8defd
6 changed files with 1445 additions and 1330 deletions

convert-llama-ggml-to-gguf.py

@@ -33,7 +33,6 @@ GGML_QUANT_SIZES = {
     gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
     gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
     gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
-    gguf.GGMLQuantizationType.Q4_SQ : (1, 4),
 }
 
 class GGMLFormat(IntEnum):
@@ -59,7 +58,6 @@ class GGMLFType(IntEnum):
     MOSTLY_Q5_K_S = 16
     MOSTLY_Q5_K_M = 17
     MOSTLY_Q6_K = 18
-    MOSTLY_Q4_SQ = 19
 
 class Hyperparameters:
     def __init__(self):
@@ -122,7 +120,7 @@ class Tensor:
         self.len_bytes = np.int64(0)
         self.use_padding = use_padding
 
-    def load(self, data, offset, squeezellm=False):
+    def load(self, data, offset):
         orig_offset = offset
         (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
         assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
@@ -139,9 +137,6 @@
         pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
         offset += pad
         n_elems = np.prod(self.dims)
-        if squeezellm and n_dims > 1 and dtype == gguf.GGMLQuantizationType.Q4_SQ:
-            n_elems = n_elems / 8
-            n_elems += self.dims[1] * 8 # add 16 fp16 elements per row
         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
         self.start_offset = offset
         self.len_bytes = n_bytes
@@ -191,20 +186,19 @@ class GGMLModel:
         if len(err) > 0:
             raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
 
-    def load(self, data, offset, squeezellm=False):
+    def load(self, data, offset):
         offset += self.validate_header(data, offset)
         hp = Hyperparameters()
         offset += hp.load(data, offset)
         print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
-        if not squeezellm:
-            self.validate_conversion(hp.ftype)
+        self.validate_conversion(hp.ftype)
         vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
         offset += vocab.load(data, offset, hp.n_vocab)
         tensors: list[Tensor] = []
         tensor_map = {}
         while offset < len(data):
             tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
-            offset += tensor.load(data, offset, squeezellm=squeezellm)
+            offset += tensor.load(data, offset)
             tensor_map[tensor.name] = len(tensors)
             tensors.append(tensor)
         self.hyperparameters = hp
@@ -420,7 +414,6 @@ def handle_args():
         help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
         help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
-    parser.add_argument("--squeezellm", action="store_true", help="Convert to SQLLM")
     return parser.parse_args()
 
 def main():
@@ -432,7 +425,7 @@ def main():
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLModel()
     print('* Scanning GGML input file')
-    offset = model.load(data, 0, cfg.squeezellm)
+    offset = model.load(data, 0)
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None

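For context on the lines removed from Tensor.load: under the Q4_SQ layout, eight 4-bit weight indices pack into one 4-byte word, and each row additionally carries a 16-entry fp16 lookup table (32 bytes, i.e. eight more 4-byte words), which matches the removed "add 16 fp16 elements per row" comment and the (blksize=1, tysize=4) entry dropped from GGML_QUANT_SIZES. A minimal sketch of that arithmetic (the helper name q4_sq_n_bytes is ours; the logic presumably now lives in convert-sqllm-to-gguf.py):

import numpy as np

def q4_sq_n_bytes(dims):
    blksize, tysize = 1, 4            # the GGML_QUANT_SIZES entry removed above
    n_elems = np.prod(dims)           # raw weight count
    n_elems = n_elems // 8            # eight 4-bit indices per 4-byte word
    n_elems += dims[1] * 8            # 16 fp16 LUT entries = 8 words per row
    return np.int64(n_elems) * np.int64(tysize) // np.int64(blksize)

# e.g. a 4096 x 4096 weight matrix:
#   16777216 / 8 = 2097152 packed words, plus 4096 * 8 = 32768 LUT words
print(q4_sq_n_bytes((4096, 4096)))    # 8519680 bytes
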
File diff suppressed because it is too large

convert-sqllm-to-gguf.py (new file, 1433 additions)

File diff suppressed because it is too large

ggml.c (22 changes)

@@ -1790,10 +1790,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size = sizeof(int32_t),
         .is_quantized = true,
         .to_float = NULL,
-        .from_float = NULL,
+        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
         .from_float_reference = NULL,
         .vec_dot = ggml_vec_dot_q4_sq_fp16,
-        .vec_dot_type = GGML_TYPE_F32,
+        .vec_dot_type = GGML_TYPE_F16,
     }
 #endif
 };
@@ -11351,8 +11351,9 @@ static void ggml_compute_forward_mul_mat(
     }
 #endif
 
-    if (params->type == GGML_TASK_INIT && src0->type != GGML_TYPE_Q4_SQ) {
+    if (params->type == GGML_TASK_INIT) {
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);
@@ -11366,21 +11367,6 @@
             }
         }
         return;
-    } else if (params->type == GGML_TASK_INIT) { //SQLLM - copy fp32 vec over
-        ggml_fp16_t * wdata = params->wdata;
-        float * srcvec;
-        for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                    srcvec = (float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-                    for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                        *wdata = ggml_fp32_to_fp16(srcvec[i10]);
-                        wdata += 1;
-                    }
-                }
-            }
-        }
-        return;
     }

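This ggml.c change is the "simplified activation logic" from the commit title: instead of special-casing Q4_SQ in GGML_TASK_INIT with a hand-rolled fp32-to-fp16 copy loop, the Q4_SQ type traits now advertise vec_dot_type = GGML_TYPE_F16 with from_float = ggml_fp32_to_fp16_row, so the generic init path performs the same conversion. A rough numpy model of the equivalence (names and shapes are illustrative, not ggml's):

import numpy as np

# Illustrative stand-in for the src1 activations ggml would convert.
src1 = np.random.rand(3, 8).astype(np.float32)

# Old path: the removed SQLLM branch converted element by element.
wdata_old = np.empty(src1.size, dtype=np.float16)
k = 0
for row in src1:
    for x in row:
        wdata_old[k] = np.float16(x)   # mirrors ggml_fp32_to_fp16 per element
        k += 1

# New path: the generic init loop calls from_float on each src1 row, which
# for GGML_TYPE_F16 is ggml_fp32_to_fp16_row, i.e. a row-wise cast.
wdata_new = src1.astype(np.float16).reshape(-1)

assert np.array_equal(wdata_old, wdata_new)
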
ggml.h (4 changes)

@@ -304,7 +304,7 @@ extern "C" {
         GGML_TYPE_Q5_K = 13,
         GGML_TYPE_Q6_K = 14,
         GGML_TYPE_Q8_K = 15,
-        GGML_TYPE_Q4_SQ = 16,
+        GGML_TYPE_Q4_SQ = 19,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -333,7 +333,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_SQ = 16, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_SQ = 19, // except 1d tensors
     };
 
     // available tensor operations:

gguf-py/gguf/gguf.py

@@ -389,7 +389,7 @@ class GGMLQuantizationType(IntEnum):
     Q5_K = 13
     Q6_K = 14
     Q8_K = 15
-    Q4_SQ = 16
+    Q4_SQ = 19
 
 class GGUFValueType(IntEnum):
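
The Q4_SQ renumbering from 16 to 19 is applied in both ggml.h and the Python gguf module; the two must agree because tensor dtypes are serialized by numeric value, and 19 also matches MOSTLY_Q4_SQ = 19 in the converter's GGMLFType. A trivial sanity check, ours rather than part of the commit:

from enum import IntEnum

# Trimmed copy of the gguf.py enum, for illustration only.
class GGMLQuantizationType(IntEnum):
    Q8_K = 15
    Q4_SQ = 19

GGML_TYPE_Q4_SQ = 19  # value from ggml.h after this commit

assert GGMLQuantizationType.Q4_SQ == GGML_TYPE_Q4_SQ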