From 4d698495eae6912db94dcdedb0c3b01c63143646 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 11:16:07 +0300 Subject: [PATCH 01/26] gguf : init --- .gitignore | 1 + Makefile | 7 +++++-- examples/gguf/gguf.cpp | 34 ++++++++++++++++++++++++++++++++++ ggml.h | 25 ++++++++++++++++++++++++- 4 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 examples/gguf/gguf.cpp diff --git a/.gitignore b/.gitignore index c1ab6bb6d..abe8e28cb 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ models-mnt /server /Pipfile /embd-input-test +/gguf /libllama.so build-info.h arm_neon.h diff --git a/Makefile b/Makefile index fb7c27cd9..e19acfbb2 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test gguf # Binaries only useful for tests TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 @@ -330,7 +330,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS) + rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test gguf build-info.h $(TEST_TARGETS) # # Examples @@ -370,6 +370,9 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. 
-lembdinput +gguf: examples/gguf/gguf.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp new file mode 100644 index 000000000..602de519a --- /dev/null +++ b/examples/gguf/gguf.cpp @@ -0,0 +1,34 @@ +#include "ggml.h" + +#include +#include + +bool gguf_write(const std::string & fname) { + + + return true; +} + +bool gguf_read(const std::string & fname) { + return true; +} + +int main(int argc, char ** argv) { + if (argc < 3) { + fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]); + return -1; + } + + const std::string fname(argv[1]); + const std::string mode(argv[2]); + + GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w"); + + if (mode == "w") { + GGML_ASSERT(gguf_write(fname) && "failed to write gguf file"); + } else if (mode == "r") { + GGML_ASSERT(gguf_read(fname) && "failed to read gguf file"); + } + + return 0; +} diff --git a/ggml.h b/ggml.h index 9919cce7c..2e700c9a0 100644 --- a/ggml.h +++ b/ggml.h @@ -190,6 +190,9 @@ #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 +#define GGUF_FILE_MAGIC 0x47475546 // "GGUF" +#define GGUF_FILE_VERSION 1 + #define GGML_QNT_VERSION 2 // bump this on quantization format changes #define GGML_QNT_VERSION_FACTOR 1000 // do not change this @@ -202,7 +205,6 @@ #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 - #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 @@ -1611,6 +1613,27 @@ extern "C" { GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // + // gguf + // + + enum gguf_metadata_value_type { + GGUF_METADATA_VALUE_TYPE_UINT8 = 0, + GGUF_METADATA_VALUE_TYPE_INT8 = 1, + GGUF_METADATA_VALUE_TYPE_UINT16 = 2, + GGUF_METADATA_VALUE_TYPE_INT16 = 3, + GGUF_METADATA_VALUE_TYPE_UINT32 = 4, + GGUF_METADATA_VALUE_TYPE_INT32 = 5, + GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6, + GGUF_METADATA_VALUE_TYPE_BOOL = 7, + GGUF_METADATA_VALUE_TYPE_STRING = 8, + GGUF_METADATA_VALUE_TYPE_ARRAY = 9, + }; + + struct gguf_string { + uint32_t n; + char * data; + }; // // system info // From bae6b125f6d6148d3bebb774f1b73ecc67dc0051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Wed, 26 Jul 2023 11:17:05 +0300 Subject: [PATCH 02/26] wip : implement GGUF (#2397) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add LLAMA_DEFAULT_RMS_EPS so we can change the default (#2384) Co-authored-by: Iwan Kawrakow * WIP: python class to write GGUF, incomplete C apı for reading --------- Co-authored-by: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Co-authored-by: Iwan Kawrakow --- constants.py | 32 +++++++ gguf.c | 192 ++++++++++++++++++++++++++++++++++++++ gguf.py | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 481 insertions(+) create mode 100644 constants.py create mode 100644 gguf.c create mode 100644 gguf.py diff --git a/constants.py b/constants.py new file mode 100644 index 000000000..7c7456403 --- /dev/null +++ b/constants.py @@ -0,0 +1,32 @@ +GGUF_MAGIC = 0x47475546 +GGUF_VERSION = 1 + +# general +KEY_GENERAL_ARCHITECTURE = "general.architecture" +KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" +KEY_GENERAL_NAME = "general.name" +KEY_GENERAL_AUTHOR = 
"general.author" +KEY_GENERAL_URL = "general.url" +KEY_GENERAL_DESCRIPTION = "general.description" +KEY_GENERAL_FILE_TYPE = "general.file_type" +KEY_GENERAL_LICENSE = "general.license" +KEY_GENERAL_SOURCE_URL = "general.source.url" +KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" + +# LLM +KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length" +KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length" +KEY_LLM_LAYER_COUNT = "{llm}.layer_count" +KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length" +KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual" +KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout" + +# attention +KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count" +KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv" +KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias" +KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv" + +# RoPE +KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count" +KEY_ROPE_SCALE = "{llm}.rope.scale" diff --git a/gguf.c b/gguf.c new file mode 100644 index 000000000..54b31d411 --- /dev/null +++ b/gguf.c @@ -0,0 +1,192 @@ +// TODO: convert to proper gguf.h gguf.c structure, now I'm trying to be fast as much as possible, +// and everything is in this file for quick debugging. + +#include +#include +#include +#include + + +enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 (5) support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + // k-quantizations + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, +}; + +enum gguf_metadata_value_type { + GGUF_METADATA_VALUE_TYPE_UINT8 = 0, + GGUF_METADATA_VALUE_TYPE_INT8 = 1, + GGUF_METADATA_VALUE_TYPE_UINT16 = 2, + GGUF_METADATA_VALUE_TYPE_INT16 = 3, + GGUF_METADATA_VALUE_TYPE_UINT32 = 4, + GGUF_METADATA_VALUE_TYPE_INT32 = 5, + GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6, + GGUF_METADATA_VALUE_TYPE_BOOL = 7, + GGUF_METADATA_VALUE_TYPE_STRING = 8, + GGUF_METADATA_VALUE_TYPE_ARRAY = 9, +}; + +struct gguf_string_t { + uint32_t len; + char * string; +}; + +union gguf_metadata_value_t; + +// Union definition for gguf_metadata_value_t +union gguf_metadata_value_t { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + bool bool_; + struct gguf_string_t string; + struct { + uint32_t len; + enum gguf_metadata_value_type type; + union gguf_metadata_value_t * array; + } array; +}; + + +struct gguf_metadata_kv_t { + struct gguf_string_t key; + uint32_t value_len; + enum gguf_metadata_value_type value_type; + union gguf_metadata_value_t* value; +}; + +struct gguf_header_t { + uint32_t magic; + uint32_t version; + uint32_t tensor_count; + uint32_t metadata_kv_count; + struct gguf_metadata_kv_t * metadata_kv; +}; + +struct gguf_tensor_info_t { + struct gguf_string_t name; + uint32_t n_dimensions; + uint32_t dimensions[]; +}; + +struct gguf_file_t { + struct gguf_header_t header; + uint8_t tensor_data[]; +}; + +void read_gguf_file(const char * file_path, struct gguf_file_t * gguf_file) { + FILE* file = fopen(file_path, "rb"); + if (file == NULL) { + printf("Error opening the file.\n"); + return; + } + + fread(&gguf_file->header.magic, sizeof(uint32_t), 1, file); + + // Verify magic and version + if 
(gguf_file->header.magic != 0x47475546) { + printf("Invalid magic number. Not a valid GGUF file.\n"); + fclose(file); + return; + } + + fread(&gguf_file->header.version, sizeof(uint32_t), 1, file); + + if (gguf_file->header.version != 1) { + printf("Unsupported version. Expected version 1.\n"); + fclose(file); + return; + } + + fread(&gguf_file->header.tensor_count, sizeof(uint32_t), 1, file); + fread(&gguf_file->header.metadata_kv_count, sizeof(uint32_t), 1, file); + + printf("Magic: %x\n", gguf_file->header.magic); + printf("Version: %d\n", gguf_file->header.version); + printf("Tensor Count: %d\n", gguf_file->header.tensor_count); + printf("Metadata Key-Value Count: %d\n", gguf_file->header.metadata_kv_count); + + gguf_file->header.metadata_kv = (struct gguf_metadata_kv_t*)malloc(gguf_file->header.metadata_kv_count * sizeof(struct gguf_metadata_kv_t)); + + for (int i = 0; i < gguf_file->header.metadata_kv_count; i++) { + struct gguf_metadata_kv_t* kv = &gguf_file->header.metadata_kv[i]; + fread(&kv->key.len, sizeof(uint32_t), 1, file); + kv->key.string = (char*)malloc(kv->key.len ); // Allocate memory for the key string + fread(kv->key.string, sizeof(char), kv->key.len, file); + //kv->key.string[kv->key.len] = '\0'; // Null-terminate the key string + + fread(&kv->value_type, sizeof(uint32_t), 1, file); + + printf("Metadata Value Type: %d\n", kv->value_type); + printf("Metadata Key: %s\n", kv->key.string); + + // Read metadata value according to its type using reinterpret_cast + switch (kv->value_type) { + case GGUF_METADATA_VALUE_TYPE_UINT32: + kv->value = (uint32_t *) malloc(sizeof(uint32_t)); + fread(kv->value, sizeof(uint32_t), 1, file); + printf("value: %d\n", kv->value->uint32); + break; + case GGUF_METADATA_VALUE_TYPE_FLOAT32: + kv->value = (float *)malloc(sizeof(float)); + fread(kv->value, sizeof(float), 1, file); + printf("value: %f\n", (float)kv->value->float32); + break; + case GGUF_METADATA_VALUE_TYPE_STRING: + fread(&kv->value_len, sizeof(uint32_t), 1, file); + printf("value len: %d\n", kv->value_len); +kv->value = (char *)malloc(sizeof(char) * kv->value_len); // Allocate memory for the value string +fread(kv->value, sizeof(char), kv->value_len, file); + printf("value: %s\n", (char *)kv->value); + break; + // ... (handle other types in a similar manner) + default: + printf("Unsupported metadata value type.\n"); + fclose(file); + return; + } + } + + // TODO: handle reading tensor data + + fclose(file); +} + +void gguf_free(struct gguf_file_t * gguf_file) { + // Free allocated memory for key strings avd values + for (int i = 0; i < gguf_file->header.metadata_kv_count; i++) { + free(gguf_file->header.metadata_kv[i].key.string); + free(gguf_file->header.metadata_kv[i].value); + } + free(gguf_file->header.metadata_kv); +} + +int main() { + const char* file_path = "example.gguf"; + struct gguf_file_t gguf_file; + read_gguf_file(file_path, &gguf_file); + gguf_free(&gguf_file); + return 0; +} diff --git a/gguf.py b/gguf.py new file mode 100644 index 000000000..dfd5ba5bf --- /dev/null +++ b/gguf.py @@ -0,0 +1,257 @@ +"""TODOs +1. Implement writing tensor data with alignment. +2. Implement writers for known architectures, LLaMA in particular. +3. Add docstrings from the format specs. +4. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org. 
+""" + +import struct +from enum import IntEnum +from typing import List, Any +import constants + + +class GGMLQuantizationType(IntEnum): + F32 = 0 + F16 = 1 + QR_0 = 2 + Q4_1 = 3 + # Q4_2 = 4 # support has been removed + # Q4_3 = 5 # support has been removed + Q5_0 = 6 + Q5_1 = 7 + Q8_0 = 8 + Q8_1 = 9 + Q2_K = 10 + Q3_K = 11 + Q4_K = 12 + Q5_K = 13 + Q6_K = 14 + Q8_K = 15 + + +class GGUFValueType(IntEnum): + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 + FLOAT32 = 6 + BOOL = 7 + STRING = 8 + ARRAY = 9 + + @staticmethod + def get_type(value): + if isinstance(value, str): + return GGUFValueType.STRING + elif isinstance(value, list): + return GGUFValueType.ARRAY + elif isinstance(value, float): + return GGUFValueType.FLOAT32 + elif isinstance(value, bool): + return GGUFValueType.BOOL + else: + return GGUFValueType.INT32 + + +class GGUFWriter: + def __init__(self, buffered_writer): + self.buffered_writer = buffered_writer + + def write_header(self, tensor_count: int, metadata_kv_count: int): + self.buffered_writer.write(struct.pack(" "GGUFWriter": + f = open(path, "wb") + return cls(f) + + def write_key(self, key: str, value_type: GGUFValueType): + encoded_key = key.encode("utf8") + self.buffered_writer.write(struct.pack(" Date: Wed, 26 Jul 2023 11:26:14 +0300 Subject: [PATCH 03/26] ci : disable CI temporary to not waste energy --- .github/ISSUE_TEMPLATE/custom.md | 185 --------- .github/workflows/build.yml | 632 ----------------------------- .github/workflows/docker.yml | 65 --- .github/workflows/editorconfig.yml | 17 - .github/workflows/tidy-post.yml | 20 - .github/workflows/tidy-review.yml | 23 -- 6 files changed, 942 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/custom.md delete mode 100644 .github/workflows/build.yml delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/tidy-post.yml delete mode 100644 .github/workflows/tidy-review.yml diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md deleted file mode 100644 index 8fd955356..000000000 --- a/.github/ISSUE_TEMPLATE/custom.md +++ /dev/null @@ -1,185 +0,0 @@ ---- -name: Issue and enhancement template -about: Used to report issues and request enhancements for llama.cpp -title: "[User] Insert summary of your issue or enhancement.." -labels: '' -assignees: '' - ---- - -# Prerequisites - -Please answer the following questions for yourself before submitting an issue. - -- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). -- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share. - -# Expected Behavior - -Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do. - -# Current Behavior - -Please provide a detailed written description of what `llama.cpp` did, instead. - -# Environment and Context - -Please provide detailed information about your computer setup. 
This is important in case the issue is not reproducible except for under certain specific conditions. - -* Physical (or virtual) hardware you are using, e.g. for Linux: - -`$ lscpu` - -* Operating System, e.g. for Linux: - -`$ uname -a` - -* SDK version, e.g. for Linux: - -``` -$ python3 --version -$ make --version -$ g++ --version -``` - -# Failure Information (for bugs) - -Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. - -# Steps to Reproduce - -Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. - -1. step 1 -2. step 2 -3. step 3 -4. etc. - -# Failure Logs - -Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. - -Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. - -Example environment info: -``` -llama.cpp$ git log | head -1 -commit 2af23d30434a677c6416812eea52ccc0af65119c - -llama.cpp$ lscpu | egrep "AMD|Flags" -Vendor ID: AuthenticAMD -Model name: AMD Ryzen Threadripper 1950X 16-Core Processor -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev -Virtualization: AMD-V - -llama.cpp$ python3 --version -Python 3.10.9 - -llama.cpp$ pip list | egrep "torch|numpy|sentencepiece" -numpy 1.24.2 -numpydoc 1.5.0 -sentencepiece 0.1.97 -torch 1.13.1 -torchvision 0.14.1 - -llama.cpp$ make --version | head -1 -GNU Make 4.3 - -$ md5sum ./models/65B/ggml-model-q4_0.bin -dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin -``` - -Example run with the Linux command [perf](https://www.brendangregg.com/perf.html) -``` -llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered." -main: seed = 1679149377 -llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ... 
-llama_model_load: n_vocab = 32000 -llama_model_load: n_ctx = 512 -llama_model_load: n_embd = 8192 -llama_model_load: n_mult = 256 -llama_model_load: n_head = 64 -llama_model_load: n_layer = 80 -llama_model_load: n_rot = 128 -llama_model_load: f16 = 2 -llama_model_load: n_ff = 22016 -llama_model_load: n_parts = 8 -llama_model_load: ggml ctx size = 41477.73 MB -llama_model_load: memory_size = 2560.00 MB, n_mem = 40960 -llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 -llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7' -llama_model_load: .......................................................................................... done -llama_model_load: model size = 4869.09 MB / num tensors = 723 - -system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | - -main: prompt: 'Please close your issue when it has been answered.' -main: number of tokens in prompt = 11 - 1 -> '' - 12148 -> 'Please' - 3802 -> ' close' - 596 -> ' your' - 2228 -> ' issue' - 746 -> ' when' - 372 -> ' it' - 756 -> ' has' - 1063 -> ' been' - 7699 -> ' answered' - 29889 -> '.' - -sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000 - - -Please close your issue when it has been answered. -@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine?? 
-I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!! -@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text] - - -main: mem per token = 71159620 bytes -main: load time = 19309.95 ms -main: sample time = 168.62 ms -main: predict time = 223895.61 ms / 888.47 ms per token -main: total time = 246406.42 ms - - Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.': - - 3636882.89 msec task-clock # 14.677 CPUs utilized - 13509 context-switches # 3.714 /sec - 2436 cpu-migrations # 0.670 /sec - 10476679 page-faults # 2.881 K/sec - 13133115082869 cycles # 3.611 GHz (16.77%) - 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%) - 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%) - 23479217109614 instructions # 1.79 insn per cycle - # 0.44 stalled cycles per insn (16.76%) - 2353072268027 branches # 647.002 M/sec (16.77%) - 1998682780 branch-misses # 0.08% of all branches (16.76%) - - 247.802177522 seconds time elapsed - - 3618.573072000 seconds user - 18.491698000 seconds sys -``` diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 84faad37a..000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,632 +0,0 @@ -name: CI - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GGML_NLOOP: 3 - GGML_NITER: 1 - GGML_N_THREADS: 1 - -jobs: - ubuntu-focal-make: - runs-on: ubuntu-20.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential gcc-8 - - - name: Build - id: make_build - run: | - CC=gcc-8 make - - ubuntu-latest-cmake: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. - cmake --build . 
--config Release - - - name: Test - id: cmake_test - run: | - cd build - ctest --verbose --timeout 900 - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . --config ${{ matrix.build_type }} - - - name: Test - id: cmake_test - run: | - cd build - ctest --verbose --timeout 900 - - ubuntu-latest-cmake-mpi: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - mpi_library: [mpich, libopenmpi-dev] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential ${{ matrix.mpi_library }} - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake -DLLAMA_MPI=ON .. - cmake --build . --config Release - - - name: Test - id: cmake_test - run: | - cd build - ctest --verbose - - macOS-latest-make: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: make_build - run: | - make - - macOS-latest-cmake: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - mkdir build - cd build - cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF .. - cmake --build . 
--config Release - - - name: Test - id: cmake_test - run: | - cd build - ctest --verbose --timeout 900 - - windows-latest-cmake: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - OPENCL_VERSION: 2023.04.17 - CLBLAST_VERSION: 1.6.0 - - strategy: - matrix: - include: - - build: 'noavx' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF' - - build: 'avx2' - defines: '-DLLAMA_BUILD_SERVER=ON' - - build: 'avx' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' - - build: 'avx512' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' - - build: 'clblast' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - - build: 'openblas' - defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - name: Download OpenCL SDK - id: get_opencl - if: ${{ matrix.build == 'clblast' }} - run: | - curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip" - mkdir $env:RUNNER_TEMP/opencl - tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl - - - name: Download CLBlast - id: get_clblast - if: ${{ matrix.build == 'clblast' }} - run: | - curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" - curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z - rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast - foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) { - $txt = Get-Content -Path $f -Raw - $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8 - } - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. ${{ matrix.defines }} - cmake --build . 
--config Release - - - name: Add clblast.dll - id: add_clblast_dll - if: ${{ matrix.build == 'clblast' }} - run: | - cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release - cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Check AVX512F support - id: check_avx512f - if: ${{ matrix.build == 'avx512' }} - continue-on-error: true - run: | - cd build - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe') - echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c - & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main - .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO" - - - name: Test - id: cmake_test - if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible - run: | - cd build - ctest -C Release --verbose --timeout 900 - - - name: Get commit hash - id: commit - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt - 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 - with: - path: | - llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip - - windows-latest-cmake-cublas: - runs-on: windows-latest - - strategy: - matrix: - cuda: ['12.1.0', '11.7.1'] - build: ['cublas'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v1 - - - uses: Jimver/cuda-toolkit@v0.2.10 - id: cuda-toolkit - with: - cuda: ${{ matrix.cuda }} - # TODO(green-sky): _dev seems to fail, and non dev are not enought - #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]' - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON - cmake --build . 
--config Release - - - name: Get commit hash - id: commit - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: pr-mpt/actions-commit-hash@v2 - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - 7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 - with: - path: | - llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip - - - name: Copy and pack Cuda runtime - if: ${{ matrix.cuda == '12.1.0' }} - # TODO(green-sky): paths are cuda 12 specific - run: | - echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" - mkdir '.\build\bin\cudart\' - cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\' - cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\' - cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\' - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\* - - - name: Copy and pack Cuda runtime - if: ${{ matrix.cuda == '11.7.1' }} - # TODO(green-sky): paths are cuda 11 specific - run: | - echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" - mkdir '.\build\bin\cudart\' - ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" - cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\' - cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\' - cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\' - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\* - - - name: Upload Cuda runtime - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 - with: - path: | - cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - - runs-on: ubuntu-latest - - needs: - - ubuntu-focal-make - - ubuntu-latest-cmake - - macOS-latest-make - - macOS-latest-cmake - - windows-latest-cmake - - windows-latest-cmake-cublas - - steps: - - name: Download artifacts - id: download-artifact - uses: actions/download-artifact@v3 - - - name: Get commit hash - id: commit - uses: pr-mpt/actions-commit-hash@v2 - - - name: Create release - id: create_release - uses: anzz1/action-create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }} - - - name: Upload release - id: upload_release - uses: actions/github-script@v3 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const path = require('path'); - const fs = require('fs'); - const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact')) { - if (path.extname(file) === '.zip') { - console.log('uploadReleaseAsset', file); - await 
github.repos.uploadReleaseAsset({ - owner: context.repo.owner, - repo: context.repo.repo, - release_id: release_id, - name: file, - data: await fs.readFileSync(`./artifact/${file}`) - }); - } - } - -# ubuntu-latest-gcc: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Debug, Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -# - name: Build -# run: | -# make -# -# ubuntu-latest-clang: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Debug, Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -# -# - name: Build -# run: | -# make -# -# ubuntu-latest-gcc-sanitized: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# sanitizer: [ADDRESS, THREAD, UNDEFINED] -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -# -# - name: Build -# run: | -# make -# -# windows: -# runs-on: windows-latest -# -# strategy: -# matrix: -# build: [Release] -# arch: [Win32, x64] -# include: -# - arch: Win32 -# s2arc: x86 -# - arch: x64 -# s2arc: x64 -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Add msbuild to PATH -# uses: microsoft/setup-msbuild@v1 -# -# - name: Configure -# run: > -# cmake -S . -B ./build -A ${{ matrix.arch }} -# -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -# - name: Build -# run: | -# cd ./build -# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} -# -# - name: Upload binaries -# uses: actions/upload-artifact@v1 -# with: -# name: llama-bin-${{ matrix.arch }} -# path: build/bin/${{ matrix.build }} -# -# windows-blas: -# runs-on: windows-latest -# -# strategy: -# matrix: -# build: [Release] -# arch: [Win32, x64] -# blas: [ON] -# include: -# - arch: Win32 -# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip -# s2arc: x86 -# - arch: x64 -# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip -# s2arc: x64 -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Add msbuild to PATH -# uses: microsoft/setup-msbuild@v1 -# -# - name: Fetch OpenBLAS -# if: matrix.blas == 'ON' -# run: | -# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }} -# 7z x blas.zip -oblas -y -# copy blas/include/cblas.h . -# copy blas/include/openblas_config.h . -# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV -# -# - name: Configure -# run: > -# cmake -S . 
-B ./build -A ${{ matrix.arch }} -# -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} -# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" -# -# - name: Build -# run: | -# cd ./build -# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} -# -# - name: Copy libopenblas.dll -# if: matrix.blas == 'ON' -# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }} -# -# - name: Upload binaries -# if: matrix.blas == 'ON' -# uses: actions/upload-artifact@v1 -# with: -# name: llama-blas-bin-${{ matrix.arch }} -# path: build/bin/${{ matrix.build }} -# -# emscripten: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v1 -# -# - name: Dependencies -# run: | -# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz -# tar -xvf master.tar.gz -# emsdk-master/emsdk update -# emsdk-master/emsdk install latest -# emsdk-master/emsdk activate latest -# -# - name: Configure -# run: echo "tmp" -# -# - name: Build -# run: | -# pushd emsdk-master -# source ./emsdk_env.sh -# popd -# emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# make diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 379fbd7ad..000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,65 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. - -name: Publish Docker image - -on: - pull_request: - push: - branches: - - master - -jobs: - push_to_registry: - name: Push Docker image to Docker Hub - if: github.event.pull_request.draft == false - - runs-on: ubuntu-latest - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - matrix: - config: - - { tag: "light", dockerfile: ".devops/main.Dockerfile" } - - { tag: "full", dockerfile: ".devops/full.Dockerfile" } - steps: - - name: Check out the repo - uses: actions/checkout@v3 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push Docker image (versioned) - if: github.event_name == 'push' - uses: docker/build-push-action@v4 - with: - context: . - push: true - platforms: linux/amd64,linux/arm64 - tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" - file: ${{ matrix.config.dockerfile }} - - - name: Build and push Docker image (tagged) - uses: docker/build-push-action@v4 - with: - context: . 
- push: ${{ github.event_name == 'push' }} - platforms: linux/amd64,linux/arm64 - tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}" - file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index b4e535acf..000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: EditorConfig Checker - -on: - push: - branches: - - master - pull_request: - branches: - - master - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: editorconfig-checker/action-editorconfig-checker@main - - run: editorconfig-checker diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml deleted file mode 100644 index 03652760c..000000000 --- a/.github/workflows/tidy-post.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: clang-tidy review post comments - -on: - workflow_dispatch: - workflows: ["clang-tidy-review"] - types: - - completed - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: ZedThree/clang-tidy-review/post@v0.13.0 - # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup - with: - # adjust options as necessary - lgtm_comment_body: '' - annotations: false - max_comments: 25 diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml deleted file mode 100644 index a4bc8d976..000000000 --- a/.github/workflows/tidy-review.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: clang-tidy-review - -on: - pull_request: - branches: - - master - -jobs: - clang-tidy-review: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: ZedThree/clang-tidy-review@v0.13.0 - id: review - with: - lgtm_comment_body: '' - build_dir: build - cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on - split_workflow: true - - - uses: ZedThree/clang-tidy-review/upload@v0.13.0 From 6873148771a08861922fb5070b18b56bc25f3701 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 13:24:20 +0300 Subject: [PATCH 04/26] gguf : first API pass --- ggml.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ggml.h | 55 +++++++++++++++++++++++++++++++++------------- 2 files changed, 109 insertions(+), 15 deletions(-) diff --git a/ggml.c b/ggml.c index 33459f263..8c01dcabf 100644 --- a/ggml.c +++ b/ggml.c @@ -18297,6 +18297,75 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// +struct gguf_string { + uint32_t n; + char * data; +}; + +union gguf_value; + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + bool bool_; + + struct gguf_string str; + + struct { + enum gguf_type type; + + uint32_t n; + union gguf_value * arr; + } arr; +}; + +struct gguf_kv { + struct gguf_string key; + + uint32_t n_bytes; // TODO: is this actually needed? + + enum gguf_type type; + union gguf_value value; +}; + +struct gguf_header { + uint32_t magic; + uint32_t version; + uint32_t n_tensors; + + uint32_t n_kv; + struct gguf_kv * kv; +}; + +struct gguf_tensor_info { + struct gguf_string name; + + uint32_t n_dims; + uint32_t ne[GGML_MAX_DIMS]; + uint32_t n_elements; // TODO: is this needed? + + enum ggml_type type; + + uint64_t offset; // must be a multiple of `ALIGNMENT`. 
+}; + +struct gguf_context { + struct gguf_header header; + struct gguf_tensor_info * infos; + + size_t alignment; + + uint8_t * padding; + uint8_t * data; +}; + +//////////////////////////////////////////////////////////////////////////////// + int ggml_cpu_has_avx(void) { #if defined(__AVX__) return 1; diff --git a/ggml.h b/ggml.h index 2e700c9a0..51885917f 100644 --- a/ggml.h +++ b/ggml.h @@ -204,6 +204,7 @@ #define GGML_MAX_NAME 48 #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 +#define GGUF_DEFAULT_ALIGNMENT 32 #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 @@ -1617,23 +1618,47 @@ extern "C" { // gguf // - enum gguf_metadata_value_type { - GGUF_METADATA_VALUE_TYPE_UINT8 = 0, - GGUF_METADATA_VALUE_TYPE_INT8 = 1, - GGUF_METADATA_VALUE_TYPE_UINT16 = 2, - GGUF_METADATA_VALUE_TYPE_INT16 = 3, - GGUF_METADATA_VALUE_TYPE_UINT32 = 4, - GGUF_METADATA_VALUE_TYPE_INT32 = 5, - GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6, - GGUF_METADATA_VALUE_TYPE_BOOL = 7, - GGUF_METADATA_VALUE_TYPE_STRING = 8, - GGUF_METADATA_VALUE_TYPE_ARRAY = 9, + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, }; - struct gguf_string { - uint32_t n; - char * data; - }; + struct gguf_context; + + GGML_API struct gguf_context * gguf_gguf_init(const char * path); + GGML_API void gguf_gguf_free(struct gguf_context * ctx); + + GGML_API int gguf_get_version (struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(struct gguf_context * ctx); + GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i); + GGML_API enum gguf_type gguf_get_type(struct gguf_context * ctx, int i); + GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val); + + GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i); + GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i); + GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i); + GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i); + GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i); + GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i); + GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i); + GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i); + // TODO: arr + + GGML_API int gguf_get_n_tensors (struct gguf_context * ctx); + GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i); + // // system info // From 8d6acfec127bcf769ee015cb005ad12c02631cfb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 14:33:53 +0300 Subject: [PATCH 05/26] gguf : read header + meta data --- ggml.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- ggml.h | 13 ++++--- 2 files changed, 120 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index 8c01dcabf..0c4651802 100644 --- a/ggml.c +++ b/ggml.c @@ -18297,7 +18297,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// -struct gguf_string { +struct gguf_str { uint32_t n; char * data; }; @@ -18314,7 +18314,7 @@ union gguf_value { 
float float32; bool bool_; - struct gguf_string str; + struct gguf_str str; struct { enum gguf_type type; @@ -18325,7 +18325,7 @@ union gguf_value { }; struct gguf_kv { - struct gguf_string key; + struct gguf_str key; uint32_t n_bytes; // TODO: is this actually needed? @@ -18337,13 +18337,13 @@ struct gguf_header { uint32_t magic; uint32_t version; uint32_t n_tensors; - uint32_t n_kv; + struct gguf_kv * kv; }; struct gguf_tensor_info { - struct gguf_string name; + struct gguf_str name; uint32_t n_dims; uint32_t ne[GGML_MAX_DIMS]; @@ -18364,6 +18364,114 @@ struct gguf_context { uint8_t * data; }; +static bool gguf_fread_el(void * dst, size_t size, FILE * file, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; +} + +static bool gguf_fread_str(void * dst, FILE * file, size_t * offset) { + struct gguf_str * p = (struct gguf_str *) dst; + + p->n = 0; + p->data = NULL; + + bool ok = true; + + ok = ok && gguf_fread_el(&p->n, sizeof(p->n), file, offset); + ok = ok && gguf_fread_el(&p->data, p->n, file, offset); + + return ok; +} + +struct gguf_context * gguf_init(const char * path, bool load) { + FILE * file = fopen(path, "rb"); + if (!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + // check the magic before making allocations + uint32_t magic = 0; + gguf_fread_el(&magic, sizeof(magic), file, &offset); + if (magic != GGUF_MAGIC) { + fprintf(stderr, "gguf: invalid magic number %08x\n", magic); + fclose(file); + return NULL; + } + + bool ok = true; + + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + ctx->header.magic = magic; + + ok = ok && gguf_fread_el(&ctx->header.version, sizeof(ctx->header.version), file, &offset); + ok = ok && gguf_fread_el(&ctx->header.n_tensors, sizeof(ctx->header.n_tensors), file, &offset); + ok = ok && gguf_fread_el(&ctx->header.n_kv, sizeof(ctx->header.n_kv), file, &offset); + + if (!ok) { + fprintf(stderr, "gguf: failed to read header\n"); + fclose(file); + gguf_free(ctx); + return NULL; + } + + ctx->header.kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->header.kv[i]; + + ok = ok && gguf_fread_str(&kv->key, file, &offset); + //ok = ok && gguf_fread_el (&kv->n_bytes, sizeof(kv->n_bytes), file, &offset); + ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset); + + switch (kv->type) { + case GGUF_TYPE_UINT8: + ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break; + case GGUF_TYPE_INT8: + ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break; + case GGUF_TYPE_UINT16: + ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break; + case GGUF_TYPE_INT16: + ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break; + case GGUF_TYPE_UINT32: + ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break; + case GGUF_TYPE_INT32: + ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break; + case GGUF_TYPE_FLOAT32: + ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break; + case GGUF_TYPE_BOOL: + ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; + case GGUF_TYPE_STRING: + ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; + case GGUF_TYPE_ARRAY: + 
GGML_ASSERT("gguf: array type not implemented"); + break; + }; + } + + if (!ok) { + fprintf(stderr, "gguf: failed to read key-value pairs\n"); + free(ctx->header.kv); + fclose(file); + gguf_free(ctx); + return NULL; + } + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + + + + return ctx; +} + +void gguf_free(struct gguf_context * ctx) { + GGML_ALIGNED_FREE(ctx); +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index 51885917f..1983bcd3e 100644 --- a/ggml.h +++ b/ggml.h @@ -190,9 +190,6 @@ #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 -#define GGUF_FILE_MAGIC 0x47475546 // "GGUF" -#define GGUF_FILE_VERSION 1 - #define GGML_QNT_VERSION 2 // bump this on quantization format changes #define GGML_QNT_VERSION_FACTOR 1000 // do not change this @@ -204,11 +201,15 @@ #define GGML_MAX_NAME 48 #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 -#define GGUF_DEFAULT_ALIGNMENT 32 #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 +#define GGUF_MAGIC 0x47475546 // "GGUF" +#define GGUF_VERSION 1 + +#define GGUF_DEFAULT_ALIGNMENT 32 + #define GGML_UNUSED(x) (void)(x) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) @@ -1633,8 +1634,8 @@ extern "C" { struct gguf_context; - GGML_API struct gguf_context * gguf_gguf_init(const char * path); - GGML_API void gguf_gguf_free(struct gguf_context * ctx); + GGML_API struct gguf_context * gguf_init(const char * path, bool load); + GGML_API void gguf_free(struct gguf_context * ctx); GGML_API int gguf_get_version (struct gguf_context * ctx); GGML_API size_t gguf_get_alignment (struct gguf_context * ctx); From d91b985d2db894d9194506e0fdc2d50c31371777 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 14:58:35 +0300 Subject: [PATCH 06/26] gguf : read tensor info --- ggml.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 0c4651802..4c30725fe 100644 --- a/ggml.c +++ b/ggml.c @@ -18347,7 +18347,7 @@ struct gguf_tensor_info { uint32_t n_dims; uint32_t ne[GGML_MAX_DIMS]; - uint32_t n_elements; // TODO: is this needed? + uint32_t n_elms; // TODO: is this needed? 
enum ggml_type type; @@ -18359,8 +18359,9 @@ struct gguf_context { struct gguf_tensor_info * infos; size_t alignment; + size_t offset; - uint8_t * padding; + //uint8_t * padding; uint8_t * data; }; @@ -18461,9 +18462,55 @@ struct gguf_context * gguf_init(const char * path, bool load) { return NULL; } + ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + memset(info->ne, 0, sizeof(info->ne)); + + ok = ok && gguf_fread_str(&info->name, file, &offset); + ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + ok = ok && gguf_fread_el (&info->ne[j], sizeof(info->ne[j]), file, &offset); + } + //ok = ok && gguf_fread_el (&info->n_elms, sizeof(info->n_elms), file, &offset); + ok = ok && gguf_fread_el (&info->type, sizeof(info->type), file, &offset); + ok = ok && gguf_fread_el (&info->offset, sizeof(info->offset), file, &offset); + + if (!ok) { + fprintf(stderr, "gguf: failed to read tensor info\n"); + free(ctx->header.kv); + free(ctx->infos); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + // TODO: determine new alignment from kv if available + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + ctx->offset = offset; + + if (load) { + GGML_ASSERT("gguf: load not implemented"); + // - compute total tensor size + // - allocate buffer + // - read tensor data into buffer + // - add gguf_get_tensor_data() API + // - maybe create a ggml_context and return it + } return ctx; } From 78b226a9597c662a81b5ba986f64fb42b8de40eb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 16:32:05 +0300 Subject: [PATCH 07/26] gguf : initial model loading - not tested --- ggml.c | 327 ++++++++++++++++++++++++++++++++++++++++++++++++++------- ggml.h | 13 ++- 2 files changed, 301 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 4c30725fe..f252363d9 100644 --- a/ggml.c +++ b/ggml.c @@ -18351,7 +18351,7 @@ struct gguf_tensor_info { enum ggml_type type; - uint64_t offset; // must be a multiple of `ALIGNMENT`. + uint64_t offset; // offset from beginning of file, must be a multiple of `ALIGNMENT` }; struct gguf_context { @@ -18359,7 +18359,8 @@ struct gguf_context { struct gguf_tensor_info * infos; size_t alignment; - size_t offset; + size_t offset; // offset of `data` from beginning of file + size_t size_data; // size of `data` in bytes //uint8_t * padding; uint8_t * data; @@ -18379,14 +18380,15 @@ static bool gguf_fread_str(void * dst, FILE * file, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(&p->n, sizeof(p->n), file, offset); + // TODO: how to avoid mallocs for strings? 
+ ok = ok && gguf_fread_el(&p->n, sizeof(p->n), file, offset); p->data = calloc(p->n + 1, 1); ok = ok && gguf_fread_el(&p->data, p->n, file, offset); return ok; } -struct gguf_context * gguf_init(const char * path, bool load) { - FILE * file = fopen(path, "rb"); +struct gguf_context * gguf_init(const char * fname, struct gguf_init_params params) { + FILE * file = fopen(fname, "rb"); if (!file) { return NULL; } @@ -18398,7 +18400,7 @@ struct gguf_context * gguf_init(const char * path, bool load) { uint32_t magic = 0; gguf_fread_el(&magic, sizeof(magic), file, &offset); if (magic != GGUF_MAGIC) { - fprintf(stderr, "gguf: invalid magic number %08x\n", magic); + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); fclose(file); return NULL; } @@ -18408,13 +18410,17 @@ struct gguf_context * gguf_init(const char * path, bool load) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); ctx->header.magic = magic; + ctx->header.kv = NULL; + + ctx->infos = NULL; + ctx->data = NULL; ok = ok && gguf_fread_el(&ctx->header.version, sizeof(ctx->header.version), file, &offset); ok = ok && gguf_fread_el(&ctx->header.n_tensors, sizeof(ctx->header.n_tensors), file, &offset); ok = ok && gguf_fread_el(&ctx->header.n_kv, sizeof(ctx->header.n_kv), file, &offset); if (!ok) { - fprintf(stderr, "gguf: failed to read header\n"); + fprintf(stderr, "%s: failed to read header\n", __func__); fclose(file); gguf_free(ctx); return NULL; @@ -18430,24 +18436,15 @@ struct gguf_context * gguf_init(const char * path, bool load) { ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset); switch (kv->type) { - case GGUF_TYPE_UINT8: - ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break; - case GGUF_TYPE_INT8: - ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break; - case GGUF_TYPE_UINT16: - ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break; - case GGUF_TYPE_INT16: - ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break; - case GGUF_TYPE_UINT32: - ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break; - case GGUF_TYPE_INT32: - ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break; - case GGUF_TYPE_FLOAT32: - ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break; - case GGUF_TYPE_BOOL: - ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; - case GGUF_TYPE_STRING: - ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break; + case GGUF_TYPE_BOOL: ok = ok && 
gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; case GGUF_TYPE_ARRAY: GGML_ASSERT("gguf: array type not implemented"); break; @@ -18455,8 +18452,7 @@ struct gguf_context * gguf_init(const char * path, bool load) { } if (!ok) { - fprintf(stderr, "gguf: failed to read key-value pairs\n"); - free(ctx->header.kv); + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); fclose(file); gguf_free(ctx); return NULL; @@ -18467,7 +18463,7 @@ struct gguf_context * gguf_init(const char * path, bool load) { for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; - memset(info->ne, 0, sizeof(info->ne)); + memset(info->ne, 1, sizeof(info->ne)); ok = ok && gguf_fread_str(&info->name, file, &offset); ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset); @@ -18479,9 +18475,7 @@ struct gguf_context * gguf_init(const char * path, bool load) { ok = ok && gguf_fread_el (&info->offset, sizeof(info->offset), file, &offset); if (!ok) { - fprintf(stderr, "gguf: failed to read tensor info\n"); - free(ctx->header.kv); - free(ctx->infos); + fprintf(stderr, "%s: failed to read tensor info\n", __func__); fclose(file); gguf_free(ctx); return NULL; @@ -18503,22 +18497,279 @@ struct gguf_context * gguf_init(const char * path, bool load) { ctx->offset = offset; - if (load) { - GGML_ASSERT("gguf: load not implemented"); - // - compute total tensor size - // - allocate buffer - // - read tensor data into buffer - // - add gguf_get_tensor_data() API - // - maybe create a ggml_context and return it + ctx->size_data = 0; + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + // TODO: pad size_cur to alignment + ctx->size_data += size_cur; + } + + // TODO: simplify + if (params.load) { + if (params.malloc) { + ctx->data = GGML_ALIGNED_MALLOC(ctx->size_data); + fseek(file, ctx->offset, SEEK_SET); + ok = ok && gguf_fread_el(ctx->data, ctx->size_data, file, &offset); + } else if (params.ctx != NULL) { + bool ctx_new = false; + bool ctx_no_alloc = false; + + if (*params.ctx == NULL) { + const size_t mem_size = + ctx->header.n_tensors*ggml_tensor_overhead() + 1 + + ctx->size_data; + + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *params.ctx = ggml_init(pdata); + + ctx_new = true; + } else { + ctx_no_alloc = ggml_get_no_alloc(*params.ctx); + ggml_set_no_alloc(*params.ctx, false); + } + + struct ggml_context * ctx_data = *params.ctx; + + struct ggml_tensor * data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size_data); + + // read the tensor data + ok = ok && gguf_fread_el(data->data, ctx->size_data, file, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + if (ctx_new) { + ggml_free(ctx_data); + } else { + ggml_set_no_alloc(ctx_data, ctx_no_alloc); + } + gguf_free(ctx); + 
return NULL; + } + + ctx->data = data->data; + + // create the tensors + ggml_set_no_alloc(ctx_data, true); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + if (!ok) { + break; + } + + cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; + } + + if (!ok) { + fprintf(stderr, "%s: failed to create tensors\n", __func__); + fclose(file); + if (ctx_new) { + ggml_free(ctx_data); + } else { + ggml_set_no_alloc(ctx_data, ctx_no_alloc); + } + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, ctx_no_alloc); + } else { + GGML_ASSERT("gguf: invalid params - load requires malloc or ctx"); + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; } return ctx; } void gguf_free(struct gguf_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->header.kv) { + // free string memory - not great.. + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->header.kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + } + + GGML_ALIGNED_FREE(ctx->header.kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + GGML_ALIGNED_FREE(ctx->infos); + } + GGML_ALIGNED_FREE(ctx); } +int gguf_get_version(struct gguf_context * ctx) { + return ctx->header.version; +} + +size_t gguf_get_alignment(struct gguf_context * ctx) { + return ctx->alignment; +} + +size_t gguf_get_data_offset(struct gguf_context * ctx) { + return ctx->offset; +} + +void * gguf_get_data(struct gguf_context * ctx) { + return ctx->data; +} + +int gguf_get_n_kv(struct gguf_context * ctx) { + return ctx->header.n_kv; +} + +const char * gguf_get_key(struct gguf_context * ctx, int i) { + return ctx->header.kv[i].key.data; +} + +enum gguf_type gguf_get_type(struct gguf_context * ctx, int i) { + return ctx->header.kv[i].type; +} + +void gguf_get_val(struct gguf_context * ctx, int i, void * val) { + struct gguf_kv * kv = &ctx->header.kv[i]; + + switch (kv->type) { + case GGUF_TYPE_UINT8: memcpy(val, &kv->value.uint8, sizeof(uint8_t)); break; + case GGUF_TYPE_INT8: memcpy(val, &kv->value.int8, sizeof(int8_t)); break; + case GGUF_TYPE_UINT16: memcpy(val, &kv->value.uint16, sizeof(uint16_t)); break; + case GGUF_TYPE_INT16: memcpy(val, &kv->value.int16, sizeof(int16_t)); break; + case GGUF_TYPE_UINT32: memcpy(val, &kv->value.uint32, sizeof(uint32_t)); break; + case GGUF_TYPE_INT32: memcpy(val, &kv->value.int32, sizeof(int32_t)); break; + case GGUF_TYPE_FLOAT32: memcpy(val, &kv->value.float32, sizeof(float)); break; + case GGUF_TYPE_BOOL: memcpy(val, &kv->value.bool_, sizeof(bool)); break; + case GGUF_TYPE_STRING: memcpy(val, &kv->value.str.data, sizeof(char *)); break; + default: + GGML_ASSERT("gguf: not implemented"); + break; + } +} + +uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) { + uint8_t val; + gguf_get_val(ctx, i, &val); + return val; +} + +int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) { + int8_t val; + gguf_get_val(ctx, i, &val); + return val; +} + +uint16_t 
gguf_get_val_u16(struct gguf_context * ctx, int i) { + uint16_t val; + gguf_get_val(ctx, i, &val); + return val; +} + +int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) { + int16_t val; + gguf_get_val(ctx, i, &val); + return val; +} + +uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) { + uint32_t val; + gguf_get_val(ctx, i, &val); + return val; +} + +int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) { + int32_t val; + gguf_get_val(ctx, i, &val); + return val; +} + +float gguf_get_val_f32(struct gguf_context * ctx, int i) { + float val; + gguf_get_val(ctx, i, &val); + return val; +} + +bool gguf_get_val_bool(struct gguf_context * ctx, int i) { + bool val; + gguf_get_val(ctx, i, &val); + return val; +} + +const char * gguf_get_val_str (struct gguf_context * ctx, int i) { + char * val; + gguf_get_val(ctx, i, &val); + return val; +} + +int gguf_get_n_tensors(struct gguf_context * ctx) { + return ctx->header.n_tensors; +} + +size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) { + return ctx->infos[i].offset; +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index 1983bcd3e..fac0f5e68 100644 --- a/ggml.h +++ b/ggml.h @@ -1634,12 +1634,23 @@ extern "C" { struct gguf_context; - GGML_API struct gguf_context * gguf_init(const char * path, bool load); + struct gguf_init_params { + bool load; // load the tensor data + bool malloc; // if false, use the provided ggml_context to allocate the tensor data + // it no ggml_context is provided, it will be created + // if true, use malloc to allocate the tensor data + + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); GGML_API int gguf_get_version (struct gguf_context * ctx); GGML_API size_t gguf_get_alignment (struct gguf_context * ctx); GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx); + GGML_API void * gguf_get_data (struct gguf_context * ctx); GGML_API int gguf_get_n_kv(struct gguf_context * ctx); GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i); From 860c9c63ce204dea31fe994ef370c569f7a75596 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 16:36:03 +0300 Subject: [PATCH 08/26] gguf : add gguf_get_tensor_name() --- ggml.c | 4 ++++ ggml.h | 1 + 2 files changed, 5 insertions(+) diff --git a/ggml.c b/ggml.c index f252363d9..030475062 100644 --- a/ggml.c +++ b/ggml.c @@ -18770,6 +18770,10 @@ size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) { return ctx->infos[i].offset; } +char * gguf_get_tensor_name(struct gguf_context * ctx, int i) { + return ctx->infos[i].name.data; +} + //////////////////////////////////////////////////////////////////////////////// int ggml_cpu_has_avx(void) { diff --git a/ggml.h b/ggml.h index fac0f5e68..1a748d8d8 100644 --- a/ggml.h +++ b/ggml.h @@ -1670,6 +1670,7 @@ extern "C" { GGML_API int gguf_get_n_tensors (struct gguf_context * ctx); GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i); // // system info From cb871fa022aa7a8b72c3a616f7ac7e8e9f1748d9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 18:48:52 +0300 Subject: [PATCH 09/26] gguf : do not support passing existing ggml_context to gguf_init --- ggml.c | 
51 ++++++++++++++++----------------------------------- ggml.h | 5 ++--- 2 files changed, 18 insertions(+), 38 deletions(-) diff --git a/ggml.c b/ggml.c index 030475062..1b14c3790 100644 --- a/ggml.c +++ b/ggml.c @@ -18388,6 +18388,8 @@ static bool gguf_fread_str(void * dst, FILE * file, size_t * offset) { } struct gguf_context * gguf_init(const char * fname, struct gguf_init_params params) { + GGML_ASSERT(!params.load || params.malloc || params.ctx != NULL); + FILE * file = fopen(fname, "rb"); if (!file) { return NULL; @@ -18518,8 +18520,7 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); - // TODO: pad size_cur to alignment - ctx->size_data += size_cur; + ctx->size_data += GGML_PAD(size_cur, ctx->alignment); } // TODO: simplify @@ -18528,28 +18529,18 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para ctx->data = GGML_ALIGNED_MALLOC(ctx->size_data); fseek(file, ctx->offset, SEEK_SET); ok = ok && gguf_fread_el(ctx->data, ctx->size_data, file, &offset); - } else if (params.ctx != NULL) { - bool ctx_new = false; - bool ctx_no_alloc = false; + } else { + const size_t mem_size = + ctx->header.n_tensors*ggml_tensor_overhead() + 1 + + ctx->size_data; - if (*params.ctx == NULL) { - const size_t mem_size = - ctx->header.n_tensors*ggml_tensor_overhead() + 1 + - ctx->size_data; + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = false, + }; - struct ggml_init_params pdata = { - .mem_size = mem_size, - .mem_buffer = NULL, - .no_alloc = false, - }; - - *params.ctx = ggml_init(pdata); - - ctx_new = true; - } else { - ctx_no_alloc = ggml_get_no_alloc(*params.ctx); - ggml_set_no_alloc(*params.ctx, false); - } + *params.ctx = ggml_init(pdata); struct ggml_context * ctx_data = *params.ctx; @@ -18561,11 +18552,7 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para if (!ok) { fprintf(stderr, "%s: failed to read tensor data\n", __func__); fclose(file); - if (ctx_new) { - ggml_free(ctx_data); - } else { - ggml_set_no_alloc(ctx_data, ctx_no_alloc); - } + ggml_free(ctx_data); gguf_free(ctx); return NULL; } @@ -18597,18 +18584,12 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para if (!ok) { fprintf(stderr, "%s: failed to create tensors\n", __func__); fclose(file); - if (ctx_new) { - ggml_free(ctx_data); - } else { - ggml_set_no_alloc(ctx_data, ctx_no_alloc); - } + ggml_free(ctx_data); gguf_free(ctx); return NULL; } - ggml_set_no_alloc(ctx_data, ctx_no_alloc); - } else { - GGML_ASSERT("gguf: invalid params - load requires malloc or ctx"); + ggml_set_no_alloc(ctx_data, false); } } diff --git a/ggml.h b/ggml.h index 1a748d8d8..7d5514ba3 100644 --- a/ggml.h +++ b/ggml.h @@ -1636,9 +1636,8 @@ extern "C" { struct gguf_init_params { bool load; // load the tensor data - bool malloc; // if false, use the provided ggml_context to allocate the tensor data - // it no ggml_context is provided, it will be created - // if true, use malloc to allocate the tensor data + bool malloc; // if false, create a ggml_context and allocate the tensor data in it + // if true, use malloc to allocate the tensor data instead struct ggml_context ** ctx; }; From d313c0fa33dc284e23a88fae90b1f94cb0ff6f5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 18:53:57 +0300 Subject: [PATCH 10/26] gguf : simplify gguf_get_val --- ggml.c | 68 
+++++++++++++++++++--------------------------------------- ggml.h | 20 +++-------------- 2 files changed, 25 insertions(+), 63 deletions(-) diff --git a/ggml.c b/ggml.c index 1b14c3790..e68e91e18 100644 --- a/ggml.c +++ b/ggml.c @@ -18297,6 +18297,19 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// +enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, +}; + struct gguf_str { uint32_t n; char * data; @@ -18670,77 +18683,40 @@ enum gguf_type gguf_get_type(struct gguf_context * ctx, int i) { return ctx->header.kv[i].type; } -void gguf_get_val(struct gguf_context * ctx, int i, void * val) { - struct gguf_kv * kv = &ctx->header.kv[i]; - - switch (kv->type) { - case GGUF_TYPE_UINT8: memcpy(val, &kv->value.uint8, sizeof(uint8_t)); break; - case GGUF_TYPE_INT8: memcpy(val, &kv->value.int8, sizeof(int8_t)); break; - case GGUF_TYPE_UINT16: memcpy(val, &kv->value.uint16, sizeof(uint16_t)); break; - case GGUF_TYPE_INT16: memcpy(val, &kv->value.int16, sizeof(int16_t)); break; - case GGUF_TYPE_UINT32: memcpy(val, &kv->value.uint32, sizeof(uint32_t)); break; - case GGUF_TYPE_INT32: memcpy(val, &kv->value.int32, sizeof(int32_t)); break; - case GGUF_TYPE_FLOAT32: memcpy(val, &kv->value.float32, sizeof(float)); break; - case GGUF_TYPE_BOOL: memcpy(val, &kv->value.bool_, sizeof(bool)); break; - case GGUF_TYPE_STRING: memcpy(val, &kv->value.str.data, sizeof(char *)); break; - default: - GGML_ASSERT("gguf: not implemented"); - break; - } -} - uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) { - uint8_t val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.uint8; } int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) { - int8_t val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.int8; } uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) { - uint16_t val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.uint16; } int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) { - int16_t val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.int16; } uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) { - uint32_t val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.uint32; } int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) { - int32_t val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.int32; } float gguf_get_val_f32(struct gguf_context * ctx, int i) { - float val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.float32; } bool gguf_get_val_bool(struct gguf_context * ctx, int i) { - bool val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.bool_; } const char * gguf_get_val_str (struct gguf_context * ctx, int i) { - char * val; - gguf_get_val(ctx, i, &val); - return val; + return ctx->header.kv[i].value.str.data; } int gguf_get_n_tensors(struct gguf_context * ctx) { diff --git a/ggml.h b/ggml.h index 7d5514ba3..75a41a28f 100644 --- a/ggml.h +++ b/ggml.h @@ -1619,19 +1619,6 @@ extern "C" { // gguf // - enum gguf_type { - GGUF_TYPE_UINT8 = 0, - GGUF_TYPE_INT8 = 1, - GGUF_TYPE_UINT16 = 2, - GGUF_TYPE_INT16 = 3, - GGUF_TYPE_UINT32 = 4, - 
GGUF_TYPE_INT32 = 5, - GGUF_TYPE_FLOAT32 = 6, - GGUF_TYPE_BOOL = 7, - GGUF_TYPE_STRING = 8, - GGUF_TYPE_ARRAY = 9, - }; - struct gguf_context; struct gguf_init_params { @@ -1651,10 +1638,9 @@ extern "C" { GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx); GGML_API void * gguf_get_data (struct gguf_context * ctx); - GGML_API int gguf_get_n_kv(struct gguf_context * ctx); - GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i); - GGML_API enum gguf_type gguf_get_type(struct gguf_context * ctx, int i); - GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val); + GGML_API int gguf_get_n_kv(struct gguf_context * ctx); + GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i); + GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val); GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i); GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i); From e46870f5af3432212982645fcd1cc59b8e106734 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 18:55:32 +0300 Subject: [PATCH 11/26] gguf : gguf.c is now part of ggml.c --- gguf.c | 192 --------------------------------------------------------- 1 file changed, 192 deletions(-) delete mode 100644 gguf.c diff --git a/gguf.c b/gguf.c deleted file mode 100644 index 54b31d411..000000000 --- a/gguf.c +++ /dev/null @@ -1,192 +0,0 @@ -// TODO: convert to proper gguf.h gguf.c structure, now I'm trying to be fast as much as possible, -// and everything is in this file for quick debugging. - -#include -#include -#include -#include - - -enum ggml_type { - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, - // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 (5) support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - // k-quantizations - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, - GGML_TYPE_I8, - GGML_TYPE_I16, - GGML_TYPE_I32, - GGML_TYPE_COUNT, -}; - -enum gguf_metadata_value_type { - GGUF_METADATA_VALUE_TYPE_UINT8 = 0, - GGUF_METADATA_VALUE_TYPE_INT8 = 1, - GGUF_METADATA_VALUE_TYPE_UINT16 = 2, - GGUF_METADATA_VALUE_TYPE_INT16 = 3, - GGUF_METADATA_VALUE_TYPE_UINT32 = 4, - GGUF_METADATA_VALUE_TYPE_INT32 = 5, - GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6, - GGUF_METADATA_VALUE_TYPE_BOOL = 7, - GGUF_METADATA_VALUE_TYPE_STRING = 8, - GGUF_METADATA_VALUE_TYPE_ARRAY = 9, -}; - -struct gguf_string_t { - uint32_t len; - char * string; -}; - -union gguf_metadata_value_t; - -// Union definition for gguf_metadata_value_t -union gguf_metadata_value_t { - uint8_t uint8; - int8_t int8; - uint16_t uint16; - int16_t int16; - uint32_t uint32; - int32_t int32; - float float32; - bool bool_; - struct gguf_string_t string; - struct { - uint32_t len; - enum gguf_metadata_value_type type; - union gguf_metadata_value_t * array; - } array; -}; - - -struct gguf_metadata_kv_t { - struct gguf_string_t key; - uint32_t value_len; - enum gguf_metadata_value_type value_type; - union gguf_metadata_value_t* value; -}; - -struct gguf_header_t { - uint32_t magic; - uint32_t version; - uint32_t tensor_count; - uint32_t metadata_kv_count; - struct gguf_metadata_kv_t * metadata_kv; -}; - -struct gguf_tensor_info_t { - struct gguf_string_t name; - uint32_t n_dimensions; - uint32_t dimensions[]; -}; - -struct gguf_file_t { - struct gguf_header_t header; - uint8_t tensor_data[]; -}; - -void 
read_gguf_file(const char * file_path, struct gguf_file_t * gguf_file) { - FILE* file = fopen(file_path, "rb"); - if (file == NULL) { - printf("Error opening the file.\n"); - return; - } - - fread(&gguf_file->header.magic, sizeof(uint32_t), 1, file); - - // Verify magic and version - if (gguf_file->header.magic != 0x47475546) { - printf("Invalid magic number. Not a valid GGUF file.\n"); - fclose(file); - return; - } - - fread(&gguf_file->header.version, sizeof(uint32_t), 1, file); - - if (gguf_file->header.version != 1) { - printf("Unsupported version. Expected version 1.\n"); - fclose(file); - return; - } - - fread(&gguf_file->header.tensor_count, sizeof(uint32_t), 1, file); - fread(&gguf_file->header.metadata_kv_count, sizeof(uint32_t), 1, file); - - printf("Magic: %x\n", gguf_file->header.magic); - printf("Version: %d\n", gguf_file->header.version); - printf("Tensor Count: %d\n", gguf_file->header.tensor_count); - printf("Metadata Key-Value Count: %d\n", gguf_file->header.metadata_kv_count); - - gguf_file->header.metadata_kv = (struct gguf_metadata_kv_t*)malloc(gguf_file->header.metadata_kv_count * sizeof(struct gguf_metadata_kv_t)); - - for (int i = 0; i < gguf_file->header.metadata_kv_count; i++) { - struct gguf_metadata_kv_t* kv = &gguf_file->header.metadata_kv[i]; - fread(&kv->key.len, sizeof(uint32_t), 1, file); - kv->key.string = (char*)malloc(kv->key.len ); // Allocate memory for the key string - fread(kv->key.string, sizeof(char), kv->key.len, file); - //kv->key.string[kv->key.len] = '\0'; // Null-terminate the key string - - fread(&kv->value_type, sizeof(uint32_t), 1, file); - - printf("Metadata Value Type: %d\n", kv->value_type); - printf("Metadata Key: %s\n", kv->key.string); - - // Read metadata value according to its type using reinterpret_cast - switch (kv->value_type) { - case GGUF_METADATA_VALUE_TYPE_UINT32: - kv->value = (uint32_t *) malloc(sizeof(uint32_t)); - fread(kv->value, sizeof(uint32_t), 1, file); - printf("value: %d\n", kv->value->uint32); - break; - case GGUF_METADATA_VALUE_TYPE_FLOAT32: - kv->value = (float *)malloc(sizeof(float)); - fread(kv->value, sizeof(float), 1, file); - printf("value: %f\n", (float)kv->value->float32); - break; - case GGUF_METADATA_VALUE_TYPE_STRING: - fread(&kv->value_len, sizeof(uint32_t), 1, file); - printf("value len: %d\n", kv->value_len); -kv->value = (char *)malloc(sizeof(char) * kv->value_len); // Allocate memory for the value string -fread(kv->value, sizeof(char), kv->value_len, file); - printf("value: %s\n", (char *)kv->value); - break; - // ... 
(handle other types in a similar manner) - default: - printf("Unsupported metadata value type.\n"); - fclose(file); - return; - } - } - - // TODO: handle reading tensor data - - fclose(file); -} - -void gguf_free(struct gguf_file_t * gguf_file) { - // Free allocated memory for key strings avd values - for (int i = 0; i < gguf_file->header.metadata_kv_count; i++) { - free(gguf_file->header.metadata_kv[i].key.string); - free(gguf_file->header.metadata_kv[i].value); - } - free(gguf_file->header.metadata_kv); -} - -int main() { - const char* file_path = "example.gguf"; - struct gguf_file_t gguf_file; - read_gguf_file(file_path, &gguf_file); - gguf_free(&gguf_file); - return 0; -} From 5628ec71636e0b390213caa4c273d3ef8bbd7459 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 20:04:22 +0300 Subject: [PATCH 12/26] gguf : read / write sample models --- examples/gguf/gguf.cpp | 323 ++++++++++++++++++++++++++++++++++++++++- ggml.c | 119 ++++++++------- ggml.h | 5 +- 3 files changed, 386 insertions(+), 61 deletions(-) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 602de519a..a5c442ac5 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,15 +1,326 @@ #include "ggml.h" #include +#include #include +#include +#include +#include -bool gguf_write(const std::string & fname) { +enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, +}; +template +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} + +void gguf_ex_write_str(std::ofstream & fout, const std::string & val) { + const int32_t n = val.size(); + fout.write((const char *) &n, sizeof(n)); + fout.write(val.c_str(), n); +} + +void gguf_ex_write_i32(std::ofstream & fout, int32_t val) { + fout.write((const char *) &val, sizeof(val)); +} + +void gguf_ex_write_u64(std::ofstream & fout, size_t val) { + fout.write((const char *) &val, sizeof(val)); +} + +template +void gguf_ex_write_param(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) { + gguf_ex_write_str(fout, key); + fout.write((const char *) &type, sizeof(type)); + fout.write((const char *) &val, sizeof(val)); + + fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), to_string(val).c_str()); +} + +template<> +void gguf_ex_write_param(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) { + gguf_ex_write_str(fout, key); + fout.write((const char *) &type, sizeof(type)); + + const int32_t n = val.size(); + fout.write((const char *) &n, sizeof(n)); + fout.write(val.c_str(), n); +} + +bool gguf_ex_write(const std::string & fname) { + std::ofstream fout(fname.c_str(), std::ios::binary); + + { + const int32_t magic = GGUF_MAGIC; + fout.write((const char *) &magic, sizeof(magic)); + } + + { + const int32_t version = GGUF_VERSION; + fout.write((const char *) &version, sizeof(version)); + } + + const int n_tensors = 10; + const int n_kv = 9; + + fout.write((const char*) &n_tensors, sizeof(n_tensors)); + fout.write((const char*) &n_kv, sizeof(n_kv)); + + fprintf(stdout, "%s: write header\n", __func__); + + // kv data + { + gguf_ex_write_param< uint8_t>(fout, "some.parameter.uint8", GGUF_TYPE_UINT8, 0x12); + gguf_ex_write_param< int8_t>(fout, "some.parameter.int8", GGUF_TYPE_INT8, -0x13); + gguf_ex_write_param(fout, 
"some.parameter.uint16", GGUF_TYPE_UINT16, 0x1234); + gguf_ex_write_param< int16_t>(fout, "some.parameter.int16", GGUF_TYPE_INT16, -0x1235); + gguf_ex_write_param(fout, "some.parameter.uint32", GGUF_TYPE_UINT32, 0x12345678); + gguf_ex_write_param< int32_t>(fout, "some.parameter.int32", GGUF_TYPE_INT32, -0x12345679); + + gguf_ex_write_param (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f); + gguf_ex_write_param (fout, "some.parameter.bool", GGUF_TYPE_BOOL, true); + + gguf_ex_write_param(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world"); + } + + uint64_t offset_tensor = 0; + + struct ggml_init_params params = { + /*.mem_size =*/ 128ull*1024ull*1024ull, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + struct ggml_context * ctx_data = ggml_init(params); + + // tensor infos + for (int i = 0; i < n_tensors; ++i) { + const std::string name = "tensor_" + to_string(i); + + int64_t ne[GGML_MAX_DIMS] = { 1 }; + int32_t n_dims = rand() % GGML_MAX_DIMS + 1; + + for (int j = 0; j < n_dims; ++j) { + ne[j] = rand() % 10 + 1; + } + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne); + ggml_set_name(cur, name.c_str()); + + { + float * data = (float *) cur->data; + for (int j = 0; j < ggml_nelements(cur); ++j) { + data[j] = 100 + i; + } + } + + fprintf(stdout, "%s: tensor: %s, %d dims, ne = [", __func__, name.c_str(), n_dims); + for (int j = 0; j < 4; ++j) { + fprintf(stdout, "%s%3d", j == 0 ? "" : ", ", (int) cur->ne[j]); + } + fprintf(stdout, "], offset_tensor = %6" PRIu64 "\n", offset_tensor); + + gguf_ex_write_str(fout, name); + gguf_ex_write_i32(fout, n_dims); + for (int j = 0; j < n_dims; ++j) { + gguf_ex_write_i32(fout, cur->ne[j]); + } + gguf_ex_write_i32(fout, cur->type); + gguf_ex_write_u64(fout, offset_tensor); + + offset_tensor += GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT); + } + + const uint64_t offset_data = GGML_PAD((uint64_t) fout.tellp(), GGUF_DEFAULT_ALIGNMENT); + + fprintf(stdout, "%s: data offset = %" PRIu64 "\n", __func__, offset_data); + + { + const size_t pad = offset_data - fout.tellp(); + + for (size_t j = 0; j < pad; ++j) { + fout.put(0); + } + } + + for (int i = 0; i < n_tensors; ++i) { + fprintf(stdout, "%s: writing tensor %d data\n", __func__, i); + + const std::string name = "tensor_" + to_string(i); + + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + fout.write((const char *) cur->data, ggml_nbytes(cur)); + + { + const size_t pad = GGML_PAD(ggml_nbytes(cur), GGUF_DEFAULT_ALIGNMENT) - ggml_nbytes(cur); + + for (size_t j = 0; j < pad; ++j) { + fout.put(0); + } + } + } + + fout.close(); + + fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str()); + + ggml_free(ctx_data); return true; } -bool gguf_read(const std::string & fname) { +// just read tensor info +bool gguf_ex_read_0(const std::string & fname) { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + + fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx)); + fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); + + // kv + { + const int n_kv = gguf_get_n_kv(ctx); + + fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ctx, i); + + fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); + } + } + + // tensor 
info + { + const int n_tensors = gguf_get_n_tensors(ctx); + + fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + + fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + } + } + + return true; +} + +// read and create ggml_context containing the tensors and their data +bool gguf_ex_read_1(const std::string & fname) { + struct ggml_context * ctx_data = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + + fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx)); + fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); + + // kv + { + const int n_kv = gguf_get_n_kv(ctx); + + fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv); + + for (int i = 0; i < n_kv; ++i) { + const char * key = gguf_get_key(ctx, i); + + fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key); + } + } + + // tensor info + { + const int n_tensors = gguf_get_n_tensors(ctx); + + fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); + + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx, i); + const size_t offset = gguf_get_tensor_offset(ctx, i); + + fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + } + } + + // data + { + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + fprintf(stdout, "%s: reading tensor %d data\n", __func__, i); + + const std::string name = "tensor_" + to_string(i); + + struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str()); + + fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", + __func__, i, cur->n_dims, cur->name, cur->data); + + // check data + { + const float * data = (const float *) cur->data; + for (int j = 0; j < ggml_nelements(cur); ++j) { + if (data[j] != 100 + i) { + fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + return false; + } + } + } + } + } + + fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data)); + + ggml_free(ctx_data); + gguf_free(ctx); + + return true; +} + +// read just the tensor info and mmap the data in user code +bool gguf_ex_read_2(const std::string & fname) { + struct ggml_context * ctx_data = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_data, + }; + + struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + + // TODO: mmap based on tensor infos + + fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data)); + + ggml_free(ctx_data); + gguf_free(ctx); + return true; } @@ -20,14 +331,16 @@ int main(int argc, char ** argv) { } const std::string fname(argv[1]); - const std::string mode(argv[2]); + const std::string mode (argv[2]); GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w"); if (mode == "w") { - GGML_ASSERT(gguf_write(fname) && "failed to write gguf file"); + GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file"); } else if (mode == "r") { - GGML_ASSERT(gguf_read(fname) && "failed to read gguf file"); + GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file"); + GGML_ASSERT(gguf_ex_read_1(fname) && "failed to 
read gguf file"); + GGML_ASSERT(gguf_ex_read_2(fname) && "failed to read gguf file"); } return 0; diff --git a/ggml.c b/ggml.c index e68e91e18..5736c800e 100644 --- a/ggml.c +++ b/ggml.c @@ -18364,7 +18364,7 @@ struct gguf_tensor_info { enum ggml_type type; - uint64_t offset; // offset from beginning of file, must be a multiple of `ALIGNMENT` + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` }; struct gguf_context { @@ -18385,9 +18385,7 @@ static bool gguf_fread_el(void * dst, size_t size, FILE * file, size_t * offset) return n == size; } -static bool gguf_fread_str(void * dst, FILE * file, size_t * offset) { - struct gguf_str * p = (struct gguf_str *) dst; - +static bool gguf_fread_str(struct gguf_str * p, FILE * file, size_t * offset) { p->n = 0; p->data = NULL; @@ -18395,14 +18393,12 @@ static bool gguf_fread_str(void * dst, FILE * file, size_t * offset) { // TODO: how to avoid mallocs for strings? ok = ok && gguf_fread_el(&p->n, sizeof(p->n), file, offset); p->data = calloc(p->n + 1, 1); - ok = ok && gguf_fread_el(&p->data, p->n, file, offset); + ok = ok && gguf_fread_el( p->data, p->n, file, offset); return ok; } -struct gguf_context * gguf_init(const char * fname, struct gguf_init_params params) { - GGML_ASSERT(!params.load || params.malloc || params.ctx != NULL); - +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = fopen(fname, "rb"); if (!file) { return NULL; @@ -18446,10 +18442,14 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->header.kv[i]; + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + ok = ok && gguf_fread_str(&kv->key, file, &offset); //ok = ok && gguf_fread_el (&kv->n_bytes, sizeof(kv->n_bytes), file, &offset); ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset); + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + switch (kv->type) { case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break; case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break; @@ -18461,9 +18461,13 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; case GGUF_TYPE_ARRAY: - GGML_ASSERT("gguf: array type not implemented"); - break; - }; + GGML_ASSERT("gguf: array type not implemented"); + break; + }; + + if (!ok) { + break; + } } if (!ok) { @@ -18478,12 +18482,14 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; - memset(info->ne, 1, sizeof(info->ne)); + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + info->ne[j] = 1; + } ok = ok && gguf_fread_str(&info->name, file, &offset); ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - ok = ok && gguf_fread_el (&info->ne[j], sizeof(info->ne[j]), file, &offset); + ok = ok && gguf_fread_el(&info->ne[j], sizeof(info->ne[j]), file, &offset); } //ok = ok && gguf_fread_el (&info->n_elms, sizeof(info->n_elms), file, &offset); ok = ok && gguf_fread_el (&info->type, 
sizeof(info->type), file, &offset); @@ -18536,28 +18542,30 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para ctx->size_data += GGML_PAD(size_cur, ctx->alignment); } + // load the tensor data // TODO: simplify - if (params.load) { - if (params.malloc) { - ctx->data = GGML_ALIGNED_MALLOC(ctx->size_data); - fseek(file, ctx->offset, SEEK_SET); - ok = ok && gguf_fread_el(ctx->data, ctx->size_data, file, &offset); - } else { - const size_t mem_size = - ctx->header.n_tensors*ggml_tensor_overhead() + 1 + - ctx->size_data; + if (params.ctx != NULL) { + const size_t mem_size = + params.no_alloc ? + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size_data; - struct ggml_init_params pdata = { - .mem_size = mem_size, - .mem_buffer = NULL, - .no_alloc = false, - }; + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; - *params.ctx = ggml_init(pdata); + *params.ctx = ggml_init(pdata); - struct ggml_context * ctx_data = *params.ctx; + struct ggml_context * ctx_data = *params.ctx; - struct ggml_tensor * data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size_data); + struct ggml_tensor * data = NULL; + + if (params.no_alloc == false) { + data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size_data); + + ok = ok && data != NULL; // read the tensor data ok = ok && gguf_fread_el(data->data, ctx->size_data, file, &offset); @@ -18571,39 +18579,44 @@ struct gguf_context * gguf_init(const char * fname, struct gguf_init_params para } ctx->data = data->data; + } - // create the tensors - ggml_set_no_alloc(ctx_data, true); + ggml_set_no_alloc(ctx_data, true); - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { - const int64_t ne[GGML_MAX_DIMS] = { - ctx->infos[i].ne[0], - ctx->infos[i].ne[1], - ctx->infos[i].ne[2], - ctx->infos[i].ne[3], - }; + // create the tensors + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; - struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); - ok = ok && cur != NULL; + ok = ok && cur != NULL; - if (!ok) { - break; - } - - cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; - } + ggml_set_name(cur, ctx->infos[i].name.data); if (!ok) { - fprintf(stderr, "%s: failed to create tensors\n", __func__); - fclose(file); - ggml_free(ctx_data); - gguf_free(ctx); - return NULL; + break; } - ggml_set_no_alloc(ctx_data, false); + if (params.no_alloc == false) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } } + + if (!ok) { + fprintf(stderr, "%s: failed to create tensors\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, params.no_alloc); } if (!ok) { diff --git a/ggml.h b/ggml.h index 75a41a28f..e0abbbfdd 100644 --- a/ggml.h +++ b/ggml.h @@ -1622,10 +1622,9 @@ extern "C" { struct gguf_context; struct gguf_init_params { - bool load; // load the tensor data - bool malloc; // if false, create a ggml_context and allocate the tensor data in it - // if true, use malloc to allocate the tensor data instead + bool no_alloc; + 
// if not NULL, create a ggml_context and allocate the tensor data in it struct ggml_context ** ctx; }; From d8491fc7e3545a4447555cb9e4487994bc98c024 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jul 2023 22:56:26 +0300 Subject: [PATCH 13/26] gguf : add comments --- ggml.c | 234 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 125 insertions(+), 109 deletions(-) diff --git a/ggml.c b/ggml.c index 5736c800e..b005fd889 100644 --- a/ggml.c +++ b/ggml.c @@ -18407,106 +18407,120 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // offset from start of file size_t offset = 0; - // check the magic before making allocations uint32_t magic = 0; - gguf_fread_el(&magic, sizeof(magic), file, &offset); - if (magic != GGUF_MAGIC) { - fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); - fclose(file); - return NULL; + + // check the magic before making allocations + { + gguf_fread_el(&magic, sizeof(magic), file, &offset); + + if (magic != GGUF_MAGIC) { + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); + fclose(file); + return NULL; + } } bool ok = true; struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); - ctx->header.magic = magic; - ctx->header.kv = NULL; + // read the header + { + ctx->header.magic = magic; + ctx->header.kv = NULL; - ctx->infos = NULL; - ctx->data = NULL; + ctx->infos = NULL; + ctx->data = NULL; - ok = ok && gguf_fread_el(&ctx->header.version, sizeof(ctx->header.version), file, &offset); - ok = ok && gguf_fread_el(&ctx->header.n_tensors, sizeof(ctx->header.n_tensors), file, &offset); - ok = ok && gguf_fread_el(&ctx->header.n_kv, sizeof(ctx->header.n_kv), file, &offset); - - if (!ok) { - fprintf(stderr, "%s: failed to read header\n", __func__); - fclose(file); - gguf_free(ctx); - return NULL; - } - - ctx->header.kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); - - for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { - struct gguf_kv * kv = &ctx->header.kv[i]; - - //fprintf(stderr, "%s: reading kv %d\n", __func__, i); - - ok = ok && gguf_fread_str(&kv->key, file, &offset); - //ok = ok && gguf_fread_el (&kv->n_bytes, sizeof(kv->n_bytes), file, &offset); - ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset); - - //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); - - switch (kv->type) { - case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break; - case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break; - case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break; - case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break; - case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break; - case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break; - case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break; - case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; - case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; - case GGUF_TYPE_ARRAY: - GGML_ASSERT("gguf: array type not implemented"); - break; - }; + ok = ok && 
gguf_fread_el(&ctx->header.version, sizeof(ctx->header.version), file, &offset); + ok = ok && gguf_fread_el(&ctx->header.n_tensors, sizeof(ctx->header.n_tensors), file, &offset); + ok = ok && gguf_fread_el(&ctx->header.n_kv, sizeof(ctx->header.n_kv), file, &offset); if (!ok) { - break; - } - } - - if (!ok) { - fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); - fclose(file); - gguf_free(ctx); - return NULL; - } - - ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); - - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { - struct gguf_tensor_info * info = &ctx->infos[i]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - info->ne[j] = 1; - } - - ok = ok && gguf_fread_str(&info->name, file, &offset); - ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset); - for (uint32_t j = 0; j < info->n_dims; ++j) { - ok = ok && gguf_fread_el(&info->ne[j], sizeof(info->ne[j]), file, &offset); - } - //ok = ok && gguf_fread_el (&info->n_elms, sizeof(info->n_elms), file, &offset); - ok = ok && gguf_fread_el (&info->type, sizeof(info->type), file, &offset); - ok = ok && gguf_fread_el (&info->offset, sizeof(info->offset), file, &offset); - - if (!ok) { - fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fprintf(stderr, "%s: failed to read header\n", __func__); fclose(file); gguf_free(ctx); return NULL; } } + // read the kv pairs + { + ctx->header.kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->header.kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_fread_str(&kv->key, file, &offset); + //ok = ok && gguf_fread_el (&kv->n_bytes, sizeof(kv->n_bytes), file, &offset); + ok = ok && gguf_fread_el (&kv->type, sizeof(kv->type), file, &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (&kv->value.uint8, sizeof(kv->value.uint8), file, &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (&kv->value.int8, sizeof(kv->value.int8), file, &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (&kv->value.uint16, sizeof(kv->value.uint16), file, &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (&kv->value.int16, sizeof(kv->value.int16), file, &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (&kv->value.uint32, sizeof(kv->value.uint32), file, &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (&kv->value.int32, sizeof(kv->value.int32), file, &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (&kv->value.float32, sizeof(kv->value.float32), file, &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; + case GGUF_TYPE_ARRAY: + GGML_ASSERT("gguf: array type not implemented"); + break; + }; + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // read the tensor infos + { + ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { 
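            // initialize every dimension to 1 so that the ne[0]*ne[1]*ne[2]*ne[3] product
            // used later for the data size check stays correct when n_dims < GGML_MAX_DIMS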
+ info->ne[j] = 1; + } + + ok = ok && gguf_fread_str(&info->name, file, &offset); + ok = ok && gguf_fread_el (&info->n_dims, sizeof(info->n_dims), file, &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + ok = ok && gguf_fread_el(&info->ne[j], sizeof(info->ne[j]), file, &offset); + } + //ok = ok && gguf_fread_el (&info->n_elms, sizeof(info->n_elms), file, &offset); + ok = ok && gguf_fread_el (&info->type, sizeof(info->type), file, &offset); + ok = ok && gguf_fread_el (&info->offset, sizeof(info->offset), file, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + } + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; // TODO: determine new alignment from kv if available + // we require the data section to be aligned, so take into account any padding { const size_t offset_pad = offset % ctx->alignment; @@ -18516,38 +18530,46 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } + // store the current file offset - this is where the data section starts ctx->offset = offset; - ctx->size_data = 0; + // compute the total size of the data section, taking into account the alignment + { - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { - struct gguf_tensor_info * info = &ctx->infos[i]; + ctx->size_data = 0; + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; - const int64_t ne = - (int64_t) info->ne[0] * - (int64_t) info->ne[1] * - (int64_t) info->ne[2] * - (int64_t) info->ne[3]; + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; - if (ne % ggml_blck_size(info->type) != 0) { - fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", - __func__, info->name.data, ne, ggml_blck_size(info->type)); - fclose(file); - gguf_free(ctx); - return NULL; + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + ctx->size_data += GGML_PAD(size_cur, ctx->alignment); } - - const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); - - ctx->size_data += GGML_PAD(size_cur, ctx->alignment); } - // load the tensor data - // TODO: simplify + // load the tensor data only if requested if (params.ctx != NULL) { + // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of + // the ggml_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_context const size_t mem_size = params.no_alloc ? 
- (ctx->header.n_tensors + 1)*ggml_tensor_overhead() : + (ctx->header.n_tensors )*ggml_tensor_overhead() : (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size_data; struct ggml_init_params pdata = { @@ -18567,7 +18589,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && data != NULL; - // read the tensor data + // read the binary blob with the tensor data ok = ok && gguf_fread_el(data->data, ctx->size_data, file, &offset); if (!ok) { @@ -18602,6 +18624,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p break; } + // point the data member to the appropriate location in the binary blob using the tensor infos if (params.no_alloc == false) { //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data @@ -18609,7 +18632,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } if (!ok) { - fprintf(stderr, "%s: failed to create tensors\n", __func__); + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); fclose(file); ggml_free(ctx_data); gguf_free(ctx); @@ -18619,13 +18642,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ggml_set_no_alloc(ctx_data, params.no_alloc); } - if (!ok) { - fprintf(stderr, "%s: failed to read tensor data\n", __func__); - fclose(file); - gguf_free(ctx); - return NULL; - } - return ctx; } From c85d3178b3165137c1f7d0b454db2b2b5efd7b18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 27 Jul 2023 10:29:29 +0300 Subject: [PATCH 14/26] refactor : reduce code duplication and better API (#2415) --- gguf.py | 57 +++++++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/gguf.py b/gguf.py index dfd5ba5bf..991bbe2f3 100644 --- a/gguf.py +++ b/gguf.py @@ -71,63 +71,56 @@ class GGUFWriter: f = open(path, "wb") return cls(f) - def write_key(self, key: str, value_type: GGUFValueType): - encoded_key = key.encode("utf8") - self.buffered_writer.write(struct.pack(" Date: Thu, 27 Jul 2023 11:10:34 +0300 Subject: [PATCH 15/26] gguf : expose the gguf_type enum through the API for now --- examples/gguf/gguf.cpp | 13 ------------- ggml.c | 13 ------------- ggml.h | 14 ++++++++++++++ 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index a5c442ac5..d6a0691d0 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -7,19 +7,6 @@ #include #include -enum gguf_type { - GGUF_TYPE_UINT8 = 0, - GGUF_TYPE_INT8 = 1, - GGUF_TYPE_UINT16 = 2, - GGUF_TYPE_INT16 = 3, - GGUF_TYPE_UINT32 = 4, - GGUF_TYPE_INT32 = 5, - GGUF_TYPE_FLOAT32 = 6, - GGUF_TYPE_BOOL = 7, - GGUF_TYPE_STRING = 8, - GGUF_TYPE_ARRAY = 9, -}; - template static std::string to_string(const T & val) { std::stringstream ss; diff --git a/ggml.c b/ggml.c index b005fd889..ebdb6536f 100644 --- a/ggml.c +++ b/ggml.c @@ -18297,19 +18297,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// -enum gguf_type { - GGUF_TYPE_UINT8 = 0, - GGUF_TYPE_INT8 = 1, - GGUF_TYPE_UINT16 = 2, - GGUF_TYPE_INT16 = 3, - GGUF_TYPE_UINT32 = 4, - GGUF_TYPE_INT32 = 5, - GGUF_TYPE_FLOAT32 = 6, - GGUF_TYPE_BOOL = 7, - GGUF_TYPE_STRING = 8, - GGUF_TYPE_ARRAY = 9, -}; - struct gguf_str { uint32_t n; char * data; 
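Taken together, the patches above settle on the read-side API: gguf_init_from_file() plus the gguf_get_* accessors, with gguf_init_params controlling whether the tensor data blob is loaded into a ggml_context. A minimal consumer might look like the following sketch; it uses only functions introduced in this series, the file name is illustrative, and error handling is kept to the bare minimum.

#include "ggml.h"

#include <stdio.h>

// minimal sketch of a GGUF reader built on the API from this patch series
int main(void) {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /*.no_alloc =*/ false,      // also load the tensor data blob into ctx_data
        /*.ctx      =*/ &ctx_data,
    };

    struct gguf_context * ctx = gguf_init_from_file("example.gguf", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load gguf file\n");
        return 1;
    }

    fprintf(stdout, "version:     %d\n",  gguf_get_version(ctx));
    fprintf(stdout, "alignment:   %zu\n", gguf_get_alignment(ctx));
    fprintf(stdout, "data offset: %zu\n", gguf_get_data_offset(ctx));

    for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
        fprintf(stdout, "kv[%d]: key = %s\n", i, gguf_get_key(ctx, i));
    }

    for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
        fprintf(stdout, "tensor[%d]: name = %s, offset = %zu\n",
                i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
    }

    ggml_free(ctx_data);
    gguf_free(ctx);

    return 0;
}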
diff --git a/ggml.h b/ggml.h index e0abbbfdd..91588895c 100644 --- a/ggml.h +++ b/ggml.h @@ -1619,6 +1619,20 @@ extern "C" { // gguf // + // TODO: can be removed if the API is extended for writing + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + }; + struct gguf_context; struct gguf_init_params { From d2b6ca13ad25a55e64bdd8287e773393fa54d212 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Jul 2023 14:53:07 +0300 Subject: [PATCH 16/26] gguf : add array support --- examples/gguf/gguf.cpp | 87 +++++++++++++++++++++++++++++++++++------- ggml.c | 64 ++++++++++++++++++++++++++++--- ggml.h | 4 +- 3 files changed, 135 insertions(+), 20 deletions(-) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index d6a0691d0..c3494a343 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -29,7 +29,7 @@ void gguf_ex_write_u64(std::ofstream & fout, size_t val) { } template -void gguf_ex_write_param(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) { +void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const T & val) { gguf_ex_write_str(fout, key); fout.write((const char *) &type, sizeof(type)); fout.write((const char *) &val, sizeof(val)); @@ -38,13 +38,65 @@ void gguf_ex_write_param(std::ofstream & fout, const std::string & key, enum ggu } template<> -void gguf_ex_write_param(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) { +void gguf_ex_write_val(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::string & val) { gguf_ex_write_str(fout, key); fout.write((const char *) &type, sizeof(type)); const int32_t n = val.size(); fout.write((const char *) &n, sizeof(n)); fout.write(val.c_str(), n); + + fprintf(stdout, "%s: write param: %s = %s\n", __func__, key.c_str(), val.c_str()); +} + +template +void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector & val) { + gguf_ex_write_str(fout, key); + { + const enum gguf_type tarr = GGUF_TYPE_ARRAY; + fout.write((const char *) &tarr, sizeof(tarr)); + } + + const int32_t n = val.size(); + fout.write((const char *) &type, sizeof(type)); + fout.write((const char *) &n, sizeof(n)); + fout.write((const char *) val.data(), n * sizeof(T)); + + fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str()); + for (int i = 0; i < n; ++i) { + fprintf(stdout, "%s", to_string(val[i]).c_str()); + if (i < n - 1) { + fprintf(stdout, ", "); + } + } + fprintf(stdout, "]\n"); +} + +template<> +void gguf_ex_write_arr(std::ofstream & fout, const std::string & key, enum gguf_type type, const std::vector & val) { + gguf_ex_write_str(fout, key); + { + const enum gguf_type tarr = GGUF_TYPE_ARRAY; + fout.write((const char *) &tarr, sizeof(tarr)); + } + + const int32_t n = val.size(); + fout.write((const char *) &type, sizeof(type)); + fout.write((const char *) &n, sizeof(n)); + for (int i = 0; i < n; ++i) { + const int32_t nstr = val[i].size(); + fout.write((const char *) &nstr, sizeof(nstr)); + fout.write(val[i].c_str(), nstr); + } + + fprintf(stdout, "%s: write param: %s = [", __func__, key.c_str()); + for (int i = 0; i < n; ++i) { + fprintf(stdout, "%s", val[i].c_str()); + if (i < n - 1) { + fprintf(stdout, ", "); + } + } + fprintf(stdout, "]\n"); } bool 
gguf_ex_write(const std::string & fname) { @@ -60,8 +112,9 @@ bool gguf_ex_write(const std::string & fname) { fout.write((const char *) &version, sizeof(version)); } + // NOTE: these have to match the output below! const int n_tensors = 10; - const int n_kv = 9; + const int n_kv = 12; fout.write((const char*) &n_tensors, sizeof(n_tensors)); fout.write((const char*) &n_kv, sizeof(n_kv)); @@ -70,17 +123,21 @@ bool gguf_ex_write(const std::string & fname) { // kv data { - gguf_ex_write_param< uint8_t>(fout, "some.parameter.uint8", GGUF_TYPE_UINT8, 0x12); - gguf_ex_write_param< int8_t>(fout, "some.parameter.int8", GGUF_TYPE_INT8, -0x13); - gguf_ex_write_param(fout, "some.parameter.uint16", GGUF_TYPE_UINT16, 0x1234); - gguf_ex_write_param< int16_t>(fout, "some.parameter.int16", GGUF_TYPE_INT16, -0x1235); - gguf_ex_write_param(fout, "some.parameter.uint32", GGUF_TYPE_UINT32, 0x12345678); - gguf_ex_write_param< int32_t>(fout, "some.parameter.int32", GGUF_TYPE_INT32, -0x12345679); + gguf_ex_write_val< uint8_t>(fout, "some.parameter.uint8", GGUF_TYPE_UINT8, 0x12); + gguf_ex_write_val< int8_t>(fout, "some.parameter.int8", GGUF_TYPE_INT8, -0x13); + gguf_ex_write_val(fout, "some.parameter.uint16", GGUF_TYPE_UINT16, 0x1234); + gguf_ex_write_val< int16_t>(fout, "some.parameter.int16", GGUF_TYPE_INT16, -0x1235); + gguf_ex_write_val(fout, "some.parameter.uint32", GGUF_TYPE_UINT32, 0x12345678); + gguf_ex_write_val< int32_t>(fout, "some.parameter.int32", GGUF_TYPE_INT32, -0x12345679); - gguf_ex_write_param (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f); - gguf_ex_write_param (fout, "some.parameter.bool", GGUF_TYPE_BOOL, true); + gguf_ex_write_val (fout, "some.parameter.float32", GGUF_TYPE_FLOAT32, 0.123456789f); + gguf_ex_write_val (fout, "some.parameter.bool", GGUF_TYPE_BOOL, true); - gguf_ex_write_param(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world"); + gguf_ex_write_val(fout, "some.parameter.string", GGUF_TYPE_STRING, "hello world"); + + gguf_ex_write_arr (fout, "some.parameter.arr.i16", GGUF_TYPE_INT16, { 1, 2, 3, 4, }); + gguf_ex_write_arr (fout, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, { 3.145f, 2.718f, 1.414f, }); + gguf_ex_write_arr(fout, "some.parameter.arr.str", GGUF_TYPE_STRING, { "hello", "world", "!" 
}); } uint64_t offset_tensor = 0; @@ -203,13 +260,15 @@ bool gguf_ex_read_0(const std::string & fname) { fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); + const char * name = gguf_get_tensor_name (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); } } + gguf_free(ctx); + return true; } @@ -248,7 +307,7 @@ bool gguf_ex_read_1(const std::string & fname) { fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors); for (int i = 0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(ctx, i); + const char * name = gguf_get_tensor_name (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); diff --git a/ggml.c b/ggml.c index ebdb6536f..96c7ebd34 100644 --- a/ggml.c +++ b/ggml.c @@ -3698,7 +3698,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { }; static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated"); - static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = "f32", [GGML_TYPE_F16] = "f16", @@ -18302,7 +18301,19 @@ struct gguf_str { char * data; }; -union gguf_value; +static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_TYPE_INT8] = sizeof(int8_t), + [GGUF_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_TYPE_INT16] = sizeof(int16_t), + [GGUF_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_TYPE_INT32] = sizeof(int32_t), + [GGUF_TYPE_FLOAT32] = sizeof(float), + [GGUF_TYPE_BOOL] = sizeof(bool), + [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); union gguf_value { uint8_t uint8; @@ -18320,7 +18331,7 @@ union gguf_value { enum gguf_type type; uint32_t n; - union gguf_value * arr; + void * data; } arr; }; @@ -18457,8 +18468,35 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (&kv->value.bool_, sizeof(kv->value.bool_), file, &offset); break; case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(&kv->value.str, file, &offset); break; case GGUF_TYPE_ARRAY: - GGML_ASSERT("gguf: array type not implemented"); - break; + { + ok = ok && gguf_fread_el(&kv->value.arr.type, sizeof(kv->value.arr.type), file, &offset); + ok = ok && gguf_fread_el(&kv->value.arr.n, sizeof(kv->value.arr.n), file, &offset); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_fread_el(kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], file, &offset); + } break; + case GGUF_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_fread_str(&((struct gguf_str *) kv->value.arr.data)[j], file, &offset); + } + } break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); }; if (!ok) { @@ -18629,6 +18667,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, 
struct gguf_init_p ggml_set_no_alloc(ctx_data, params.no_alloc); } + fclose(file); + return ctx; } @@ -18651,6 +18691,20 @@ void gguf_free(struct gguf_context * ctx) { free(kv->value.str.data); } } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } } GGML_ALIGNED_FREE(ctx->header.kv); diff --git a/ggml.h b/ggml.h index 91588895c..e857b3f14 100644 --- a/ggml.h +++ b/ggml.h @@ -1631,6 +1631,7 @@ extern "C" { GGUF_TYPE_BOOL = 7, GGUF_TYPE_STRING = 8, GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_COUNT, // marks the end of the enum }; struct gguf_context; @@ -1664,7 +1665,8 @@ extern "C" { GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i); GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i); GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i); - // TODO: arr + GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i); + GGML_API void gguf_get_arr_data(struct gguf_context * ctx, int i, void * data); GGML_API int gguf_get_n_tensors (struct gguf_context * ctx); GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i); From 158be8f7f4f0f34ef8057a87f0e35a3b2af2ed4d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 27 Jul 2023 15:37:06 +0300 Subject: [PATCH 17/26] gguf.py : some code style changes --- constants.py | 44 ++++++++--------- gguf.py | 136 +++++++++++++++++++++++++-------------------------- 2 files changed, 89 insertions(+), 91 deletions(-) diff --git a/constants.py b/constants.py index 7c7456403..3a97460e5 100644 --- a/constants.py +++ b/constants.py @@ -1,32 +1,32 @@ -GGUF_MAGIC = 0x47475546 +GGUF_MAGIC = 0x47475546 GGUF_VERSION = 1 # general -KEY_GENERAL_ARCHITECTURE = "general.architecture" +KEY_GENERAL_ARCHITECTURE = "general.architecture" KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version" -KEY_GENERAL_NAME = "general.name" -KEY_GENERAL_AUTHOR = "general.author" -KEY_GENERAL_URL = "general.url" -KEY_GENERAL_DESCRIPTION = "general.description" -KEY_GENERAL_FILE_TYPE = "general.file_type" -KEY_GENERAL_LICENSE = "general.license" -KEY_GENERAL_SOURCE_URL = "general.source.url" -KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" +KEY_GENERAL_NAME = "general.name" +KEY_GENERAL_AUTHOR = "general.author" +KEY_GENERAL_URL = "general.url" +KEY_GENERAL_DESCRIPTION = "general.description" +KEY_GENERAL_FILE_TYPE = "general.file_type" +KEY_GENERAL_LICENSE = "general.license" +KEY_GENERAL_SOURCE_URL = "general.source.url" +KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository" # LLM -KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length" -KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length" -KEY_LLM_LAYER_COUNT = "{llm}.layer_count" -KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length" -KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual" -KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout" +KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length" +KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length" +KEY_LLM_LAYER_COUNT = "{llm}.layer_count" +KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length" +KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual" +KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout" # attention -KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count" -KEY_ATTENTION_HEAD_COUNT_KV = 
"{llm}.attention.head_count_kv" -KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias" -KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv" +KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count" +KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv" +KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias" +KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv" # RoPE -KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count" -KEY_ROPE_SCALE = "{llm}.rope.scale" +KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count" +KEY_ROPE_SCALE = "{llm}.rope.scale" diff --git a/gguf.py b/gguf.py index 991bbe2f3..764ae9a9d 100644 --- a/gguf.py +++ b/gguf.py @@ -6,14 +6,13 @@ """ import struct +import constants from enum import IntEnum from typing import List, Any -import constants - class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 + F32 = 0 + F16 = 1 QR_0 = 2 Q4_1 = 3 # Q4_2 = 4 # support has been removed @@ -31,31 +30,30 @@ class GGMLQuantizationType(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 + BOOL = 7 + STRING = 8 + ARRAY = 9 @staticmethod - def get_type(value): - if isinstance(value, str): + def get_type(val): + if isinstance(val, str): return GGUFValueType.STRING - elif isinstance(value, list): + elif isinstance(val, list): return GGUFValueType.ARRAY - elif isinstance(value, float): + elif isinstance(val, float): return GGUFValueType.FLOAT32 - elif isinstance(value, bool): + elif isinstance(val, bool): return GGUFValueType.BOOL else: return GGUFValueType.INT32 - class GGUFWriter: def __init__(self, buffered_writer): self.buffered_writer = buffered_writer @@ -72,81 +70,81 @@ class GGUFWriter: return cls(f) def write_key(self, key: str): - self.write_value(key, GGUFValueType.STRING) + self.write_val(key, GGUFValueType.STRING) - def write_uint8(self, key: str, value: int): + def write_uint8(self, key: str, val: int): self.write_key(key) - self.write_value(value, GGUFValueType.UINT8) + self.write_val(val, GGUFValueType.UINT8) - def write_int8(self, key: str, value: int): + def write_int8(self, key: str, val: int): self.write_key(key) - self.write_value(value, GGUFValueType.INT8) + self.write_val(val, GGUFValueType.INT8) - def write_uint16(self, key: str, value: int): + def write_uint16(self, key: str, val: int): self.write_key(key) - self.write_value(value, GGUFValueType.UINT16) + self.write_val(val, GGUFValueType.UINT16) - def write_int16(self, key: str, value: int): + def write_int16(self, key: str, val: int): self.write_key(key) - self.write_value(value, GGUFValueType.INT16) + self.write_val(val, GGUFValueType.INT16) - def write_uint32(self, key: str, value: int): + def write_uint32(self, key: str, val: int): self.write_key(key) - self.write(value, GGUFValueType.UINT32) + self.write_val(val, GGUFValueType.UINT32) - def write_int32(self, key: str, value: int): + def write_int32(self, key: str, val: int): self.write_key(key) - self.write_value(value, GGUFValueType.INT32) + self.write_val(val, GGUFValueType.INT32) - def write_float32(self, key: str, value: float): + def write_float32(self, key: str, val: float): self.write_key(key) - self.write_value(value, GGUFValueType.FLOAT32) + self.write_val(val, GGUFValueType.FLOAT32) - def write_bool(self, key: str, value: bool): + def write_bool(self, key: str, val: bool): self.write_key(key) - self.write_value(value, GGUFValueType.BOOL) 
+ self.write_val(val, GGUFValueType.BOOL) - def write_string(self, key: str, value: str): + def write_string(self, key: str, val: str): self.write_key(key) - self.write_value(value, GGUFValueType.STRING) + self.write_val(val, GGUFValueType.STRING) - def write_array(self, key: str, value: list): - if not isinstance(value, list): + def write_array(self, key: str, val: list): + if not isinstance(val, list): raise ValueError("Value must be a list for array type") self.write_key(key) - self.write_value(value, GGUFValueType.ARRAY) + self.write_val(val, GGUFValueType.ARRAY) - def write_value(self: str, value: Any, value_type: GGUFValueType = None): - if value_type is None: - value_type = GGUFValueType.get_type(value) + def write_val(self: str, val: Any, vtype: GGUFValueType = None): + if vtype is None: + vtype = GGUFValueType.get_type(val) - self.buffered_writer.write(struct.pack(" Date: Thu, 27 Jul 2023 15:56:53 +0300 Subject: [PATCH 18/26] convert.py : start a new simplified implementation by removing old stuff --- convert-new.py | 961 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 961 insertions(+) create mode 100755 convert-new.py diff --git a/convert-new.py b/convert-new.py new file mode 100755 index 000000000..dacad693f --- /dev/null +++ b/convert-new.py @@ -0,0 +1,961 @@ +#!/usr/bin/env python + +import argparse +import concurrent.futures +import copy +import enum +import faulthandler +import functools +import io +import itertools +import json +import math +import mmap +import pickle +import re +import signal +import struct +import sys +import zipfile +import numpy as np + +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass +from pathlib import Path +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) +from sentencepiece import SentencePieceProcessor # type: ignore + +if TYPE_CHECKING: + from typing_extensions import TypeAlias + +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): + faulthandler.register(signal.SIGUSR1) + +NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' + +@dataclass(frozen=True) +class UnquantizedDataType: + name: str + +DT_F16 = UnquantizedDataType('F16') +DT_F32 = UnquantizedDataType('F32') +DT_I32 = UnquantizedDataType('I32') +DT_BF16 = UnquantizedDataType('BF16') + +DataType = Union[UnquantizedDataType] + +DATA_TYPE_TO_FTYPE: Dict[DataType, int] = { + DT_F32: 0, + DT_F16: 1, +} + +FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \ + {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()} + +DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = { + DT_BF16: np.dtype(np.uint16), + DT_F16: np.dtype(np.float16), + DT_F32: np.dtype(np.float32), + DT_I32: np.dtype(np.int32), +} + +NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \ + {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()} + +class GGMLFileType(enum.Enum): + AllF32 = 0 + MostlyF16 = 1 # except 1d tensors + + def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType: + if len(tensor.shape) == 1: + # 1D tensors are always F32. 
+ return DT_F32 + elif self == GGMLFileType.AllF32: + return DT_F32 + elif self == GGMLFileType.MostlyF16: + return DT_F16 + else: + raise ValueError(self) + +# TODO: this is LLaMA specific +def make_tensors_list() -> List[str]: + ret = [ + 'tok_embeddings.weight', + 'norm.weight', + 'output.weight', + ] + for i in range(80): # maximum number of layer + ret += [ + f'layers.{i}.attention.wq.weight', + f'layers.{i}.attention.wk.weight', + f'layers.{i}.attention.wv.weight', + f'layers.{i}.attention.wo.weight', + f'layers.{i}.attention_norm.weight', + f'layers.{i}.feed_forward.w1.weight', + f'layers.{i}.feed_forward.w2.weight', + f'layers.{i}.feed_forward.w3.weight', + f'layers.{i}.ffn_norm.weight', + ] + return ret + +# TODO: this should be generalized for non-LLaMA models +TENSORS_LIST = make_tensors_list() +TENSORS_SET = set(TENSORS_LIST) + +def find_n_mult(n_ff: int, n_embd: int) -> int: + # hardcoded magic range + for n_mult in range(256, 1, -1): + calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult + if calc_ff == n_ff: + return n_mult + raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).") + + +@dataclass +class Params: + n_vocab: int + n_embd: int + n_mult: int + n_head: int + n_layer: int + + @staticmethod + def guessed(model: 'LazyModel') -> 'Params': + # try transformer naming first + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape + + # try transformer naming first + if "model.layers.0.self_attn.q_proj.weight" in model: + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) + else: + n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + + if n_layer < 1: + raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + + n_head=n_embd // 128 # guessed + + return Params( + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = 256, + n_head = n_head, + n_layer = n_layer, + ) + + @staticmethod + def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; + n_layer = config["num_hidden_layers"]; + n_ff = config["intermediate_size"]; + + n_mult = find_n_mult(n_ff, n_embd); + + return Params( + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, + ) + + # LLaMA v2 70B params.json + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 + @staticmethod + def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["dim"]; + n_head = config["n_heads"]; + n_layer = config["n_layers"]; + n_mult = config["multiple_of"]; + + if n_vocab == -1: + n_vocab = model["tok_embeddings.weight"].shape[0] + + return Params( + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, + ) + + @staticmethod + def load(model_plus: 'ModelPlus') -> 'Params': + hf_config_path = model_plus.paths[0].parent / "config.json" + orig_config_path = model_plus.paths[0].parent / "params.json" + + if hf_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) + elif orig_config_path.exists(): + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) + else: + params = Params.guessed(model_plus.model) + + print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') + return params + + +class SentencePieceVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None: + self.vocabtype = vocabtype + if self.vocabtype == "bpe": + self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read()) + else: + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) + + added_tokens: Dict[str, int] + if fname_added_tokens is not None: + added_tokens = json.load(open(fname_added_tokens)) + else: + added_tokens = {} + + if self.vocabtype == "bpe": + vocab_size: int = len(self.sentencepiece_tokenizer) + else: + vocab_size: int = self.sentencepiece_tokenizer.vocab_size() + + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) + if expected_ids != actual_ids: + raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}") + + items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base: int = vocab_size + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens + + def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: + tokenizer = self.sentencepiece_tokenizer + if self.vocabtype == "bpe": + from transformers.models.gpt2 import 
tokenization_gpt2 + byte_encoder = tokenization_gpt2.bytes_to_unicode() + byte_decoder = {v: k for k, v in byte_encoder.items()} + for i, item in enumerate(tokenizer): + text: bytes + text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]]) + score: float = -i + yield text, score + else: + for i in range(tokenizer.vocab_size()): + text: bytes + if tokenizer.is_unknown(i): + text = " \u2047 ".encode("utf-8") + elif tokenizer.is_control(i): + text = b"" + elif tokenizer.is_byte(i): + piece = tokenizer.id_to_piece(i) + if len(piece) != 6: + raise Exception(f"Invalid token: {piece}") + byte_value = int(piece[3:-1], 16) + text = struct.pack("B", byte_value) + else: + text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") + score: float = tokenizer.get_score(i) + yield text, score + + def added_tokens(self) -> Iterable[Tuple[bytes, float]]: + for text in self.added_tokens_list: + score = -1000.0 + yield text.encode("utf-8"), score + + def all_tokens(self) -> Iterable[Tuple[bytes, float]]: + yield from self.sentencepiece_tokens() + yield from self.added_tokens() + + def __repr__(self) -> str: + return f"" + + +class GGMLVocab: + def __init__(self, tokens: List[Tuple[bytes, float]]): + self.tokens = tokens + self.vocab_size = len(tokens) + + def all_tokens(self) -> Iterable[Tuple[bytes, float]]: + return self.tokens + + def __repr__(self) -> str: + return f"" + + +Vocab = Union[SentencePieceVocab, GGMLVocab] + + +def permute(weights: NDArray, n_head: int) -> NDArray: + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + +class Tensor(metaclass=ABCMeta): + data_type: DataType + + @abstractmethod + def astype(self, data_type: DataType) -> 'Tensor': ... + @abstractmethod + def permute(self, n_head: int) -> 'Tensor': ... + @abstractmethod + def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ... + @abstractmethod + def part(self, n_part: int) -> 'UnquantizedTensor': ... + @abstractmethod + def to_ggml(self) -> 'GGMLCompatibleTensor': ... 
+ + +def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray: + assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" + fp32_arr = bf16_arr.astype(np.uint32) << 16 + return fp32_arr.view(np.float32) + + +class UnquantizedTensor(Tensor): + def __init__(self, ndarray: NDArray) -> None: + assert isinstance(ndarray, np.ndarray) + self.ndarray = ndarray + self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] + + def astype(self, data_type: DataType) -> Tensor: + dtype = DATA_TYPE_TO_NUMPY[data_type] + if self.data_type == DT_BF16: + self.ndarray = bf16_to_fp32(self.ndarray) + return UnquantizedTensor(self.ndarray.astype(dtype)) + + def to_ggml(self) -> 'UnquantizedTensor': + return self + + def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + + def part(self, n_part: int) -> 'UnquantizedTensor': + r = self.ndarray.shape[0] // 3 + return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) + + def permute(self, n_head: int) -> 'UnquantizedTensor': + return UnquantizedTensor(permute(self.ndarray, n_head)) + + +def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: + tensor = lazy_tensor.load() + assert isinstance(tensor, UnquantizedTensor) + + # double-check: + actual_shape = list(tensor.ndarray.shape) + assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape) + if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype: + if convert: + tensor.ndarray = tensor.ndarray.astype(expected_dtype) + else: + raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}') + + return tensor.ndarray + + +GGMLCompatibleTensor = Union[UnquantizedTensor] + + +class DeferredPermutedTensor(Tensor): + def __init__(self, base: Tensor, n_head: int) -> None: + self.base = base + self.n_head = n_head + self.data_type = self.base.data_type + + def astype(self, data_type: DataType) -> Tensor: + return self.base.astype(data_type).permute(self.n_head) + + def to_ggml(self) -> GGMLCompatibleTensor: + return self.base.to_ggml().permute(self.n_head) + + def permute(self, n_head: int) -> Tensor: + raise Exception("shouldn't permute twice") + + +@dataclass +class LazyTensor: + _load: Callable[[], Tensor] + shape: List[int] + data_type: DataType + description: str + + def load(self) -> Tensor: + ret = self._load() + assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) + return ret + + def astype(self, data_type: DataType) -> 'LazyTensor': + self.validate_conversion_to(data_type) + + def load() -> Tensor: + return self.load().astype(data_type) + return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') + + def validate_conversion_to(self, data_type: DataType) -> None: + if data_type == self.data_type: + return + + +LazyModel = Dict[str, LazyTensor] + + +@dataclass +class ModelPlus: + model: LazyModel + paths: List[Path] # Where this was read from. + format: Literal['ggml', 'torch', 'safetensors'] + vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. + + +def merge_sharded(models: List[LazyModel]) -> LazyModel: + # Original LLaMA models have each file contain one part of each tensor. + # Use a dict instead of a set to preserve order. 
+    names = {name: None for model in models for name in model}
+
+    def convert(name: str) -> LazyTensor:
+        lazy_tensors: List[LazyTensor] = [model[name] for model in models]
+        if len(lazy_tensors) == 1:
+            # only one file; don't go through this procedure since there might
+            # be quantized tensors
+            return lazy_tensors[0]
+        if len(lazy_tensors[0].shape) == 1:
+            # the tensor is just duplicated in every file
+            return lazy_tensors[0]
+        if name.startswith('tok_embeddings.') or \
+           name.endswith('.attention.wo.weight') or \
+           name.endswith('.feed_forward.w2.weight'):
+            # split by columns
+            axis = 1
+        else:
+            # split by rows
+            axis = 0
+        concatenated_shape = list(lazy_tensors[0].shape)
+        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
+
+        def load() -> UnquantizedTensor:
+            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
+            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
+            return UnquantizedTensor(concatenated)
+        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
+        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
+    return {name: convert(name) for name in names}
+
+
+def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
+    formats = set(mp.format for mp in models_plus)
+    assert len(formats) == 1, "different formats?"
+    format = formats.pop()
+    paths = [path for mp in models_plus for path in mp.paths]
+    # Use the first non-None vocab, if any.
+    try:
+        vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
+    except StopIteration:
+        vocab = None
+
+    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
+        # Transformers models put different tensors in different files, but
+        # don't split individual tensors between files.
+ model: LazyModel = {} + for mp in models_plus: + model.update(mp.model) + else: + model = merge_sharded([mp.model for mp in models_plus]) + + return ModelPlus(model, paths, format, vocab) + + +def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute(n_head) + return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + +def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().permute_part(n_part, n_head) + s = lazy_tensor.shape.copy() + s[0] = s[0] // 3 + return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + +def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: + def load() -> Tensor: + return lazy_tensor.load().part(n_part) + s = lazy_tensor.shape.copy() + s[0] = s[0] // 3 + return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) + +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: + out: LazyModel = {} + out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] + out["norm.weight"] = model["model.norm.weight"] + out["output.weight"] = model["lm_head.weight"] + + for i in itertools.count(): + if f"model.layers.{i}.self_attn.q_proj.weight" in model: + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + elif f"model.layers.{i}.self_attn.W_pack.weight" in model: + out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) + out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + else: + break + + out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] + + out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"] + out[f"layers.{i}.feed_forward.w2.weight"] = model[f"model.layers.{i}.mlp.down_proj.weight"] + out[f"layers.{i}.feed_forward.w3.weight"] = model[f"model.layers.{i}.mlp.up_proj.weight"] + + out[f"layers.{i}.attention_norm.weight"] = model[f"model.layers.{i}.input_layernorm.weight"] + out[f"layers.{i}.ffn_norm.weight"] = model[f"model.layers.{i}.post_attention_layernorm.weight"] + return out + + +# Functionality that simulates `torch.load` but where individual tensors are +# only loaded into memory on demand, not all at once. +# PyTorch can't do this natively as of time of writing: +# - https://github.com/pytorch/pytorch/issues/64327 +# This allows us to de-shard without multiplying RAM usage, and also +# conveniently drops the PyTorch dependency (though we still need numpy). 
+ + +@dataclass +class LazyStorageKind: + data_type: DataType + + +@dataclass +class LazyStorage: + load: Callable[[int, int], NDArray] + kind: LazyStorageKind + description: str + + +class LazyUnpickler(pickle.Unpickler): + def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile): + super().__init__(fp) + self.data_base_path = data_base_path + self.zip_file = zip_file + + def persistent_load(self, pid: Any) -> Any: + assert pid[0] == 'storage' + assert isinstance(pid[1], LazyStorageKind) + data_type = pid[1].data_type + filename_stem = pid[2] + filename = self.data_base_path + '/' + filename_stem + info = self.zip_file.getinfo(filename) + + def load(offset: int, elm_count: int) -> NDArray: + dtype = DATA_TYPE_TO_NUMPY.get(data_type) + if dtype is None: + raise Exception("tensor stored in unsupported format") + fp = self.zip_file.open(info) + fp.seek(offset * dtype.itemsize) + size = elm_count * dtype.itemsize + data = fp.read(size) + assert len(data) == size + return np.frombuffer(data, dtype) + description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' + return LazyStorage(load=load, kind=pid[1], description=description) + + # @staticmethod + def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, + # pyright: ignore[reportSelfClsParameterName] + requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: + assert isinstance(storage, LazyStorage) + + def load() -> UnquantizedTensor: + elm_count = stride[0] * size[0] + return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size)) + description = f'pickled storage_offset={storage_offset} in {storage.description}' + return LazyTensor(load, list(size), storage.kind.data_type, description) + + # @staticmethod + def rebuild_from_type_v2(func, new_type, args, state): + return func(*args) + + CLASSES: Dict[Any, Any] = { + ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2, + ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2, + ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), + ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), + ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), + ('torch', 'IntStorage'): LazyStorageKind(DT_I32), + ('torch', 'Tensor'): LazyTensor, + } + + def find_class(self, module: str, name: str) -> Any: + if not module.startswith('torch'): + return super().find_class(module, name) + return self.CLASSES[(module, name)] + + +def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: + zf = zipfile.ZipFile(outer_fp) + pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')] + assert len(pickle_paths) == 1, pickle_paths + pickle_fp = zf.open(pickle_paths[0], 'r') + unpickler = LazyUnpickler(pickle_fp, + data_base_path=pickle_paths[0][:-4], + zip_file=zf) + model = unpickler.load() + as_dict = dict(model.items()) + return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None) + + +SAFETENSORS_DATA_TYPES: Dict[str, DataType] = { + 'BF16': DT_BF16, + 'F16': DT_F16, + 'F32': DT_F32, + 'I32': DT_I32, +} + + +def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: + header_size, = struct.unpack(' LazyTensor: + data_type = SAFETENSORS_DATA_TYPES[info['dtype']] + numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + shape: List[int] = info['shape'] + begin, end = info['data_offsets'] + assert 0 <= begin <= end <= len(byte_buf) + assert end - begin == math.prod(shape) * numpy_dtype.itemsize + buf = byte_buf[begin:end] + + def 
load() -> UnquantizedTensor: + return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape)) + description = f'safetensors begin={begin} end={end} type={data_type} path={path}' + return LazyTensor(load, shape, data_type, description) + model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'} + return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None) + + +def must_read(fp: IO[bytes], length: int) -> bytes: + ret = fp.read(length) + if len(ret) < length: + raise Exception("unexpectedly reached end of file") + return ret + + +@functools.lru_cache(maxsize=None) +def lazy_load_file(path: Path) -> ModelPlus: + fp = open(path, 'rb') + first8 = fp.read(8) + fp.seek(0) + if first8[:2] == b'PK': + # A zip file, i.e. PyTorch format + return lazy_load_torch_file(fp, path) + elif struct.unpack(' Iterable[Out]: + '''Parallel map, but with backpressure. If the caller doesn't call `next` + fast enough, this will stop calling `func` at some point rather than + letting results pile up in memory. Specifically, there is a max of one + output value buffered per thread.''' + with concurrent.futures.ThreadPoolExecutor() as executor: + futures: List[concurrent.futures.Future[Out]] = [] + items_rev = list(iterable)[::-1] + for i in range(min(concurrency, len(items_rev))): + futures.append(executor.submit(func, items_rev.pop())) + while futures: + result = futures.pop(0).result() + if items_rev: + futures.append(executor.submit(func, items_rev.pop())) + yield result + + +def check_vocab_size(params: Params, vocab: Vocab) -> None: + if params.n_vocab != vocab.vocab_size: + # GGMLVocab comes from the same file as the model so shouldn't mismatch: + assert isinstance(vocab, SentencePieceVocab) + if params.n_vocab == vocab.vocab_size_base: + print("Ignoring added_tokens.json since model matches vocab size without it.") + vocab.added_tokens_list = [] + vocab.vocab_size = vocab.vocab_size_base + return + msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}" + if vocab.fname_added_tokens is not None: + msg += f" combined with {vocab.fname_added_tokens}" + msg += f" has {vocab.vocab_size})." + if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None: + msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})." 
+ raise Exception(msg) + + +class OutputFile: + def __init__(self, fname_out: Path) -> None: + self.fout = open(fname_out, "wb") + + def write_file_header(self, params: Params, file_type: GGMLFileType) -> None: + self.fout.write(b"ggjt"[::-1]) # magic + values = [ + 1, # file version + params.n_vocab, + params.n_embd, + params.n_mult, + params.n_head, + params.n_layer, + params.n_embd // params.n_head, # rot (obsolete) + file_type.value, + ] + self.fout.write(struct.pack("i" * len(values), *values)) + + def write_tensor_header(self, name: str, shape: Sequence[int], data_type: DataType) -> None: + sname = name.encode('utf-8') + self.fout.write(struct.pack("iii", len(shape), len(sname), DATA_TYPE_TO_FTYPE[data_type])) + self.fout.write(struct.pack("i" * len(shape), *shape[::-1])) + self.fout.write(sname) + self.fout.seek((self.fout.tell() + 31) & -32) + + def write_vocab(self, vocab: Vocab) -> None: + for text, score in vocab.all_tokens(): + self.fout.write(struct.pack("i", len(text))) + self.fout.write(text) + self.fout.write(struct.pack("f", score)) + + @staticmethod + def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: + of = OutputFile(fname_out) + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0) + of = OutputFile(fname_out) + of.write_file_header(params, file_type=GGMLFileType.AllF32) + of.write_vocab(vocab) + of.fout.close() + + @staticmethod + def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None: + check_vocab_size(params, vocab) + of = OutputFile(fname_out) + of.write_file_header(params, file_type) + print("Writing vocab...") + of.write_vocab(vocab) + + def do_item(item: Tuple[str, LazyTensor]) -> NDArray: + name, lazy_tensor = item + return lazy_tensor.load().to_ggml().ndarray + + ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}") + of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type) + ndarray.tofile(of.fout) + of.fout.close() + + +def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType: + wq_type = model["layers.0.attention.wq.weight"].data_type + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): + return GGMLFileType.AllF32 + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): + return GGMLFileType.MostlyF16 + name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} + raise Exception(f"Unexpected combination of types: {name_to_type}") + + +def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: + if "lm_head.weight" in model: + model = convert_transformers_to_orig(model, params) + model = filter_and_sort_tensors(model) + + return model + + +def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: + return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) + for (name, tensor) in model.items()} + + +def nth_multifile_path(path: Path, n: int) -> Optional[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the nth path in the model. 
+ ''' + # Support the following patterns: + patterns: List[Tuple[str, str]] = [ + # - x.00.pth, x.01.pth, etc. + (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), + # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. + (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'), + # x.bin, x.bin.1, etc. + (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}') + ] + for regex, replacement in patterns: + if re.search(regex, path.name): + new_path = path.with_name(re.sub(regex, replacement, path.name)) + if new_path.exists(): + return new_path + return None + + +def find_multifile_paths(path: Path) -> List[Path]: + '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return + the whole list of paths in the model. + ''' + ret: List[Path] = [] + for i in itertools.count(): + nth_path = nth_multifile_path(path, i) + if nth_path is None: + break + ret.append(nth_path) + if not ret: + # No matches. This should only happen if the file was named, e.g., + # foo.0, and there was no file named foo. Oh well, try to process it + # as a single file. + return [path] + return ret + + +def load_some_model(path: Path) -> ModelPlus: + '''Load a model of any supported format.''' + # Be extra-friendly and accept either a file or a directory: + if path.is_dir(): + # Check if it's a set of safetensors files first + files = list(path.glob("model-00001-of-*.safetensors")) + if not files: + # Try the PyTorch patterns too, with lower priority + globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] + files = [file for glob in globs for file in path.glob(glob)] + if not files: + # Try GGML too, but with lower priority, since if both a non-GGML + # model and a GGML model exist in the same directory, we assume the + # latter was converted from the former. + files = list(path.glob("ggml-model*.bin*")) + if not files: + raise Exception(f"Can't find model in directory {path}") + if len(files) > 1: + raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") + path = files[0] + + paths = find_multifile_paths(path) + models_plus: List[ModelPlus] = [] + for path in paths: + print(f"Loading model file {path}") + models_plus.append(lazy_load_file(path)) + + model_plus = merge_multifile_models(models_plus) + return model_plus + + +def filter_and_sort_tensors(model: LazyModel) -> LazyModel: + return {name: model[name] for name in TENSORS_LIST if name in model} + + +def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab: + print(f"vocabtype: {vocabtype}") + # Be extra-friendly and accept either a file or a directory. Also, if it's + # a directory, it might be the model directory, and tokenizer.model might + # be in the parent of that. + if path.is_dir(): + vocab_file = "tokenizer.model" + if vocabtype == 'bpe': + vocab_file = "vocab.json" + path2 = path / vocab_file + # Use `.parent` instead of /.. to handle the symlink case better. 
+ path3 = path.parent / vocab_file + if path2.exists(): + path = path2 + elif path3.exists(): + path = path3 + else: + raise FileNotFoundError( + f"Could not find tokenizer.model in {path} or its parent; " + "if it's in another directory, pass the directory as --vocab-dir") + added_tokens_path = path.parent / "added_tokens.json" + print(f"Loading vocab file {path}") + return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None, + vocabtype) + + +def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: + namestr = { + GGMLFileType.AllF32: "f32", + GGMLFileType.MostlyF16: "f16", + }[file_type] + ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" + if ret in model_paths: + sys.stderr.write( + f"Error: Default output path ({ret}) would overwrite the input. " + "Please explicitly specify a path using --outfile.\n") + sys.exit(1) + return ret + + +def do_dump_model(model_plus: ModelPlus) -> None: + print(f"model_plus.paths = {model_plus.paths!r}") + print(f"model_plus.format = {model_plus.format!r}") + print(f"model_plus.vocab = {model_plus.vocab!r}") + for name, lazy_tensor in model_plus.model.items(): + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + + +def main(args_in: Optional[List[str]] = None) -> None: + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, + help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)") + args = parser.parse_args(args_in) + + vocab: Vocab + if args.dump_single: + model_plus = lazy_load_file(args.model) + do_dump_model(model_plus) + elif args.vocab_only: + vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) + assert args.outfile, "need --outfile if using --vocab-only" + outfile = args.outfile + OutputFile.write_vocab_only(outfile, vocab) + print(f"Wrote {outfile}") + else: + model_plus = load_some_model(args.model) + if args.dump: + do_dump_model(model_plus) + return + if model_plus.vocab is not None and args.vocab_dir is None: + vocab = model_plus.vocab + else: + vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent + vocab = load_vocab(vocab_dir, args.vocabtype) + params = Params.load(model_plus) + model = model_plus.model + model = do_necessary_conversions(model, params) + output_type = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, output_type) + outfile = args.outfile or default_outfile(model_plus.paths, output_type) + OutputFile.write_all(outfile, params, output_type, model, vocab) + print(f"Wrote {outfile}") + + +if __name__ == '__main__': + main() From d2bb3ac10b025e9c2e475a0bc9527a7286b63810 Mon Sep 17 00:00:00 
2001 From: Georgi Gerganov Date: Thu, 27 Jul 2023 16:36:35 +0300 Subject: [PATCH 19/26] convert.py : remove GGML vocab + other obsolete stuff --- convert-new.py | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/convert-new.py b/convert-new.py index dacad693f..bf85cb551 100755 --- a/convert-new.py +++ b/convert-new.py @@ -278,19 +278,7 @@ class SentencePieceVocab: return f"" -class GGMLVocab: - def __init__(self, tokens: List[Tuple[bytes, float]]): - self.tokens = tokens - self.vocab_size = len(tokens) - - def all_tokens(self) -> Iterable[Tuple[bytes, float]]: - return self.tokens - - def __repr__(self) -> str: - return f"" - - -Vocab = Union[SentencePieceVocab, GGMLVocab] +Vocab = Union[SentencePieceVocab] def permute(weights: NDArray, n_head: int) -> NDArray: @@ -691,7 +679,6 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: - # GGMLVocab comes from the same file as the model so shouldn't mismatch: assert isinstance(vocab, SentencePieceVocab) if params.n_vocab == vocab.vocab_size_base: print("Ignoring added_tokens.json since model matches vocab size without it.") @@ -874,7 +861,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab: if path.is_dir(): vocab_file = "tokenizer.model" if vocabtype == 'bpe': - vocab_file = "vocab.json" + vocab_file = "vocab.json" path2 = path / vocab_file # Use `.parent` instead of /.. to handle the symlink case better. path3 = path.parent / vocab_file @@ -916,15 +903,14 @@ def do_dump_model(model_plus: ModelPlus) -> None: def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") - parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") - parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)") - parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, - help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") - parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + 
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)") args = parser.parse_args(args_in) vocab: Vocab @@ -947,12 +933,14 @@ def main(args_in: Optional[List[str]] = None) -> None: else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir, args.vocabtype) - params = Params.load(model_plus) - model = model_plus.model - model = do_necessary_conversions(model, params) + + params = Params.load(model_plus) + model = model_plus.model + model = do_necessary_conversions(model, params) output_type = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, output_type) - outfile = args.outfile or default_outfile(model_plus.paths, output_type) + model = convert_to_output_type(model, output_type) + outfile = args.outfile or default_outfile(model_plus.paths, output_type) + OutputFile.write_all(outfile, params, output_type, model, vocab) print(f"Wrote {outfile}") From 464192b9be263a916555870598086e99d8c2e449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Thu, 27 Jul 2023 22:25:04 +0300 Subject: [PATCH 20/26] WIP: Write tensor --- constants.py | 5 ++-- gguf.py | 68 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/constants.py b/constants.py index 3a97460e5..34880bb20 100644 --- a/constants.py +++ b/constants.py @@ -1,5 +1,6 @@ -GGUF_MAGIC = 0x47475546 -GGUF_VERSION = 1 +GGUF_MAGIC = 0x47475546 +GGUF_VERSION = 1 +GGUF_DEFAULT_ALIGNMENT = 32 # general KEY_GENERAL_ARCHITECTURE = "general.architecture" diff --git a/gguf.py b/gguf.py index 764ae9a9d..1f24cb9d5 100644 --- a/gguf.py +++ b/gguf.py @@ -8,11 +8,14 @@ import struct import constants from enum import IntEnum -from typing import List, Any +from typing import Any, IO, List, Sequence + +import numpy as np + class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 + F32 = 0 + F16 = 1 QR_0 = 2 Q4_1 = 3 # Q4_2 = 4 # support has been removed @@ -30,16 +33,16 @@ class GGMLQuantizationType(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 + BOOL = 7 + STRING = 8 + ARRAY = 9 @staticmethod def get_type(val): @@ -54,15 +57,18 @@ class GGUFValueType(IntEnum): else: return GGUFValueType.INT32 + class GGUFWriter: - def __init__(self, buffered_writer): - self.buffered_writer = buffered_writer + def __init__(self, fout: IO): + self.fout = fout + self.offset_tensor = 0 + self.tensors = [] def write_header(self, tensor_count: int, metadata_kv_count: int): - self.buffered_writer.write(struct.pack(" "GGUFWriter": @@ -148,11 +154,33 @@ class GGUFWriter: else: raise ValueError("Invalid GGUF metadata value type") + @staticmethod + def ggml_pad(x: int, n: int) -> int: + return ((x + n - 1) // n) * n + + def write_tensor_info(self, name: str, tensor: np.ndarray): + self.write_val(key, GGUFValueType.STRING) + n_dims = len(tensor.shape) + self.write_val(n_dims, GGUFValueType.INT32) + for i in range(n_dims): + self.write_val(tensor.shape[N_dims - 1 - i], GGUFValueType.INT32) + + dtype = GGMLQuantizationType.F32 if tensor.dtype == np.float32 else GGMLQuantizationType.F16 + self.write_val(dtype, GGUFValueType.INT32) + self.fout.write(struct.pack(" Date: Thu, 27 Jul 2023 23:09:53 +0300 Subject: [PATCH 21/26] GGUF : Support writing tensors in Python --- gguf.py | 8 +++++++- 1 file changed, 
7 insertions(+), 1 deletion(-) diff --git a/gguf.py b/gguf.py index 1f24cb9d5..f1e2c69a2 100644 --- a/gguf.py +++ b/gguf.py @@ -62,7 +62,7 @@ class GGUFWriter: def __init__(self, fout: IO): self.fout = fout self.offset_tensor = 0 - self.tensors = [] + self.tensors: List[np.ndarray] = [] def write_header(self, tensor_count: int, metadata_kv_count: int): self.fout.write(struct.pack(" Date: Thu, 27 Jul 2023 23:25:47 +0300 Subject: [PATCH 22/26] refactor : rm unused import and upd todos --- gguf.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gguf.py b/gguf.py index f1e2c69a2..31a03c43e 100644 --- a/gguf.py +++ b/gguf.py @@ -1,14 +1,13 @@ """TODOs -1. Implement writing tensor data with alignment. -2. Implement writers for known architectures, LLaMA in particular. -3. Add docstrings from the format specs. -4. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org. +1. Implement writers for known architectures, LLaMA in particular. +2. Add docstrings from the format specs. +3. After development is done, Convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org. """ import struct import constants from enum import IntEnum -from typing import Any, IO, List, Sequence +from typing import Any, IO, List import numpy as np From 62f4926bdeef45df2563398d3305a07a0c4bb6b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 28 Jul 2023 00:04:19 +0300 Subject: [PATCH 23/26] fix : fix errors upd writing example --- example.gguf | Bin 0 -> 928 bytes gguf.py | 46 ++++++++++++++++++++++++---------------------- 2 files changed, 24 insertions(+), 22 deletions(-) create mode 100644 example.gguf diff --git a/example.gguf b/example.gguf new file mode 100644 index 0000000000000000000000000000000000000000..42a152e7491156d432ce40933357f0979264a8b9 GIT binary patch literal 928 zcma)&dq`7J9LMJ~v1u04+mO^bo!jnp_wKyhIhmG9m^sSmX`9YR=~ml(Lq97|PE2VqEqWR(K?t9t2kWrwMM{xhmylR-b!Dp|SjlZ5p9Da_M#z~r+mfOV{#L@PHZ zv+}ljdg5>@ElzHr$%&V!<8&vjSUsO9&B3fL$I3S5TIszPopes0inf@mX!W)d)^sVC z`UK8n%9rCQYjv~ZOOlvs*O2M%AuW4a;-M&N8oL&=hI!ikS;eL*`bKMJBTt9v`938} zzMaOz{TvM#p2B|AtYb<2HS}ETOg8!1YF3CftouS4^-|Ab%fpGO{$K_T+N)-+ty=o~ zk8(gF$_DAsL9Rv&kP&j09E5?ujN7?exEW0rufvIP?@(v#4Y(Bb$6&(`y(hjFhr#J! 
z5PXcWfGncGuZ&Ke-N*@9I8D^RN&KJ~9rsme6B5u07zx)BUU-Cu!5zr)yEP%&V15D^ z3%X(!K`;sMkQ8$cGb&n-W!CA%D9W_#ClmAfgH;>++cuwA8tUSeicodC#E ay)TZ6&x0fW3-DqAp$4JRAmZ`57XBA>VHP3) literal 0 HcmV?d00001 diff --git a/gguf.py b/gguf.py index 31a03c43e..0f94b963e 100644 --- a/gguf.py +++ b/gguf.py @@ -124,30 +124,30 @@ class GGUFWriter: if vtype is None: vtype = GGUFValueType.get_type(val) - self.buffered_writer.write(struct.pack(" Date: Fri, 28 Jul 2023 00:06:47 +0300 Subject: [PATCH 24/26] rm example.gguf --- example.gguf | Bin 928 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 example.gguf diff --git a/example.gguf b/example.gguf deleted file mode 100644 index 42a152e7491156d432ce40933357f0979264a8b9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 928 zcma)&dq`7J9LMJ~v1u04+mO^bo!jnp_wKyhIhmG9m^sSmX`9YR=~ml(Lq97|PE2VqEqWR(K?t9t2kWrwMM{xhmylR-b!Dp|SjlZ5p9Da_M#z~r+mfOV{#L@PHZ zv+}ljdg5>@ElzHr$%&V!<8&vjSUsO9&B3fL$I3S5TIszPopes0inf@mX!W)d)^sVC z`UK8n%9rCQYjv~ZOOlvs*O2M%AuW4a;-M&N8oL&=hI!ikS;eL*`bKMJBTt9v`938} zzMaOz{TvM#p2B|AtYb<2HS}ETOg8!1YF3CftouS4^-|Ab%fpGO{$K_T+N)-+ty=o~ zk8(gF$_DAsL9Rv&kP&j09E5?ujN7?exEW0rufvIP?@(v#4Y(Bb$6&(`y(hjFhr#J! z5PXcWfGncGuZ&Ke-N*@9I8D^RN&KJ~9rsme6B5u07zx)BUU-Cu!5zr)yEP%&V15D^ z3%X(!K`;sMkQ8$cGb&n-W!CA%D9W_#ClmAfgH;>++cuwA8tUSeicodC#E ay)TZ6&x0fW3-DqAp$4JRAmZ`57XBA>VHP3) From 0c43a3b7d8a67b8de442190d63ee3cfcbd0d3bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 28 Jul 2023 00:07:28 +0300 Subject: [PATCH 25/26] gitignore *.gguf --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index abe8e28cb..fd41f26cc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *.o *.a *.so +*.gguf .DS_Store .build/ .cache/ From 511055722e925fcd2561cfe45627a82582b53008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Fri, 28 Jul 2023 09:09:14 +0300 Subject: [PATCH 26/26] undo formatting --- gguf.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/gguf.py b/gguf.py index 0f94b963e..c5b2174c9 100644 --- a/gguf.py +++ b/gguf.py @@ -13,8 +13,8 @@ import numpy as np class GGMLQuantizationType(IntEnum): - F32 = 0 - F16 = 1 + F32 = 0 + F16 = 1 QR_0 = 2 Q4_1 = 3 # Q4_2 = 4 # support has been removed @@ -32,16 +32,16 @@ class GGMLQuantizationType(IntEnum): class GGUFValueType(IntEnum): - UINT8 = 0 - INT8 = 1 - UINT16 = 2 - INT16 = 3 - UINT32 = 4 - INT32 = 5 + UINT8 = 0 + INT8 = 1 + UINT16 = 2 + INT16 = 3 + UINT32 = 4 + INT32 = 5 FLOAT32 = 6 - BOOL = 7 - STRING = 8 - ARRAY = 9 + BOOL = 7 + STRING = 8 + ARRAY = 9 @staticmethod def get_type(val):
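
For orientation, below is a minimal, self-contained sketch of the on-disk layout the WIP GGUFWriter in these patches is building toward: header (magic, version, tensor count, metadata KV count), metadata key-value pairs, tensor infos, zero padding up to GGUF_DEFAULT_ALIGNMENT, then the raw tensor data. This is not the gguf.py writer itself: the little-endian byte order, the 32-bit widths for counts, string lengths and type ids, the 64-bit per-tensor data offset, and the example key and tensor names are assumptions for illustration, not taken from the patches above.

# Standalone sketch of the GGUF v1 byte layout targeted by the WIP writer.
# Assumed, not confirmed by the patches: little-endian, 32-bit counts/lengths/
# type ids, 64-bit per-tensor offset, and the output file name.
import struct

import numpy as np

GGUF_MAGIC             = 0x47475546
GGUF_VERSION           = 1
GGUF_DEFAULT_ALIGNMENT = 32

GGUF_TYPE_STRING = 8   # GGUFValueType.STRING
GGML_TYPE_F32    = 0   # GGMLQuantizationType.F32


def ggml_pad(x: int, n: int) -> int:
    # same rounding helper as GGUFWriter.ggml_pad: round x up to a multiple of n
    return ((x + n - 1) // n) * n


def write_str(fout, s: str):
    data = s.encode("utf8")
    fout.write(struct.pack("<I", len(data)))   # length prefix (width assumed)
    fout.write(data)


def write_kv_str(fout, key: str, value: str):
    # one metadata pair, assumed encoding: key string, value type id, value
    write_str(fout, key)
    fout.write(struct.pack("<I", GGUF_TYPE_STRING))
    write_str(fout, value)


def write_tensor_info(fout, name: str, tensor: np.ndarray, offset: int):
    # mirrors write_tensor_info above: name, n_dims, reversed shape, dtype, offset
    write_str(fout, name)
    n_dims = len(tensor.shape)
    fout.write(struct.pack("<I", n_dims))
    for i in range(n_dims):
        fout.write(struct.pack("<I", tensor.shape[n_dims - 1 - i]))
    fout.write(struct.pack("<I", GGML_TYPE_F32))
    fout.write(struct.pack("<Q", offset))      # offset into the tensor data section


if __name__ == "__main__":
    tensor = np.ones((4, 8), dtype=np.float32)

    with open("example-sketch.gguf", "wb") as fout:   # file name is illustrative
        # header: magic, version, tensor count, metadata kv count
        fout.write(struct.pack("<I", GGUF_MAGIC))
        fout.write(struct.pack("<I", GGUF_VERSION))
        fout.write(struct.pack("<I", 1))
        fout.write(struct.pack("<I", 1))

        # metadata
        write_kv_str(fout, "general.name", "sketch")

        # tensor infos
        write_tensor_info(fout, "weights", tensor, offset=0)

        # pad with zeros to the alignment boundary, then write the tensor data
        pad = ggml_pad(fout.tell(), GGUF_DEFAULT_ALIGNMENT) - fout.tell()
        fout.write(bytes(pad))
        fout.write(tensor.tobytes())

The explicit offset and padding here stand in for what the patched GGUFWriter does incrementally: it collects tensors in self.tensors and advances offset_tensor by the padded tensor size, so the data section can be written out after all tensor infos.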