Make loading weights 10-100x faster

This is a breaking change that's going to give you three benefits: 1. Your inference commands should load 100x faster 2. You may be able to safely load models 2x larger 3. You can run many concurrent inference processes This was accomplished by changing the file format so we can mmap() weights directly into memory without having to read() or copy them thereby ensuring the kernel can make its file cache pages directly accessible to our inference processes; and secondly, that the file cache pages are much less likely to get evicted (which would force loads to hit disk) because they're no longer competing with memory pages that were needlessly created by gigabytes of standard i/o. The new file format supports single-file models like LLaMA 7b, and it also supports multi-file models like LLaMA 13B. Our Python tool now merges the foo.1, foo.2, etc. files back into a single file so that the C++ code which maps it doesn't need to reshape data every time. That's made llama.cpp so much simpler. Much of its load code has now been deleted. Furthermore, this change ensures that tensors are aligned properly on a 32-byte boundary. That opens the door to seeing if we can get additional performance gains on some microprocessors, by using ops that require memory alignment. Lastly note that both POSIX and the Windows platform are supported Fixes #91
2023-03-29 13:51:37 -07:00 · 2023-03-29 13:51:37 -07:00 · 78ca9838ee
commit 78ca9838ee
parent a017390358
7 changed files with 336 additions and 375 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -12,17 +12,19 @@
 #include <cassert>
 #include <cstring>

-// mmap
-#if defined (__unix__) || defined (__APPLE__)
-#   include <sys/mman.h>
-#   include <fcntl.h>
-#   include <unistd.h>
-#elif defined(_WIN32)
-#   define WIN32_LEAN_AND_MEAN
-#   include <Windows.h>
-//#include <Memoryapi.h>
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#else
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
 #endif

+#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
+#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

@ -155,7 +157,7 @@ struct llama_model {

    // model memory mapped file
    void * mm_addr = NULL;
-    size_t mm_length = 0;
+    uint64_t mm_length = 0;

    // tensors
    int n_loaded;
@ -180,6 +182,7 @@ struct llama_context {

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;
+    bool has_evaluated_once = false;

    int64_t t_sample_us = 0;
    int64_t t_eval_us   = 0;
@ -221,7 +224,7 @@ struct llama_context {
        }

        if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
        }

        buf_last = i;
@ -304,59 +307,57 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //

-static void mmap_file(const char* fname, void * &mm_addr, size_t &mm_length) {
-#if defined(MAP_FAILED)
-    // POSIX
-    int fd = open(fname, O_RDONLY);
-    mm_length = lseek(fd, 0, SEEK_END);
-    mm_addr = mmap(NULL, mm_length, PROT_READ, MAP_SHARED, fd, 0);
-    close(fd);
-    if (mm_addr == MAP_FAILED) {
-        perror("mmap failed");
-        mm_addr = NULL;
-        mm_length = 0;
-    }
-#elif defined(_WIN32)
-    mm_addr = NULL;
-    
-    HANDLE hFile = CreateFileA(filename, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
-    if (hFile == INVALID_HANDLE_VALUE) {
-        return;
-    }
-
-    // not really necessary
+static void *mmap_file(const char *fname, uint64_t *mm_length) {
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+    HANDLE hFile = CreateFileA(fname,
+                               GENERIC_READ,
+                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                               NULL,
+                               OPEN_EXISTING,
+                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
+                               NULL);
+    if (hFile == INVALID_HANDLE_VALUE) return 0;
    LARGE_INTEGER fileSize;
+    fileSize.QuadPart = -1;
    GetFileSizeEx(hFile, &fileSize);
-    mm_length = fileSize;
-
+    int64_t length = fileSize.QuadPart;
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    CloseHandle(hFile);
-
-    if (hMapping == NULL) {
-        return;
-    }
-
-    mm_addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+    if (!hMapping) return 0;
+    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
    CloseHandle(hMapping);
+    if (!addr) return 0;
 #else
-    mm_addr = NULL;
-    mm_length = 0;
-    (void)(fname); // suppress warnings
+    int fd = open(fname, O_RDONLY);
+    if (fd == -1) return 0;
+    int64_t length = lseek(fd, 0, SEEK_END);
+    void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+    close(fd);
+    if (addr == MAP_FAILED) return 0;
 #endif
+    *mm_length = length;
+    return addr;
 }

 static void munmap_file(void * addr, size_t length) {
-#if defined(MAP_FAILED)
-    // POSIX
-    munmap(addr, length);
-#elif defined(_WIN32)
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
    UnmapViewOfFile(addr);
 #else
-    (void)(addr); // suppress warnings
-    (void)(length);
+    munmap(addr, length);
 #endif
 }

+static bool report_bad_magic(const char *path) {
+    fprintf(stderr,
+            "%s: invalid model file (bad magic)\n"
+            "you most likely need to regenerate your ggml files\n"
+            "the benefit is you'll get 10-100x faster load times\n"
+            "see https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "use convert-pth-to-ggml.py on your llama model files\n",
+            path);
+    return false;
+}
+
 static bool llama_model_load(
        const std::string & fname,
        llama_context & lctx,
@ -368,23 +369,24 @@ static bool llama_model_load(
        void *progress_callback_user_data) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

-    const int64_t t_start_us = ggml_time_us();
-
-    lctx.t_start_us = t_start_us;
-
-    // TODO: this could probably be smaller when using mmap
-    std::vector<char> f_buf(1024*1024);
+    lctx.t_start_us = ggml_time_us();

    auto & model = lctx.model;
    auto & vocab = lctx.vocab;

    auto fin = std::ifstream(fname, std::ios::binary);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+
+    fin.seekg(0, fin.end);
+    const size_t file_size = fin.tellg();
+    fin.seekg(0);
+
    // verify magic
    {
        uint32_t magic;
@ -395,8 +397,7 @@ static bool llama_model_load(
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
+            return report_bad_magic(fname.c_str());
        }

        uint32_t format_version;
@ -519,54 +520,24 @@ static bool llama_model_load(
                }
    }

-    bool use_mmap = (n_parts == 1);
-
-    // try to memory map the model file
-    void * mm_addr = NULL;
-    if (use_mmap) {
-        mmap_file(fname.c_str(), model.mm_addr, model.mm_length);
-        if (model.mm_addr == NULL) {
-            use_mmap = false;
-        }
-        else {
-            mm_addr = model.mm_addr;
-        }
+    // map model into memory
+    char *mm_addr = NULL;
+    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
+    if (model.mm_addr == NULL) {
+        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+        return false;
    }
+    mm_addr = (char *)model.mm_addr;
+    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));

    auto & ctx = model.ctx;

    size_t ctx_size = 0;
    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
+        const auto &hparams = model.hparams;
        const int n_layer = hparams.n_layer;
-        const int n_vocab = hparams.n_vocab;
-
-        if (!use_mmap) {
-            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
-
-            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
-
-            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
-
-            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
-
-            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
-
-            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
-
-            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-        }
-
        ctx_size += (5 + 10*n_layer)*256; // object overhead
-
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
    }

    // print memory requirements
@ -576,6 +547,7 @@ static bool llama_model_load(
        // this is the total memory required to run the inference
        const size_t mem_required =
            ctx_size +
+            model.mm_length +
            MEM_REQ_SCRATCH0.at(model.type) +
            MEM_REQ_SCRATCH1.at(model.type) +
            MEM_REQ_EVAL.at    (model.type);
@ -595,7 +567,7 @@ static bool llama_model_load(
        struct ggml_init_params params = {
            /*.mem_size   =*/ lctx.model.buf.size(),
            /*.mem_buffer =*/ lctx.model.buf.data(),
-            /*.no_alloc   =*/ use_mmap,
+            /*.no_alloc   =*/ true,
        };

        model.ctx = ggml_init(params);
@ -658,241 +630,106 @@ static bool llama_model_load(
        }
    }

-    const size_t file_offset = fin.tellg();
-
-    fin.close();
-
    std::vector<uint8_t> tmp;

    if (progress_callback) {
        progress_callback(0.0, progress_callback_user_data);
    }

-    for (int i = 0; i < n_parts; ++i) {
-        const int part_id = i;
-        //const int part_id = n_parts - i - 1;
+    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());

-        std::string fname_part = fname;
-        if (i > 0) {
-            fname_part += "." + std::to_string(i);
-        }
+    // load weights
+    {
+        size_t total_size = 0;
+        model.n_loaded = 0;

-        fprintf(stderr, "%s: loading model part %d/%d from '%s'%s\n", __func__, i+1, n_parts, fname_part.c_str(), use_mmap ? " (memory mapped)" : "");
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ftype;

-        fin = std::ifstream(fname_part, std::ios::binary);
-        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));

-        fin.seekg(0, fin.end);
-        const size_t file_size = fin.tellg();
-
-        fin.seekg(file_offset);
-
-        // load weights
-        {
-            size_t total_size = 0;
-
-            model.n_loaded = 0;
-
-            fprintf(stderr, "%s: ", __func__);
-
-            while (true) {
-                int32_t n_dims;
-                int32_t length;
-                int32_t ftype;
-
-                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-                fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-
-                if (fin.eof()) {
-                    break;
-                }
-
-                int32_t nelements = 1;
-                int32_t ne[2] = { 1, 1 };
-                for (int i = 0; i < n_dims; ++i) {
-                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                    nelements *= ne[i];
-                }
-
-                std::string name(length, 0);
-                fin.read(&name[0], length);
-
-                if (model.tensors.find(name.data()) == model.tensors.end()) {
-                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                    return false;
-                }
-
-                // split_type = 0: split by columns
-                // split_type = 1: split by rows
-                int split_type = 0;
-
-                // split_type = 0:
-                // regex:
-                //   - tok_embeddings.*
-                //   - layers.*.attention.wo.weight
-                //   - layers.*.feed_forward.w2.weight
-
-                // split_type = 1:
-                // regex:
-                //   - output.*
-                //   - layers.*.attention.wq.weight
-                //   - layers.*.attention.wk.weight
-                //   - layers.*.attention.wv.weight
-                //   - layers.*.feed_forward.w1.weight
-                //   - layers.*.feed_forward.w3.weight
-                if (name.find("tok_embeddings") != std::string::npos) {
-                    split_type = 0;
-                } else if (name.find("layers") != std::string::npos) {
-                    if (name.find("attention.wo.weight") != std::string::npos) {
-                        split_type = 0;
-                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
-                        split_type = 0;
-                    } else {
-                        split_type = 1;
-                    }
-                } else if (name.find("output") != std::string::npos) {
-                    split_type = 1;
-                }
-
-                auto tensor = model.tensors[name.data()];
-
-                if (n_dims == 1) {
-                    if (ggml_nelements(tensor) != nelements) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                        return false;
-                    }
-                } else {
-                    if (ggml_nelements(tensor)/n_parts != nelements) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                        return false;
-                    }
-                }
-
-                if (n_dims == 1) {
-                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                        return false;
-                    }
-                } else {
-                    if (split_type == 0) {
-                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
-                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
-                            return false;
-                        }
-                    } else {
-                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
-                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
-                            return false;
-                        }
-                    }
-                }
-
-                if (0) {
-                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
-                }
-
-                size_t bpe = 0;
-
-                switch (ftype) {
-                    case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
-                    case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
-                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                    default:
-                            {
-                                fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                                return false;
-                            }
-                };
-
-                if (n_dims == 1 || n_parts == 1) {
-                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                        return false;
-                    }
-
-                    if (part_id == 0) {
-                        if (mm_addr) {
-                            off_t offset = fin.tellg();
-                            tensor->data = (char *) mm_addr + offset;
-                            fin.seekg(ggml_nbytes(tensor), std::ios::cur);
-                        }
-                        else {
-                            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-                        }
-                    } else {
-                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
-                    }
-
-                    total_size += ggml_nbytes(tensor);
-                } else {
-                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
-                        return false;
-                    }
-
-                    if (split_type == 0) {
-                        const int np0 = ne[0];
-
-                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                        assert(row_size == tensor->nb[1]);
-
-                        for (int i1 = 0; i1 < ne[1]; ++i1) {
-                            const size_t offset_row = i1*row_size;
-                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
-                        }
-                    } else {
-                        const int np1 = ne[1];
-
-                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-
-                        for (int i1 = 0; i1 < ne[1]; ++i1) {
-                            const size_t offset_row = (i1 + part_id*np1)*row_size;
-                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
-                        }
-                    }
-
-                    total_size += ggml_nbytes(tensor)/n_parts;
-                }
-
-                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-                model.n_loaded++;
-
-                // progress
-                if (progress_callback) {
-                    float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
-                    float current_progress = (float(i) + current_file_progress) / float(n_parts);
-                    progress_callback(current_progress, progress_callback_user_data);
-                }
-                if (model.n_loaded % 8 == 0) {
-                    fprintf(stderr, ".");
-                    fflush(stderr);
-                }
+            if (fin.eof()) {
+                break;
            }

-            fprintf(stderr, " done\n");
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }

-            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
-            if (model.n_loaded == 0) {
-                fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-            } else if (model.n_loaded != (int) model.tensors.size()) {
-                fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            std::string name(length, 0);
+            fin.read(&name[0], length);
+
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }
+
+            auto tensor = model.tensors[name.data()];
+
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                return false;
+            }
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+            if (0) {
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+            }
+
+            switch (ftype) {
+                case 0:  // f32
+                case 1:  // f16
+                    break;
+                case 2:  // q4_0
+                case 3:  // q4_1
+                    assert(ne[0] % 64 == 0);
+                    break;
+                default:
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                    return false;
+            };
+
+            // load the tensor data into memory without copying or reading it
+            size_t offset = fin.tellg();
+            size_t tensor_data_size = ggml_nbytes(tensor);
+            offset = (offset + 31) & -32;
+            tensor->data = mm_addr + offset;
+            fin.seekg(offset + tensor_data_size);
+            total_size += tensor_data_size;
+            model.n_loaded++;
+
+            // progress
+            if (progress_callback) {
+                double current_progress = size_t(fin.tellg()) / double(file_size);
+                progress_callback(current_progress, progress_callback_user_data);
+            }
        }

        fin.close();
+
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+        if (model.n_loaded == 0) {
+            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+        } else if (model.n_loaded != (int) model.tensors.size()) {
+            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            return false;
+        }
    }

-    lctx.t_load_us = ggml_time_us() - t_start_us;
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;

    if (progress_callback) {
        progress_callback(1.0, progress_callback_user_data);
@ -1216,7 +1053,7 @@ struct llama_tokenizer {
        size_t offs = 0;
        while (offs < text.size()) {
            llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
            sym.text = text.c_str() + offs;
            sym.n = char_len;
            offs += char_len;
@ -1381,7 +1218,7 @@ static llama_vocab::id llama_sample_top_p_top_k(

    float maxl = -std::numeric_limits<float>::infinity();
    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
+        maxl = Max(maxl, kv.first);
    }

    // compute probs for the top k tokens
@ -1475,8 +1312,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
+            return report_bad_magic(fname_inp.c_str());
        }

        fout.write((char *) &magic, sizeof(magic));
@ -1542,8 +1378,8 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
            fout.write((char *) &len, sizeof(len));

            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
+            finp.read ((char *) &word[0], len);
+            fout.write((char *) &word[0], len);

            float score;
            finp.read ((char *) &score, sizeof(score));
@ -1593,6 +1429,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
            std::string name(length, 0);
            finp.read (&name[0], length);

+            {
+                // ensure tensor data is aligned
+                uint64_t offset = finp.tellg();
+                offset = (offset + 31) & -32;
+                finp.seekg(offset);
+            }
+
            {
                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@ -1648,6 +1491,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
            }
            fout.write(&name[0], length);

+            {
+                // ensure tensor data is aligned
+                uint64_t offset = fout.tellp();
+                offset = (offset + 31) & -32;
+                fout.seekp(offset);
+            }
+
            if (quantize) {
                printf("quantizing .. ");
                work.resize(nelements); // for quantization
@ -1824,7 +1674,11 @@ int llama_eval(
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
-
+    // get a more accurate load time, upon first eval
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
    return 0;
 }

@ -1917,9 +1771,9 @@ llama_token llama_sample_top_p_top_k(
 void llama_print_timings(struct llama_context * ctx) {
    const int64_t t_end_us = ggml_time_us();

-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval   = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+    const int32_t n_sample = Max(1, ctx->n_sample);
+    const int32_t n_eval   = Max(1, ctx->n_eval);
+    const int32_t n_p_eval = Max(1, ctx->n_p_eval);

    fprintf(stderr, "\n");
    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@ -1931,7 +1785,6 @@ void llama_print_timings(struct llama_context * ctx) {

 void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us = ggml_time_us();
-
    ctx->t_sample_us = ctx->n_sample = 0;
    ctx->t_eval_us   = ctx->n_eval   = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;