ggml, llama: align structs for memory optimization on 64-bit platforms

- ggml_type_traits_t (80 -> 72 bytes)
- llama_batch (72 -> 64 bytes)
- llama_model_params (56 -> 48 bytes)
- hash_node (32 -> 24 bytes)
- ggml_compute_state (32 -> 24 bytes)
- gguf_tensor_info (88 -> 80 bytes)
Herman Semenov 2024-05-13 18:38:48 -05:00
parent b228aba91a
commit 2a9a84be7d
10 changed files with 26 additions and 27 deletions
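Every hunk below applies the same rule: on 64-bit targets, order struct fields so that narrow members fill the padding slots that wider, stricter-aligned members would otherwise force. As a worked example, here is a minimal sketch of the llama_batch case from the summary (72 -> 64 bytes), assuming an LP64 ABI with 8-byte pointers. batch_before and batch_after are illustrative stand-ins, not llama.cpp types, with llama_token, llama_pos and llama_seq_id spelled out as the int32_t they are typedef'd to in llama.h.

    #include <cstdint>
    #include <cstdio>

    struct batch_before {              // old field order
        int32_t   n_tokens;            // 4 bytes + 4 padding (next member needs 8-byte alignment)
        int32_t  *token;               // six pointers, 48 bytes
        float    *embd;
        int32_t  *pos;
        int32_t  *n_seq_id;
        int32_t **seq_id;
        int8_t   *logits;
        int32_t   all_pos_0;           // 3 x 4 = 12 bytes, padded to 16
        int32_t   all_pos_1;
        int32_t   all_seq_id;
    };                                 // 8 + 48 + 16 = 72 bytes

    struct batch_after {               // new field order: pointers first
        int32_t  *token;
        float    *embd;
        int32_t  *pos;
        int32_t  *n_seq_id;
        int32_t **seq_id;
        int8_t   *logits;
        int32_t   n_tokens;            // packs with the three ints below: 4 x 4 = 16
        int32_t   all_pos_0;
        int32_t   all_pos_1;
        int32_t   all_seq_id;
    };                                 // 48 + 16 = 64 bytes

    int main() {
        printf("%zu -> %zu\n", sizeof(batch_before), sizeof(batch_after)); // 72 -> 64 on LP64
    }

Because llama_batch is brace-initialized positionally at many call sites, the reorder also forces every such initializer to move n_tokens from the first slot to the seventh, which is why most of the hunks below touch example and server code rather than the struct itself.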

@@ -140,13 +140,13 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token    + i,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, int32_t(n_eval), *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
             LOG_TEE("%s : failed to eval\n", __func__);
             return false;

@@ -301,13 +301,13 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token    + i,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -691,13 +691,13 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token    + i,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -1072,13 +1072,13 @@ struct server_context {
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);

             llama_batch batch_view = {
-                n_tokens,
                 batch.token    + i,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
+                n_tokens,
                 0, 0, 0, // unused
             };
@@ -2195,13 +2195,13 @@ struct server_context {
             }

             llama_batch batch_view = {
-                n_tokens,
                 batch.token    + i,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
+                n_tokens,
                 0, 0, 0, // unused
             };

@@ -334,8 +334,8 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
+    size_t offset; // offset within the buffer
 };

 struct tensor_alloc {
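The arithmetic behind hash_node's 32 -> 24 bytes from the summary, assuming 4-byte int and 8-byte size_t: in the old order the three ints take 12 bytes, 4 bytes of padding follow so the 8-byte offset can start on an 8-byte boundary, and the trailing bool rounds 25 bytes up to 32. In the new order, allocated drops into the padding right after buffer_id (12 + 1 + 3 = 16) and offset finishes the struct at 24.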

ggml.c (4 changes)

@@ -19149,8 +19149,8 @@ struct ggml_compute_state_shared {
 struct ggml_compute_state {
     ggml_thread_t thrd;
     int ith;
-    struct ggml_compute_state_shared * shared;
     enum ggml_status ec;
+    struct ggml_compute_state_shared * shared;
 };

 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
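Same arithmetic for ggml_compute_state's 32 -> 24, assuming ggml_thread_t (pthread_t on POSIX builds) is 8 bytes: the old order pads 4 bytes after ith before the 8-byte shared pointer and another 4 after the 4-byte ec, while the new order packs ith and ec into a single 8-byte slot (8 + 4 + 4 + 8 = 24).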
@@ -21706,8 +21706,8 @@ struct gguf_header {
 struct gguf_tensor_info {
     struct gguf_str name;
-    uint32_t n_dims;
     uint64_t ne[GGML_MAX_DIMS];
+    uint32_t n_dims;
     enum ggml_type type;
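For gguf_tensor_info's 88 -> 80: gguf_str is a 16-byte {uint64_t, char *} pair and GGML_MAX_DIMS is 4, so ne occupies 32 bytes with 8-byte alignment. The old order needed 4 padding bytes after the 4-byte n_dims to align ne, and 4 more after the 4-byte type enum before the 8-byte-aligned fields that follow; the new order pairs n_dims and type into one 8-byte slot after ne, eliminating both pads.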

ggml.h (2 changes)

@@ -2412,9 +2412,9 @@ extern "C" {
     typedef struct {
         const char      * type_name;
+        bool              is_quantized;
         int               blck_size;
         size_t            type_size;
-        bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
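And for ggml_type_traits_t's 80 -> 72: with is_quantized after the 8-byte type_size, the lone bool needed 7 padding bytes before the first 8-byte function pointer, and blck_size left another 4 before type_size. Moved up next to type_name, the bool and the 4-byte blck_size share a single 8-byte slot (1 + 3 + 4), so type_size lands aligned with no gap.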

@@ -11428,13 +11428,13 @@ static int llama_decode_internal(
     for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
         const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
         llama_batch u_batch = {
-            /* .n_tokens   = */ (int32_t) n_tokens,
             /* .token      = */ batch_all.token    ? batch_all.token    + cur_token        : nullptr,
             /* .embd       = */ batch_all.embd     ? batch_all.embd     + cur_token*n_embd : nullptr,
             /* .pos        = */ batch_all.pos      ? batch_all.pos      + cur_token        : nullptr,
             /* .n_seq_id   = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token        : nullptr,
             /* .seq_id     = */ batch_all.seq_id   ? batch_all.seq_id   + cur_token        : nullptr,
             /* .logits     = */ batch_all.logits   ? batch_all.logits   + cur_token        : nullptr,
+            /* .n_tokens   = */ (int32_t) n_tokens,
             /* .all_pos_0  = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
             /* .all_pos_1  = */ batch_all.all_pos_1,
             /* .all_seq_id = */ batch_all.all_seq_id,
@@ -15310,13 +15310,13 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
-        /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu                    =*/ 0,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
@@ -17293,13 +17293,13 @@ struct llama_batch llama_batch_get_one(
         llama_pos    pos_0,
         llama_seq_id seq_id) {
     return {
-        /*n_tokens   =*/ n_tokens,
         /*tokens     =*/ tokens,
         /*embd       =*/ nullptr,
         /*pos        =*/ nullptr,
         /*n_seq_id   =*/ nullptr,
         /*seq_id     =*/ nullptr,
         /*logits     =*/ nullptr,
+        /*n_tokens   =*/ n_tokens,
         /*all_pos_0  =*/ pos_0,
         /*all_pos_1  =*/ 1,
         /*all_seq_id =*/ seq_id,
@@ -17307,7 +17307,7 @@ struct llama_batch llama_batch_get_one(
 }

 struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);

llama.h (21 changes)

@@ -190,8 +190,6 @@ extern "C" {
     //   - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
-        int32_t n_tokens;
-
         llama_token  *  token;
         float        *  embd;
         llama_pos    *  pos;
@@ -199,6 +197,7 @@ extern "C" {
         llama_seq_id ** seq_id;
        int8_t       *  logits; // TODO: rename this to "output"
+        int32_t n_tokens;

         // NOTE: helpers for smooth API transition - can be deprecated in the future
         //       for future-proof code, use the above fields instead and ignore everything below
         //
@@ -230,15 +229,6 @@ extern "C" {
     };

     struct llama_model_params {
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
-
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
@@ -253,6 +243,15 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;

+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;  // only load the vocabulary, no weights
         bool use_mmap;    // use mmap if possible
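Finally, llama_model_params' 56 -> 48, again on LP64: the old order spends 12 bytes on n_gpu_layers, split_mode and main_gpu plus 4 bytes of padding before the four 8-byte pointer fields, and the trailing bool flags round the total up to 56. With the pointers first (32 bytes), the three 4-byte fields and the bools pack into the final 16 bytes, for 48.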