diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 2924d8116..e7a83198d 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -140,13 +140,13 @@ int main(int argc, char ** argv) {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
 
             llama_batch batch_view = {
-                n_tokens,
                 batch.token + i,
                 nullptr,
                 batch.pos + i,
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
+                n_tokens,
                 0, 0, 0, // unused
             };
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 9a990bb18..5da101105 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, int32_t(n_eval), *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
             LOG_TEE("%s : failed to eval\n", __func__);
             return false;
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7c5595d6e..a91fa8294 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -301,13 +301,13 @@ int main(int argc, char ** argv) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
 
            llama_batch batch_view = {
-                n_tokens,
                batch.token + i,
                nullptr,
                batch.pos + i,
                batch.n_seq_id + i,
                batch.seq_id + i,
                batch.logits + i,
+                n_tokens,
                0, 0, 0, // unused
            };
 
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index db6e0949d..15bdc725f 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -691,13 +691,13 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
 
         llama_batch batch_view = {
-            n_tokens,
             batch.token + i,
             nullptr,
             batch.pos + i,
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
+            n_tokens,
             0, 0, 0, // unused
         };
 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ceaeb1f76..40b20bf33 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1072,13 +1072,13 @@ struct server_context {
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
             llama_batch batch_view = {
-                n_tokens,
                 batch.token + i,
                 nullptr,
                 batch.pos + i,
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
+                n_tokens,
                 0, 0, 0, // unused
             };
 
@@ -2195,13 +2195,13 @@ struct server_context {
                 }
 
                 llama_batch batch_view = {
-                    n_tokens,
                     batch.token + i,
                     nullptr,
                     batch.pos + i,
                     batch.n_seq_id + i,
                     batch.seq_id + i,
                     batch.logits + i,
+                    n_tokens,
                     0, 0, 0, // unused
                 };
 
diff --git a/ggml-alloc.c b/ggml-alloc.c
index 1fbd376ed..e2998defb 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -334,8 +334,8 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
+    size_t offset; // offset within the buffer
 };
 
 struct tensor_alloc {
diff --git a/ggml.c b/ggml.c
index b96a82a41..1937a6f86 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19149,8 +19149,8 @@ struct ggml_compute_state_shared {
 struct ggml_compute_state {
     ggml_thread_t thrd;
     int ith;
-    struct ggml_compute_state_shared * shared;
     enum ggml_status ec;
+    struct ggml_compute_state_shared * shared;
 };
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -21706,8 +21706,8 @@ struct gguf_header {
 struct gguf_tensor_info {
     struct gguf_str name;
 
-    uint32_t n_dims;
     uint64_t ne[GGML_MAX_DIMS];
+    uint32_t n_dims;
 
     enum ggml_type type;
 
diff --git a/ggml.h b/ggml.h
index 3fe95ed57..261792bdf 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2412,9 +2412,9 @@ extern "C" {
 
     typedef struct {
         const char * type_name;
+        bool is_quantized;
        int blck_size;
        size_t type_size;
-        bool is_quantized;
        ggml_to_float_t to_float;
        ggml_from_float_t from_float;
        ggml_from_float_t from_float_reference;
diff --git a/llama.cpp b/llama.cpp
index e91ad7285..e1fa10b5c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11428,13 +11428,13 @@ static int llama_decode_internal(
     for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
         const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
         llama_batch u_batch = {
-            /* .n_tokens   = */ (int32_t) n_tokens,
             /* .token      = */ batch_all.token    ? batch_all.token    + cur_token        : nullptr,
             /* .embd       = */ batch_all.embd     ? batch_all.embd     + cur_token*n_embd : nullptr,
             /* .pos        = */ batch_all.pos      ? batch_all.pos      + cur_token        : nullptr,
             /* .n_seq_id   = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token        : nullptr,
             /* .seq_id     = */ batch_all.seq_id   ? batch_all.seq_id   + cur_token        : nullptr,
             /* .logits     = */ batch_all.logits   ? batch_all.logits   + cur_token        : nullptr,
+            /* .n_tokens   = */ (int32_t) n_tokens,
             /* .all_pos_0  = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
             /* .all_pos_1  = */ batch_all.all_pos_1,
             /* .all_seq_id = */ batch_all.all_seq_id,
@@ -15310,13 +15310,13 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
-        /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu                    =*/ 0,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
@@ -17293,13 +17293,13 @@ struct llama_batch llama_batch_get_one(
        llama_pos pos_0,
        llama_seq_id seq_id) {
     return {
-        /*n_tokens   =*/ n_tokens,
        /*tokens     =*/ tokens,
        /*embd       =*/ nullptr,
        /*pos        =*/ nullptr,
        /*n_seq_id   =*/ nullptr,
        /*seq_id     =*/ nullptr,
        /*logits     =*/ nullptr,
+        /*n_tokens   =*/ n_tokens,
        /*all_pos_0  =*/ pos_0,
        /*all_pos_1  =*/ 1,
        /*all_seq_id =*/ seq_id,
@@ -17307,7 +17307,7 @@
 }
 
 struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, };
 
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
diff --git a/llama.h b/llama.h
index 0b2e708d0..502319eec 100644
--- a/llama.h
+++ b/llama.h
@@ -190,8 +190,6 @@ extern "C" {
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
-        int32_t n_tokens;
-
         llama_token * token;
         float * embd;
         llama_pos * pos;
@@ -199,6 +197,7 @@ extern "C" {
         llama_seq_id ** seq_id;
         int8_t * logits; // TODO: rename this to "output"
+        int32_t n_tokens;
 
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         //       for future-proof code, use the above fields instead and ignore everything below
         //
@@ -230,15 +229,6 @@
     };
 
     struct llama_model_params {
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
-
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
@@ -253,6 +243,15 @@
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;
 
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible
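
Note for downstream code: after this change the llama_batch pointer members come first and n_tokens sits after logits, so any positional aggregate initializer has to be reordered exactly as the call sites above are. The sketch below is illustrative only (make_batch_view is a hypothetical helper, not part of the patch); it assumes the reordered llama_batch from this diff and leaves the legacy all_* fields at zero because pos/seq_id are supplied per token.

    #include "llama.h"

    // Hypothetical helper (not part of this patch): build a view over a
    // sub-range of an existing batch, following the new member order:
    // token, embd, pos, n_seq_id, seq_id, logits, n_tokens, all_pos_0, all_pos_1, all_seq_id.
    static llama_batch make_batch_view(const llama_batch & batch, int32_t i, int32_t n_tokens) {
        return {
            /* .token      = */ batch.token    ? batch.token    + i : nullptr,
            /* .embd       = */ nullptr,       // token-based view, no embeddings
            /* .pos        = */ batch.pos      ? batch.pos      + i : nullptr,
            /* .n_seq_id   = */ batch.n_seq_id ? batch.n_seq_id + i : nullptr,
            /* .seq_id     = */ batch.seq_id   ? batch.seq_id   + i : nullptr,
            /* .logits     = */ batch.logits   ? batch.logits   + i : nullptr,
            /* .n_tokens   = */ n_tokens,      // now comes after the pointer members
            /* .all_pos_0  = */ 0,
            /* .all_pos_1  = */ 0,
            /* .all_seq_id = */ 0,             // unused when pos/seq_id are set
        };
    }

Designated initializers would make such call sites immune to member reordering, but they are a C99/C++20 feature; the comment-annotated positional style above presumably stays to keep the code within its current language standard.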