ggml llama: align structs for memory optimization on 64-bit platforms:

- ggml_type_traits_t (80 -> 72 bytes)
- llama_batch        (72 -> 64 bytes)
- llama_model_params (56 -> 48 bytes)
- hash_node          (32 -> 24 bytes)
- ggml_compute_state (32 -> 24 bytes)
- gguf_tensor_info   (88 -> 80 bytes)
parent b228aba91a
commit 2a9a84be7d

10 changed files with 26 additions and 27 deletions
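All of the savings come from field order alone: on a typical 64-bit (LP64) target, an 8-byte-aligned member such as a pointer or size_t forces the compiler to insert padding when it follows smaller fields. A minimal sketch, not part of this commit, that mirrors the old and new hash_node layouts from the diff below and prints the 32 -> 24 byte difference:

    // Sketch only (not from this commit): layouts mirror hash_node before/after the change.
    #include <cstddef>
    #include <cstdio>

    struct hash_node_before {
        int         n_children;  // 4 bytes
        int         n_views;     // 4 bytes
        int         buffer_id;   // 4 bytes + 4 bytes padding so offset is 8-byte aligned
        std::size_t offset;      // 8 bytes
        bool        allocated;   // 1 byte  + 7 bytes tail padding -> typically 32 bytes
    };

    struct hash_node_after {
        int         n_children;  // 4 bytes
        int         n_views;     // 4 bytes
        int         buffer_id;   // 4 bytes
        bool        allocated;   // 1 byte  + 3 bytes padding
        std::size_t offset;      // 8 bytes                       -> typically 24 bytes
    };

    int main() {
        std::printf("before: %zu bytes\n", sizeof(hash_node_before)); // typically 32
        std::printf("after:  %zu bytes\n", sizeof(hash_node_after));  // typically 24
        return 0;
    }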
@@ -140,13 +140,13 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token + i,
             nullptr,
             batch.pos + i,
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, int32_t(n_eval), *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
             LOG_TEE("%s : failed to eval\n", __func__);
             return false;

@@ -301,13 +301,13 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token + i,
             nullptr,
             batch.pos + i,
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -691,13 +691,13 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token + i,
             nullptr,
             batch.pos + i,
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -1072,13 +1072,13 @@ struct server_context {
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
             llama_batch batch_view = {
-                n_tokens,
                 batch.token + i,
                 nullptr,
                 batch.pos + i,
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
+                n_tokens,
                 0, 0, 0, // unused
             };

@@ -2195,13 +2195,13 @@ struct server_context {
             }

             llama_batch batch_view = {
-                n_tokens,
                 batch.token + i,
                 nullptr,
                 batch.pos + i,
                 batch.n_seq_id + i,
                 batch.seq_id + i,
                 batch.logits + i,
+                n_tokens,
                 0, 0, 0, // unused
             };

@@ -334,8 +334,8 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
+    size_t offset; // offset within the buffer
 };

 struct tensor_alloc {

ggml.c (4 changed lines)
@@ -19149,8 +19149,8 @@ struct ggml_compute_state_shared {
 struct ggml_compute_state {
     ggml_thread_t thrd;
     int ith;
-    struct ggml_compute_state_shared * shared;
     enum ggml_status ec;
+    struct ggml_compute_state_shared * shared;
 };

 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {

@@ -21706,8 +21706,8 @@ struct gguf_header {
 struct gguf_tensor_info {
     struct gguf_str name;

-    uint32_t n_dims;
     uint64_t ne[GGML_MAX_DIMS];
+    uint32_t n_dims;

     enum ggml_type type;

ggml.h (2 changed lines)
@@ -2412,9 +2412,9 @@ extern "C" {

     typedef struct {
         const char * type_name;
+        bool is_quantized;
         int blck_size;
         size_t type_size;
-        bool is_quantized;
         ggml_to_float_t to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;

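The typedef is public, but consumers read it by field name, so moving is_quantized next to type_name is transparent to them. A hedged sketch, assuming the ggml_internal_get_type_traits() accessor that ggml.h exposes for these traits (treat the exact accessor name as an assumption):

    // Sketch only: the traits struct is read by field name, so the reorder
    // above does not affect callers like this one.
    #include <cstdio>
    #include "ggml.h"

    static void print_q4_0_traits() {
        const ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
        std::printf("%s: blck_size=%d type_size=%zu is_quantized=%d\n",
                    traits.type_name, traits.blck_size, traits.type_size, (int) traits.is_quantized);
    }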
llama.cpp (12 changed lines)
@@ -11428,13 +11428,13 @@ static int llama_decode_internal(
     for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
         const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
         llama_batch u_batch = {
-            /* .n_tokens = */ (int32_t) n_tokens,
             /* .token = */ batch_all.token ? batch_all.token + cur_token : nullptr,
             /* .embd = */ batch_all.embd ? batch_all.embd + cur_token*n_embd : nullptr,
             /* .pos = */ batch_all.pos ? batch_all.pos + cur_token : nullptr,
             /* .n_seq_id = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token : nullptr,
             /* .seq_id = */ batch_all.seq_id ? batch_all.seq_id + cur_token : nullptr,
             /* .logits = */ batch_all.logits ? batch_all.logits + cur_token : nullptr,
+            /* .n_tokens = */ (int32_t) n_tokens,
             /* .all_pos_0 = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
             /* .all_pos_1 = */ batch_all.all_pos_1,
             /* .all_seq_id = */ batch_all.all_seq_id,

@@ -15310,13 +15310,13 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        /*.n_gpu_layers =*/ 0,
-        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
-        /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
+        /*.n_gpu_layers =*/ 0,
+        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu =*/ 0,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,

@@ -17293,13 +17293,13 @@ struct llama_batch llama_batch_get_one(
              llama_pos pos_0,
           llama_seq_id seq_id) {
     return {
-        /*n_tokens =*/ n_tokens,
         /*tokens =*/ tokens,
         /*embd =*/ nullptr,
         /*pos =*/ nullptr,
         /*n_seq_id =*/ nullptr,
         /*seq_id =*/ nullptr,
         /*logits =*/ nullptr,
+        /*n_tokens =*/ n_tokens,
         /*all_pos_0 =*/ pos_0,
         /*all_pos_1 =*/ 1,
         /*all_seq_id =*/ seq_id,

@@ -17307,7 +17307,7 @@ struct llama_batch llama_batch_get_one(
 }

 struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);

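The positional brace-initializers updated throughout this commit are the only call sites that have to change; code that builds batches through these helpers is untouched by the move of n_tokens. A minimal sketch with a hypothetical wrapper function, assuming an existing llama_context and an already tokenized prompt:

    // Sketch only: llama_batch_get_one() fills the struct internally, so the
    // new field order never leaks into user code.
    #include "llama.h"

    static bool eval_prompt(llama_context * ctx, llama_token * tokens, int32_t n_tokens) {
        // one sequence (id 0), starting at position 0
        llama_batch batch = llama_batch_get_one(tokens, n_tokens, 0, 0);
        return llama_decode(ctx, batch) == 0;
    }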
llama.h (21 changed lines)
@@ -190,8 +190,6 @@ extern "C" {
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
-        int32_t n_tokens;
-
         llama_token * token;
         float * embd;
         llama_pos * pos;

@@ -199,6 +197,7 @@ extern "C" {
         llama_seq_id ** seq_id;
         int8_t * logits; // TODO: rename this to "output"

+        int32_t n_tokens;
         // NOTE: helpers for smooth API transition - can be deprecated in the future
         // for future-proof code, use the above fields instead and ignore everything below
         //

@@ -230,15 +229,6 @@ extern "C" {
     };

     struct llama_model_params {
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
-
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;

@@ -253,6 +243,15 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;

+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap; // use mmap if possible

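For the public llama_model_params struct the reorder stays source-compatible as long as callers start from llama_model_default_params() and assign fields by name, which is what the bundled examples do. A hedged sketch with a hypothetical helper (llama_load_model_from_file() is the loader declared in llama.h of this vintage):

    // Sketch only: named assignment keeps user code independent of the new layout.
    #include "llama.h"

    static llama_model * load_model_gpu(const char * path) {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 99;   // hypothetical: offload up to 99 layers
        mparams.use_mmap     = true;
        return llama_load_model_from_file(path, mparams);
    }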