ggml, llama: align structs for memory optimization on 64-bit platforms

- ggml_type_traits_t (80 -> 72 bytes)
- llama_batch (72 -> 64 bytes)
- llama_model_params (56 -> 48 bytes)
- hash_node (32 -> 24 bytes)
- ggml_compute_state (32 -> 24 bytes)
- gguf_tensor_info (88 -> 80 bytes)
Herman Semenov 2024-05-13 18:38:48 -05:00
parent b228aba91a
commit 2a9a84be7d
10 changed files with 26 additions and 27 deletions
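Every hunk below applies the same rule: on 64-bit targets, order struct fields so that narrow members fill the padding slots that wider, stricter-aligned members would otherwise force. As a worked example, here is a minimal sketch of the llama_batch case from the summary (72 -> 64 bytes), assuming an LP64 ABI with 8-byte pointers. batch_before and batch_after are illustrative stand-ins, not llama.cpp types, with llama_token, llama_pos and llama_seq_id spelled out as the int32_t they are typedef'd to in llama.h.

    #include <cstdint>
    #include <cstdio>

    struct batch_before {              // old field order
        int32_t   n_tokens;            // 4 bytes + 4 padding (next member needs 8-byte alignment)
        int32_t  *token;               // six pointers, 48 bytes
        float    *embd;
        int32_t  *pos;
        int32_t  *n_seq_id;
        int32_t **seq_id;
        int8_t   *logits;
        int32_t   all_pos_0;           // 3 x 4 = 12 bytes, padded to 16
        int32_t   all_pos_1;
        int32_t   all_seq_id;
    };                                 // 8 + 48 + 16 = 72 bytes

    struct batch_after {               // new field order: pointers first
        int32_t  *token;
        float    *embd;
        int32_t  *pos;
        int32_t  *n_seq_id;
        int32_t **seq_id;
        int8_t   *logits;
        int32_t   n_tokens;            // packs with the three ints below: 4 x 4 = 16
        int32_t   all_pos_0;
        int32_t   all_pos_1;
        int32_t   all_seq_id;
    };                                 // 48 + 16 = 64 bytes

    int main() {
        printf("%zu -> %zu\n", sizeof(batch_before), sizeof(batch_after)); // 72 -> 64 on LP64
    }

Because llama_batch is brace-initialized positionally at many call sites, the reorder also forces every such initializer to move n_tokens from the first slot to the seventh, which is why most of the hunks below touch example and server code rather than the struct itself.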

@@ -140,13 +140,13 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token    + i,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, int32_t(n_eval), *n_past, 1, 0, };
         if (llama_decode(ctx_llama, batch)) {
             LOG_TEE("%s : failed to eval\n", __func__);
             return false;

@@ -301,13 +301,13 @@ int main(int argc, char ** argv) {
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token    + i,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -691,13 +691,13 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
         const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

         llama_batch batch_view = {
-            n_tokens,
             batch.token    + i,
             nullptr,
             batch.pos      + i,
             batch.n_seq_id + i,
             batch.seq_id   + i,
             batch.logits   + i,
+            n_tokens,
             0, 0, 0, // unused
         };

@@ -1072,13 +1072,13 @@ struct server_context {
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);

             llama_batch batch_view = {
-                n_tokens,
                 batch.token    + i,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
+                n_tokens,
                 0, 0, 0, // unused
             };
@@ -2195,13 +2195,13 @@ struct server_context {
             }

             llama_batch batch_view = {
-                n_tokens,
                 batch.token    + i,
                 nullptr,
                 batch.pos      + i,
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
+                n_tokens,
                 0, 0, 0, // unused
             };

@@ -334,8 +334,8 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
     bool allocated;
+    size_t offset; // offset within the buffer
 };

 struct tensor_alloc {
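The arithmetic behind hash_node's 32 -> 24 bytes from the summary, assuming 4-byte int and 8-byte size_t: in the old order the three ints take 12 bytes, 4 bytes of padding follow so the 8-byte offset can start on an 8-byte boundary, and the trailing bool rounds 25 bytes up to 32. In the new order, allocated drops into the padding right after buffer_id (12 + 1 + 3 = 16) and offset finishes the struct at 24.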

ggml.c (4 changes)

@@ -19149,8 +19149,8 @@ struct ggml_compute_state_shared {
 struct ggml_compute_state {
     ggml_thread_t thrd;
     int ith;
-    struct ggml_compute_state_shared * shared;
     enum ggml_status ec;
+    struct ggml_compute_state_shared * shared;
 };

 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
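Same arithmetic for ggml_compute_state's 32 -> 24, assuming ggml_thread_t (pthread_t on POSIX builds) is 8 bytes: the old order pads 4 bytes after ith before the 8-byte shared pointer and another 4 after the 4-byte ec, while the new order packs ith and ec into a single 8-byte slot (8 + 4 + 4 + 8 = 24).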
@@ -21706,8 +21706,8 @@ struct gguf_header {
 struct gguf_tensor_info {
     struct gguf_str name;
-    uint32_t n_dims;
     uint64_t ne[GGML_MAX_DIMS];
+    uint32_t n_dims;
     enum ggml_type type;
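For gguf_tensor_info's 88 -> 80: gguf_str is a 16-byte {uint64_t, char *} pair and GGML_MAX_DIMS is 4, so ne occupies 32 bytes with 8-byte alignment. The old order needed 4 padding bytes after the 4-byte n_dims to align ne, and 4 more after the 4-byte type enum before the 8-byte-aligned fields that follow; the new order pairs n_dims and type into one 8-byte slot after ne, eliminating both pads.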

ggml.h (2 changes)

@@ -2412,9 +2412,9 @@ extern "C" {
     typedef struct {
         const char      * type_name;
+        bool              is_quantized;
         int               blck_size;
         size_t            type_size;
-        bool              is_quantized;
         ggml_to_float_t   to_float;
         ggml_from_float_t from_float;
         ggml_from_float_t from_float_reference;
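And for ggml_type_traits_t's 80 -> 72: with is_quantized after the 8-byte type_size, the lone bool needed 7 padding bytes before the first 8-byte function pointer, and blck_size left another 4 before type_size. Moved up next to type_name, the bool and the 4-byte blck_size share a single 8-byte slot (1 + 3 + 4), so type_size lands aligned with no gap.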

@@ -11428,13 +11428,13 @@ static int llama_decode_internal(
     for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) {
         const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
         llama_batch u_batch = {
-            /* .n_tokens   = */ (int32_t) n_tokens,
             /* .token      = */ batch_all.token    ? batch_all.token    + cur_token        : nullptr,
             /* .embd       = */ batch_all.embd     ? batch_all.embd     + cur_token*n_embd : nullptr,
             /* .pos        = */ batch_all.pos      ? batch_all.pos      + cur_token        : nullptr,
             /* .n_seq_id   = */ batch_all.n_seq_id ? batch_all.n_seq_id + cur_token        : nullptr,
             /* .seq_id     = */ batch_all.seq_id   ? batch_all.seq_id   + cur_token        : nullptr,
             /* .logits     = */ batch_all.logits   ? batch_all.logits   + cur_token        : nullptr,
+            /* .n_tokens   = */ (int32_t) n_tokens,
             /* .all_pos_0  = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
             /* .all_pos_1  = */ batch_all.all_pos_1,
             /* .all_seq_id = */ batch_all.all_seq_id,
@@ -15310,13 +15310,13 @@ static int llama_apply_lora_from_file_internal(
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
-        /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
-        /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides                =*/ nullptr,
+        /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu                    =*/ 0,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
@@ -17293,13 +17293,13 @@ struct llama_batch llama_batch_get_one(
         llama_pos    pos_0,
         llama_seq_id seq_id) {
     return {
-        /*n_tokens   =*/ n_tokens,
         /*tokens     =*/ tokens,
         /*embd       =*/ nullptr,
         /*pos        =*/ nullptr,
         /*n_seq_id   =*/ nullptr,
         /*seq_id     =*/ nullptr,
         /*logits     =*/ nullptr,
+        /*n_tokens   =*/ n_tokens,
         /*all_pos_0  =*/ pos_0,
         /*all_pos_1  =*/ 1,
         /*all_seq_id =*/ seq_id,
@@ -17307,7 +17307,7 @@ struct llama_batch llama_batch_get_one(
 }

 struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
-    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
+    llama_batch batch = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);

llama.h (21 changes)

@@ -190,8 +190,6 @@ extern "C" {
     //   - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
     //
     typedef struct llama_batch {
-        int32_t n_tokens;
-
         llama_token  *  token;
         float        *  embd;
         llama_pos    *  pos;
@@ -199,6 +197,7 @@ extern "C" {
         llama_seq_id ** seq_id;
        int8_t       *  logits; // TODO: rename this to "output"
+        int32_t n_tokens;

         // NOTE: helpers for smooth API transition - can be deprecated in the future
         //       for future-proof code, use the above fields instead and ignore everything below
         //
@@ -230,15 +229,6 @@ extern "C" {
     };

     struct llama_model_params {
-        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
-
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
@@ -253,6 +243,15 @@ extern "C" {
         // override key-value pairs of the model meta data
         const struct llama_model_kv_override * kv_overrides;

+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only;  // only load the vocabulary, no weights
         bool use_mmap;    // use mmap if possible
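Finally, llama_model_params' 56 -> 48, again on LP64: the old order spends 12 bytes on n_gpu_layers, split_mode and main_gpu plus 4 bytes of padding before the four 8-byte pointer fields, and the trailing bool flags round the total up to 56. With the pointers first (32 bytes), the three 4-byte fields and the bools pack into the final 16 bytes, for 48.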