measure max compute size for each cgraph eval order and use best order

this can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8 MB down to 4627.6 MB
xaedes 2023-09-09 21:00:25 +02:00
parent 917d2870b4
commit ace90884a6

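For orientation, here is a minimal standalone sketch of the pattern the diff below implements: build the compute graph once per candidate evaluation order inside a measure allocator, record the peak compute size, and keep the cheapest order. It assumes the ggml measure-allocator API used in the diff (ggml_allocr_new_measure, ggml_allocr_max_size, ggml_allocr_free); build_graph is a hypothetical callback standing in for llama_build_lora_finetune_graphs and is not part of the commit.

// Sketch only (not the commit itself): try each graph evaluation order in a
// measure allocator and keep the order with the smallest peak compute size.
#include "ggml.h"
#include "ggml-alloc.h"
#include <cstdint> // SIZE_MAX

// hypothetical callback; the real code calls llama_build_lora_finetune_graphs here
typedef void (*build_graph_fn)(struct ggml_context * ctx, struct ggml_allocr * alloc, struct ggml_cgraph * gf);

static enum ggml_cgraph_eval_order find_best_eval_order(
        struct ggml_init_params ctx_params,
        size_t                  tensor_alignment,
        build_graph_fn          build_graph,
        size_t                * best_compute_size) {
    *best_compute_size = SIZE_MAX;
    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        // fresh no-alloc context and measure allocator for each candidate order
        struct ggml_context * ctx   = ggml_init(ctx_params);
        struct ggml_allocr  * alloc = ggml_allocr_new_measure(tensor_alignment);
        struct ggml_cgraph  * gf    = ggml_new_graph(ctx);
        gf->order = (enum ggml_cgraph_eval_order) order;
        build_graph(ctx, alloc, gf); // builds the graph and measure-allocates its tensors
        size_t sz = ggml_allocr_max_size(alloc) + tensor_alignment;
        if (sz < *best_compute_size) {
            *best_compute_size = sz;
            best_order         = gf->order;
        }
        ggml_allocr_free(alloc);
        ggml_free(ctx);
    }
    return best_order;
}

The order matters because, with a single arena allocator, the graph traversal order determines how long intermediate tensors stay live and thus the peak buffer size; the real code then rebuilds the graphs with gf->order = best_order against a real allocator of the measured size, as the diff shows.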

@@ -2721,7 +2721,7 @@ int main(int argc, char ** argv) {
         NULL, // mem_buffer
         true, // no_alloc
     };
-    struct ggml_context * ctx_compute = ggml_init(ctx_compute_params);
+    struct ggml_context * ctx_compute = NULL;
     struct ggml_tensor * loss = NULL;
     struct ggml_tensor * logits = NULL;
@@ -2731,32 +2731,47 @@ int main(int argc, char ** argv) {
     struct ggml_cgraph * gb_tmp = NULL;
     // measure required memory for compute tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    gf = ggml_new_graph(ctx_compute);
-    gb = ggml_new_graph(ctx_compute);
-    gb_tmp = params.use_checkpointing
-        ? ggml_new_graph(ctx_compute)
-        : NULL;
-    loss = llama_build_lora_finetune_graphs(
-        &model, &lora, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.use_flash,
-        params.use_checkpointing
-    );
-    size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t best_compute_size = SIZE_MAX;
+    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
+    // find best evaluation order
+    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
+        ctx_compute = ggml_init(ctx_compute_params);
+        alloc = ggml_allocr_new_measure(tensor_alignment);
+        gf = ggml_new_graph(ctx_compute);
+        gf->order = (enum ggml_cgraph_eval_order) order;
+        gb = ggml_new_graph(ctx_compute);
+        gb_tmp = params.use_checkpointing
+            ? ggml_new_graph(ctx_compute)
+            : NULL;
+        loss = llama_build_lora_finetune_graphs(
+            &model, &lora, alloc, ctx_compute,
+            gf, gb, gb_tmp,
+            &logits, tokens_input, target_probs,
+            n_tokens, n_batch,
+            params.use_flash,
+            params.use_checkpointing
+        );
+        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        if (max_compute_size < best_compute_size) {
+            best_compute_size = max_compute_size;
+            best_order = gf->order;
+        }
+        ggml_allocr_free(alloc);
+        ggml_free(ctx_compute);
+    }
+    size_t max_compute_size = best_compute_size;
     printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-    // reset compute context
-    ggml_free(ctx_compute);
-    ctx_compute = ggml_init(ctx_compute_params);
+    printf("%s: evaluation order = %s\n", __func__,
+        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
+        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
+            "invalid");
     // allocate compute tensors
     mem_compute_data.resize(max_compute_size);
+    ctx_compute = ggml_init(ctx_compute_params);
     alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
     gf = ggml_new_graph(ctx_compute);
+    gf->order = best_order;
     gb = ggml_new_graph(ctx_compute);
     gb_tmp = params.use_checkpointing
         ? ggml_new_graph(ctx_compute)