Measure max compute size for each cgraph eval order and use the best order

This can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8 MB down to 4627.6 MB
This commit is contained in:
xaedes 2023-09-09 21:00:25 +02:00
parent 917d2870b4
commit ace90884a6
No known key found for this signature in database
GPG key ID: 30030EDD817EA2B1

View file

@ -2721,7 +2721,7 @@ int main(int argc, char ** argv) {
NULL, // mem_buffer NULL, // mem_buffer
true, // no_alloc true, // no_alloc
}; };
struct ggml_context * ctx_compute = ggml_init(ctx_compute_params); struct ggml_context * ctx_compute = NULL;
struct ggml_tensor * loss = NULL; struct ggml_tensor * loss = NULL;
struct ggml_tensor * logits = NULL; struct ggml_tensor * logits = NULL;
@ -2731,8 +2731,14 @@ int main(int argc, char ** argv) {
struct ggml_cgraph * gb_tmp = NULL; struct ggml_cgraph * gb_tmp = NULL;
// measure required memory for compute tensors // measure required memory for compute tensors
size_t best_compute_size = SIZE_MAX;
enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
// find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new_measure(tensor_alignment); alloc = ggml_allocr_new_measure(tensor_alignment);
gf = ggml_new_graph(ctx_compute); gf = ggml_new_graph(ctx_compute);
gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph(ctx_compute); gb = ggml_new_graph(ctx_compute);
gb_tmp = params.use_checkpointing gb_tmp = params.use_checkpointing
? ggml_new_graph(ctx_compute) ? ggml_new_graph(ctx_compute)
@ -2746,17 +2752,26 @@ int main(int argc, char ** argv) {
params.use_checkpointing params.use_checkpointing
); );
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
if (max_compute_size < best_compute_size) {
best_compute_size = max_compute_size;
best_order = gf->order;
}
ggml_allocr_free(alloc); ggml_allocr_free(alloc);
printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
// reset compute context
ggml_free(ctx_compute); ggml_free(ctx_compute);
ctx_compute = ggml_init(ctx_compute_params); }
size_t max_compute_size = best_compute_size;
printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
printf("%s: evaluation order = %s\n", __func__,
(best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
(best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
"invalid");
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size); mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
gf = ggml_new_graph(ctx_compute); gf = ggml_new_graph(ctx_compute);
gf->order = best_order;
gb = ggml_new_graph(ctx_compute); gb = ggml_new_graph(ctx_compute);
gb_tmp = params.use_checkpointing gb_tmp = params.use_checkpointing
? ggml_new_graph(ctx_compute) ? ggml_new_graph(ctx_compute)