Measure max compute size for each cgraph eval order and use the best order

This can bring huge memory savings:
e.g. codellama-34b with n_ctx=64, n_batch=1 goes from 92927.8 MB down to 4627.6 MB
This commit is contained in:
xaedes 2023-09-09 21:00:25 +02:00
parent 917d2870b4
commit ace90884a6
No known key found for this signature in database
GPG key ID: 30030EDD817EA2B1

View file

@ -2721,7 +2721,7 @@ int main(int argc, char ** argv) {
NULL, // mem_buffer NULL, // mem_buffer
true, // no_alloc true, // no_alloc
}; };
struct ggml_context * ctx_compute = ggml_init(ctx_compute_params); struct ggml_context * ctx_compute = NULL;
struct ggml_tensor * loss = NULL; struct ggml_tensor * loss = NULL;
struct ggml_tensor * logits = NULL; struct ggml_tensor * logits = NULL;
@ -2731,8 +2731,14 @@ int main(int argc, char ** argv) {
struct ggml_cgraph * gb_tmp = NULL; struct ggml_cgraph * gb_tmp = NULL;
// measure required memory for compute tensors // measure required memory for compute tensors
size_t best_compute_size = SIZE_MAX;
enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
// find best evaluation order
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new_measure(tensor_alignment); alloc = ggml_allocr_new_measure(tensor_alignment);
gf = ggml_new_graph(ctx_compute); gf = ggml_new_graph(ctx_compute);
gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph(ctx_compute); gb = ggml_new_graph(ctx_compute);
gb_tmp = params.use_checkpointing gb_tmp = params.use_checkpointing
? ggml_new_graph(ctx_compute) ? ggml_new_graph(ctx_compute)
@ -2746,17 +2752,26 @@ int main(int argc, char ** argv) {
params.use_checkpointing params.use_checkpointing
); );
size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
if (max_compute_size < best_compute_size) {
best_compute_size = max_compute_size;
best_order = gf->order;
}
ggml_allocr_free(alloc); ggml_allocr_free(alloc);
printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
// reset compute context
ggml_free(ctx_compute); ggml_free(ctx_compute);
ctx_compute = ggml_init(ctx_compute_params); }
size_t max_compute_size = best_compute_size;
printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
printf("%s: evaluation order = %s\n", __func__,
(best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
(best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
"invalid");
// allocate compute tensors // allocate compute tensors
mem_compute_data.resize(max_compute_size); mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
gf = ggml_new_graph(ctx_compute); gf = ggml_new_graph(ctx_compute);
gf->order = best_order;
gb = ggml_new_graph(ctx_compute); gb = ggml_new_graph(ctx_compute);
gb_tmp = params.use_checkpointing gb_tmp = params.use_checkpointing
? ggml_new_graph(ctx_compute) ? ggml_new_graph(ctx_compute)