diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 4d3f47fb4..3547fca02 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -2721,7 +2721,7 @@ int main(int argc, char ** argv) {
         NULL, // mem_buffer
         true, // no_alloc
     };
-    struct ggml_context * ctx_compute = ggml_init(ctx_compute_params);
+    struct ggml_context * ctx_compute = NULL;
 
     struct ggml_tensor * loss   = NULL;
     struct ggml_tensor * logits = NULL;
@@ -2731,32 +2731,47 @@ int main(int argc, char ** argv) {
     struct ggml_cgraph * gb_tmp = NULL;
 
     // measure required memory for compute tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
-    gf = ggml_new_graph(ctx_compute);
-    gb = ggml_new_graph(ctx_compute);
-    gb_tmp = params.use_checkpointing
-        ? ggml_new_graph(ctx_compute)
-        : NULL;
-    loss = llama_build_lora_finetune_graphs(
-        &model, &lora, alloc, ctx_compute,
-        gf, gb, gb_tmp,
-        &logits, tokens_input, target_probs,
-        n_tokens, n_batch,
-        params.use_flash,
-        params.use_checkpointing
-    );
-    size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
-    ggml_allocr_free(alloc);
+    size_t best_compute_size = SIZE_MAX;
+    enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT;
+    // find best evaluation order
+    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
+        ctx_compute = ggml_init(ctx_compute_params);
+        alloc = ggml_allocr_new_measure(tensor_alignment);
+        gf = ggml_new_graph(ctx_compute);
+        gf->order = (enum ggml_cgraph_eval_order) order;
+        gb = ggml_new_graph(ctx_compute);
+        gb_tmp = params.use_checkpointing
+            ? ggml_new_graph(ctx_compute)
+            : NULL;
+        loss = llama_build_lora_finetune_graphs(
+            &model, &lora, alloc, ctx_compute,
+            gf, gb, gb_tmp,
+            &logits, tokens_input, target_probs,
+            n_tokens, n_batch,
+            params.use_flash,
+            params.use_checkpointing
+        );
+        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        if (max_compute_size < best_compute_size) {
+            best_compute_size = max_compute_size;
+            best_order = gf->order;
+        }
+        ggml_allocr_free(alloc);
+        ggml_free(ctx_compute);
+    }
+    size_t max_compute_size = best_compute_size;
     printf("%s: max_compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f));
-
-    // reset compute context
-    ggml_free(ctx_compute);
-    ctx_compute = ggml_init(ctx_compute_params);
+    printf("%s: evaluation order = %s\n", __func__,
+        (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" :
+        (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" :
+        "invalid");
 
     // allocate compute tensors
     mem_compute_data.resize(max_compute_size);
+    ctx_compute = ggml_init(ctx_compute_params);
    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
     gf = ggml_new_graph(ctx_compute);
+    gf->order = best_order;
     gb = ggml_new_graph(ctx_compute);
     gb_tmp = params.use_checkpointing
         ? ggml_new_graph(ctx_compute)
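
The control flow of the patch is a generic measure-then-allocate pattern: build the graphs once per candidate evaluation order against a measuring allocator, keep the order with the smallest peak size, then do the single real allocation with that order. Below is a minimal standalone C++ sketch of that pattern. It is not ggml code: measure_compute_size() is a hypothetical stand-in for running llama_build_lora_finetune_graphs() against ggml_allocr_new_measure() and reading back ggml_allocr_max_size(), and its stub byte counts are invented for illustration; only the enumerator names mirror the GGML_CGRAPH_EVAL_ORDER_* values quoted in the diff.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors ggml_cgraph_eval_order: two candidate orders plus a count sentinel.
enum class EvalOrder { LeftToRight, RightToLeft, Count };

// Hypothetical stand-in: build the graph in the given order and report the
// peak buffer size a measuring allocator would require for it.
static size_t measure_compute_size(EvalOrder order) {
    return order == EvalOrder::LeftToRight ? 512u * 1024 : 384u * 1024; // stub values
}

int main() {
    size_t    best_size  = SIZE_MAX;
    EvalOrder best_order = EvalOrder::Count; // "invalid" until a measurement wins
    // Measure every candidate order, keep the cheapest.
    for (unsigned o = 0; o < (unsigned) EvalOrder::Count; ++o) {
        const size_t sz = measure_compute_size((EvalOrder) o);
        if (sz < best_size) {
            best_size  = sz;
            best_order = (EvalOrder) o;
        }
    }
    // Allocate once with the winning size, as the patch does via
    // mem_compute_data.resize(max_compute_size) + ggml_allocr_new().
    std::vector<uint8_t> mem_compute_data(best_size);
    std::printf("evaluation order = %s, max_compute_size = %zu bytes\n",
                best_order == EvalOrder::LeftToRight ? "LEFT_TO_RIGHT" : "RIGHT_TO_LEFT",
                mem_compute_data.size());
    return 0;
}

The sketch also shows why the patch re-creates ctx_compute inside the loop: each measurement builds a fresh set of graphs, so the context (and measuring allocator) must be freed and re-initialized per iteration, with the real context created only once afterwards.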