metal: only encode in one command buffer

It's advised that a program should only have one command buffer. This slows inference by ~1 ms on a 33B model, but we may be able to avoid that slowdown by reusing the previous command queue.
2023-07-21 11:17:48 -04:00 · 2023-07-21 11:17:48 -04:00 · c8e6ef1846
commit c8e6ef1846
parent d924522a46
1 changed files with 517 additions and 548 deletions
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -358,39 +358,15 @@ void ggml_metal_graph_compute(
               struct ggml_cgraph * gf) {
    metal_printf("%s: evaluating graph\n", __func__);

-    // create multiple command buffers and enqueue them
-    // then, we encode the graph into the command buffers in parallel
+    id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];

-    const int n_cb = ctx->n_cb;
-
-    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
-
-    for (int i = 0; i < n_cb; ++i) {
-        command_buffers[i] = [ctx->queue commandBuffer];
-
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffers[i] enqueue];
-    }
-
-    // TODO: is this the best way to start threads?
-    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
-
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
-
-        dispatch_async(queue, ^{
    size_t offs_src0 = 0;
    size_t offs_src1 = 0;
    size_t offs_dst  = 0;

-            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
-
    id<MTLComputeCommandEncoder> encoder = nil;

-            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
-
-            for (int i = node_start; i < node_end; ++i) {
+    for (int i = 0; i < gf->n_nodes; ++i) {
        metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

        struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@ -980,21 +956,14 @@ void ggml_metal_graph_compute(
    }

    [command_buffer commit];
-        });
-    }

-    // wait for all threads to finish
-    dispatch_barrier_sync(queue, ^{});
-
-    [command_buffers[n_cb - 1] waitUntilCompleted];
+    [command_buffer waitUntilCompleted];

    // check status of command buffers
    // needed to detect if the device ran out-of-memory for example (#1881)
-    for (int i = 0; i < n_cb; i++) {
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+    MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffer status];
    if (status != MTLCommandBufferStatusCompleted) {
-            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+        fprintf(stderr, "%s: command buffer failed with status %lu\n", __func__, status);
        GGML_ASSERT(false);
    }
 }
-}