metal: only encode in one command buffer

It's advised that a program should only have one command buffer. This slows
inference by ~1 ms on a 33B model, but we may avoid that by reusing the
previous command queue.
This commit is contained in:
lshzh-ww 2023-07-21 11:17:48 -04:00
parent d924522a46
commit c8e6ef1846

View file

@ -358,39 +358,15 @@ void ggml_metal_graph_compute(
struct ggml_cgraph * gf) {
metal_printf("%s: evaluating graph\n", __func__);
// create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
const int n_cb = ctx->n_cb;
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
for (int i = 0; i < n_cb; ++i) {
command_buffers[i] = [ctx->queue commandBuffer];
// enqueue the command buffers in order to specify their execution order
[command_buffers[i] enqueue];
}
// TODO: is this the best way to start threads?
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
dispatch_async(queue, ^{
size_t offs_src0 = 0;
size_t offs_src1 = 0;
size_t offs_dst = 0;
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = nil;
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
for (int i = node_start; i < node_end; ++i) {
for (int i = 0; i < gf->n_nodes; ++i) {
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@ -980,21 +956,14 @@ void ggml_metal_graph_compute(
}
[command_buffer commit];
});
}
// wait for all threads to finish
dispatch_barrier_sync(queue, ^{});
[command_buffers[n_cb - 1] waitUntilCompleted];
[command_buffer waitUntilCompleted];
// check status of command buffers
// needed to detect if the device ran out-of-memory for example (#1881)
for (int i = 0; i < n_cb; i++) {
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffer status];
if (status != MTLCommandBufferStatusCompleted) {
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
fprintf(stderr, "%s: command buffer failed with status %lu\n", __func__, status);
GGML_ASSERT(false);
}
}
}