metal: only encode in one command buffer
It's advised that a program should only have one command buffer. This slows inference by ~1 ms on a 33B model, but we may be able to avoid that slowdown by reusing the previous command queue.
This commit is contained in:
parent
d924522a46
commit
c8e6ef1846
1 changed files with 517 additions and 548 deletions
41
ggml-metal.m
41
ggml-metal.m
|
@ -358,39 +358,15 @@ void ggml_metal_graph_compute(
|
|||
struct ggml_cgraph * gf) {
|
||||
metal_printf("%s: evaluating graph\n", __func__);
|
||||
|
||||
// create multiple command buffers and enqueue them
|
||||
// then, we encode the graph into the command buffers in parallel
|
||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
|
||||
|
||||
const int n_cb = ctx->n_cb;
|
||||
|
||||
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
|
||||
|
||||
for (int i = 0; i < n_cb; ++i) {
|
||||
command_buffers[i] = [ctx->queue commandBuffer];
|
||||
|
||||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffers[i] enqueue];
|
||||
}
|
||||
|
||||
// TODO: is this the best way to start threads?
|
||||
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
dispatch_async(queue, ^{
|
||||
size_t offs_src0 = 0;
|
||||
size_t offs_src1 = 0;
|
||||
size_t offs_dst = 0;
|
||||
|
||||
id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = nil;
|
||||
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
|
||||
|
||||
for (int i = node_start; i < node_end; ++i) {
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||
|
@ -980,21 +956,14 @@ void ggml_metal_graph_compute(
|
|||
}
|
||||
|
||||
[command_buffer commit];
|
||||
});
|
||||
}
|
||||
|
||||
// wait for all threads to finish
|
||||
dispatch_barrier_sync(queue, ^{});
|
||||
|
||||
[command_buffers[n_cb - 1] waitUntilCompleted];
|
||||
[command_buffer waitUntilCompleted];
|
||||
|
||||
// check status of command buffers
|
||||
// needed to detect if the device ran out-of-memory for example (#1881)
|
||||
for (int i = 0; i < n_cb; i++) {
|
||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
|
||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffer status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
fprintf(stderr, "%s: command buffer failed with status %lu\n", __func__, status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue