diff --git a/ggml-metal.m b/ggml-metal.m index a673f0603..cd9d00456 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1238,7 +1238,7 @@ void ggml_metal_graph_compute( // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel - const int nth = MIN(1024, ne00); + const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00); [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -2239,7 +2239,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; - const int nth = MIN(1024, ne0); + const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0); [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index d5b54e112..3d6b8e7e8 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -3523,7 +3523,7 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg device const int8_t * qs = ((device const int8_t *)xb->qs); const half d = xb->d; - for (int i=0;i<16;i++) { + for (int i = 0; i < 16; i++) { reg[i/4][i%4] = (qs[i + 16*il] * d); } }