diff --git a/.gitignore b/.gitignore
index edcb6b144..e4561ad73 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,7 +17,7 @@ build-release/
 build-static/
 build-cublas/
 build-opencl/
-build-mtl/
+build-metal/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62f1467aa..1f2e78c0f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -207,7 +207,7 @@ if (LLAMA_METAL)
     find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
     find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

-    set(GGML_SOURCES_METAL ggml-mtl.m ggml-mtl.h)
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

     add_compile_definitions(GGML_USE_METAL)
     add_compile_definitions(GGML_METAL_NDEBUG)
@@ -215,8 +215,8 @@ if (LLAMA_METAL)
     # get full path to the file
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

-    # copy ggml-mtl.metal to bin directory
-    configure_file(ggml-mtl.metal bin/ggml-mtl.metal COPYONLY)
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
         ${FOUNDATION_LIBRARY}
diff --git a/Makefile b/Makefile
index 8e8d426c5..1f910c3ec 100644
--- a/Makefile
+++ b/Makefile
@@ -105,6 +105,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	#CFLAGS   += -mfma -mf16c -mavx
 	#CXXFLAGS += -mfma -mf16c -mavx
 endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -116,6 +117,7 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework.
 	# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
@@ -123,7 +125,8 @@ ifndef LLAMA_NO_ACCELERATE
 		CFLAGS  += -DGGML_USE_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
-endif
+endif # LLAMA_NO_ACCELERATE
+
 ifdef LLAMA_OPENBLAS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
@@ -131,11 +134,13 @@ ifdef LLAMA_OPENBLAS
 	else
 		LDFLAGS += -lopenblas
 	endif
-endif
+endif # LLAMA_OPENBLAS
+
 ifdef LLAMA_BLIS
 	CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	LDFLAGS += -lblis -L/usr/local/lib
-endif
+endif # LLAMA_BLIS
+
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -156,9 +161,10 @@ endif # LLAMA_CUDA_DMMV_Y
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
+
 ifdef LLAMA_CLBLAST
-	CFLAGS  += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
+	CFLAGS   += -DGGML_USE_CLBLAST
+	CXXFLAGS += -DGGML_USE_CLBLAST
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
@@ -166,23 +172,38 @@ ifdef LLAMA_CLBLAST
 		LDFLAGS += -lclblast -lOpenCL
 	endif
 	OBJS    += ggml-opencl.o
+
 ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
+endif # LLAMA_CLBLAST
+
+ifdef LLAMA_METAL
+	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+	CXXFLAGS += -DGGML_USE_METAL
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	OBJS     += ggml-metal.o
+
+ggml-metal.o: ggml-metal.m ggml-metal.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_METAL
+
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
 	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
+
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, Zero
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
+
 ifneq ($(filter armv7%,$(UNAME_M)),)
 	# Raspberry Pi 2
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
+
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 3, 4, Zero 2 (32-bit)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
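Note: the new LLAMA_METAL block above is gated with a plain ifdef, so it is enabled by defining the variable on the make command line. Assuming the stock llama.cpp targets (the target name here is an assumption on my part, not something this diff adds), a Metal-enabled build would look along the lines of:

    make clean
    LLAMA_METAL=1 make main

This only illustrates how the ifdef is meant to be triggered; it is not a build recipe taken from this PR.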
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index e23bf1cb3..3deff4077 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -38,7 +38,7 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
     if (LLAMA_METAL)
-        add_subdirectory(mtl)
+        add_subdirectory(metal)
     endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt
new file mode 100644
index 000000000..a8c4284a5
--- /dev/null
+++ b/examples/metal/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(TEST_TARGET metal)
+add_executable(${TEST_TARGET} metal.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
diff --git a/examples/mtl/mtl.cpp b/examples/metal/metal.cpp
similarity index 77%
rename from examples/mtl/mtl.cpp
rename to examples/metal/metal.cpp
index 56510904c..fc1db90a1 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/metal/metal.cpp
@@ -1,5 +1,5 @@
 #include "ggml.h"
-#include "ggml-mtl.h"
+#include "ggml-metal.h"

 #include <cstdio>
 #include <cstring>
@@ -23,20 +23,20 @@ int main(int argc, char ** argv) {
     gf.n_threads = 1;

     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = ggml_mtl_init();
+    auto * ctx_metal = ggml_metal_init();

-    ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
-    ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
+    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
+    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

     // main
     {
         struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
         *(int32_t *) input->data = 1; // BOS

-        ggml_mtl_set_tensor(ctx_mtl, input);
+        ggml_metal_set_tensor(ctx_metal, input);

         // warmup
-        ggml_mtl_graph_compute(ctx_mtl, &gf);
+        ggml_metal_graph_compute(ctx_metal, &gf);

         const int n_iter = 16;

@@ -44,7 +44,7 @@ int main(int argc, char ** argv) {

         // the actual inference happens here
         for (int i = 0; i < n_iter; ++i) {
-            ggml_mtl_graph_compute(ctx_mtl, &gf);
+            ggml_metal_graph_compute(ctx_metal, &gf);
         }

         const int64_t t1 = ggml_time_us();
@@ -55,7 +55,7 @@ int main(int argc, char ** argv) {
         // debug output
         {
             struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
-            ggml_mtl_get_tensor(ctx_mtl, logits);
+            ggml_metal_get_tensor(ctx_metal, logits);

             float * ptr = (float *) ggml_get_data(logits);

@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {
         printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
     }

-    ggml_mtl_free(ctx_mtl);
+    ggml_metal_free(ctx_metal);

     ggml_free(ctx_data);
     ggml_free(ctx_eval);
diff --git a/examples/mtl/CMakeLists.txt b/examples/mtl/CMakeLists.txt
deleted file mode 100644
index 0fe3a7197..000000000
--- a/examples/mtl/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-if (APPLE)
-    set(TEST_TARGET mtl)
-    add_executable(${TEST_TARGET} mtl.cpp)
-    target_link_libraries(${TEST_TARGET} PRIVATE ggml)
-endif()
-
diff --git a/ggml-metal.h b/ggml-metal.h
new file mode 100644
index 000000000..a9441a9d4
--- /dev/null
+++ b/ggml-metal.h
@@ -0,0 +1,63 @@
+// An interface for computing a ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How it works:
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_metal_context;
+
+struct ggml_metal_context * ggml_metal_init(void);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// same as ggml_graph_compute but uses Metal
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
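To make the contract of the new header concrete, here is a minimal C sketch of the intended call sequence (it mirrors examples/metal/metal.cpp above; the two ggml contexts and the populated graph are assumed to already exist, e.g. loaded with ggml_graph_import):

    #include "ggml.h"
    #include "ggml-metal.h"

    // assumed inputs: ctx_data/ctx_eval hold the graph's buffers, gf is the graph itself
    static void eval_on_gpu(struct ggml_context * ctx_data,
                            struct ggml_context * ctx_eval,
                            struct ggml_cgraph  * gf) {
        struct ggml_metal_context * ctx_metal = ggml_metal_init();

        // map every host buffer used during graph creation to device memory
        ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
        ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));

        // host -> device, compute on the GPU, device -> host
        struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
        ggml_metal_set_tensor(ctx_metal, input);

        ggml_metal_graph_compute(ctx_metal, gf);

        struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
        ggml_metal_get_tensor(ctx_metal, output);

        ggml_metal_free(ctx_metal);
    }

The tensor name "embd" follows the example program; any name registered in the graph works with ggml_graph_get_tensor.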
diff --git a/ggml-mtl.m b/ggml-metal.m
similarity index 70%
rename from ggml-mtl.m
rename to ggml-metal.m
index 8f831afe7..3cb423a01 100644
--- a/ggml-mtl.m
+++ b/ggml-metal.m
@@ -1,28 +1,30 @@
-#import "ggml-mtl.h"
+#import "ggml-metal.h"

 #import "ggml.h"

 #import <Foundation/Foundation.h>
+
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>

 #ifdef GGML_METAL_NDEBUG
-#define mtl_printf(...)
+#define metal_printf(...)
 #else
-#define mtl_printf(...) fprintf(stderr, __VA_ARGS__)
+#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
 #endif

-//#define mtl_printf(...)
-struct ggml_mtl_buffer {
+#define UNUSED(x) (void)(x)
+
+struct ggml_metal_buffer {
     const char * name;

     void   * data;
     size_t   size;

-    id<MTLBuffer> mtl;
+    id<MTLBuffer> metal;
 };

-struct ggml_mtl_context {
+struct ggml_metal_context {
     float * logits;

     id<MTLDevice> device;
@@ -30,65 +32,41 @@ struct ggml_mtl_context {
     id<MTLLibrary> library;

     int n_buffers;
-    struct ggml_mtl_buffer buffers[GGML_METAL_MAX_BUFFERS];
+    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];

     // custom kernels
-    id<MTLFunction>             function_add;
-    id<MTLComputePipelineState> pipeline_add;
+#define GGML_METAL_DECL_KERNEL(name) \
+    id<MTLFunction>             function_##name; \
+    id<MTLComputePipelineState> pipeline_##name

-    id<MTLFunction>             function_mul;
-    id<MTLComputePipelineState> pipeline_mul;
+    GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(mul);
+    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
+    GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(silu);
+    GGML_METAL_DECL_KERNEL(relu);
+    GGML_METAL_DECL_KERNEL(soft_max);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
+    GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DECL_KERNEL(cpy_f32_f32);

-    // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
-    id<MTLFunction>             function_mul_row;
-    id<MTLComputePipelineState> pipeline_mul_row;
-
-    id<MTLFunction>             function_scale;
-    id<MTLComputePipelineState> pipeline_scale;
-
-    id<MTLFunction>             function_silu;
-    id<MTLComputePipelineState> pipeline_silu;
-
-    id<MTLFunction>             function_relu;
-    id<MTLComputePipelineState> pipeline_relu;
-
-    id<MTLFunction>             function_soft_max;
-    id<MTLComputePipelineState> pipeline_soft_max;
-
-    id<MTLFunction>             function_diag_mask_inf;
-    id<MTLComputePipelineState> pipeline_diag_mask_inf;
-
-    id<MTLFunction>             function_get_rows_q4_0;
-    id<MTLComputePipelineState> pipeline_get_rows_q4_0;
-
-    id<MTLFunction>             function_rms_norm;
-    id<MTLComputePipelineState> pipeline_rms_norm;
-
-    id<MTLFunction>             function_mul_mat_q4_0_f32;
-    id<MTLComputePipelineState> pipeline_mul_mat_q4_0_f32;
-
-    id<MTLFunction>             function_mul_mat_f16_f32;
-    id<MTLComputePipelineState> pipeline_mul_mat_f16_f32;
-
-    id<MTLFunction>             function_rope;
-    id<MTLComputePipelineState> pipeline_rope;
-
-    id<MTLFunction>             function_cpy_f32_f16;
-    id<MTLComputePipelineState> pipeline_cpy_f32_f16;
-
-    id<MTLFunction>             function_cpy_f32_f32;
-    id<MTLComputePipelineState> pipeline_cpy_f32_f32;
+#undef GGML_METAL_DECL_KERNEL
 };

 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-NSString * const msl_library_source = @"see mtl.metal";
+static NSString * const msl_library_source = @"see metal.metal";

-struct ggml_mtl_context * ggml_mtl_init(void) {
+struct ggml_metal_context * ggml_metal_init(void) {
     fprintf(stderr, "%s: allocating\n", __func__);

-    struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];
@@ -113,12 +91,14 @@ struct ggml_mtl_context * ggml_mtl_init(void) {
         }
     }
 #else
-    // read the source from "../examples/mtl/mtl.metal" into a string and use newLibraryWithSource
+    UNUSED(msl_library_source);
+
+    // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
     {
         NSError * error = nil;

-        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/mtl/mtl" ofType:@"metal"];
-        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-mtl" ofType:@"metal"];
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
+        NSString * path = [[NSBundle mainBundle] pathForResource:@"ggml-metal" ofType:@"metal"];
         fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);

         NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
@@ -137,80 +117,44 @@ struct ggml_mtl_context * ggml_mtl_init(void) {

     // load kernels
     {
-        MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new];
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
+        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);

-        ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
-        ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
-        fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, (void *) ctx->pipeline_add);
+        GGML_METAL_ADD_KERNEL(add);
+        GGML_METAL_ADD_KERNEL(mul);
+        GGML_METAL_ADD_KERNEL(mul_row);
+        GGML_METAL_ADD_KERNEL(scale);
+        GGML_METAL_ADD_KERNEL(silu);
+        GGML_METAL_ADD_KERNEL(relu);
+        GGML_METAL_ADD_KERNEL(soft_max);
+        GGML_METAL_ADD_KERNEL(diag_mask_inf);
+        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
+        GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
+        GGML_METAL_ADD_KERNEL(cpy_f32_f32);

-        ctx->function_mul = [ctx->library newFunctionWithName:@"kernel_mul"];
-        ctx->pipeline_mul = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul error:nil];
-        fprintf(stderr, "%s: loaded kernel_mul: %p\n", __func__, (void *) ctx->pipeline_mul);
-
-        ctx->function_mul_row = [ctx->library newFunctionWithName:@"kernel_mul_row"];
-        ctx->pipeline_mul_row = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_row error:nil];
-        fprintf(stderr, "%s: loaded kernel_mul_row: %p\n", __func__, (void *) ctx->pipeline_mul_row);
-
-        ctx->function_scale = [ctx->library newFunctionWithName:@"kernel_scale"];
-        ctx->pipeline_scale = [ctx->device newComputePipelineStateWithFunction:ctx->function_scale error:nil];
-        fprintf(stderr, "%s: loaded kernel_scale: %p\n", __func__, (void *) ctx->pipeline_scale);
-
-        ctx->function_silu = [ctx->library newFunctionWithName:@"kernel_silu"];
-        ctx->pipeline_silu = [ctx->device newComputePipelineStateWithFunction:ctx->function_silu error:nil];
-        fprintf(stderr, "%s: loaded kernel_silu: %p\n", __func__, (void *) ctx->pipeline_silu);
-
-        ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
-        ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
-        fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, (void *) ctx->pipeline_relu);
-
-        ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
-        ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
-        fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, (void *) ctx->pipeline_soft_max);
-
-        ctx->function_diag_mask_inf = [ctx->library newFunctionWithName:@"kernel_diag_mask_inf" constantValues:constants error:nil];
-        ctx->pipeline_diag_mask_inf = [ctx->device newComputePipelineStateWithFunction:ctx->function_diag_mask_inf error:nil];
-        fprintf(stderr, "%s: loaded kernel_diag_mask_inf: %p\n", __func__, (void *) ctx->pipeline_diag_mask_inf);
-
-        ctx->function_get_rows_q4_0 = [ctx->library newFunctionWithName:@"kernel_get_rows_q4_0"];
-        ctx->pipeline_get_rows_q4_0 = [ctx->device newComputePipelineStateWithFunction:ctx->function_get_rows_q4_0 error:nil];
fprintf(stderr, "%s: loaded kernel_get_rows_q4_0: %p\n", __func__, (void *) ctx->pipeline_get_rows_q4_0); - - ctx->function_rms_norm = [ctx->library newFunctionWithName:@"kernel_rms_norm"]; - ctx->pipeline_rms_norm = [ctx->device newComputePipelineStateWithFunction:ctx->function_rms_norm error:nil]; - fprintf(stderr, "%s: loaded kernel_rms_norm: %p\n", __func__, (void *) ctx->pipeline_rms_norm); - - ctx->function_mul_mat_q4_0_f32 = [ctx->library newFunctionWithName:@"kernel_mul_mat_q4_0_f32"]; - ctx->pipeline_mul_mat_q4_0_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_q4_0_f32 error:nil]; - fprintf(stderr, "%s: loaded kernel_mul_mat_q4_0_f32: %p\n", __func__, (void *) ctx->pipeline_mul_mat_q4_0_f32); - - ctx->function_mul_mat_f16_f32 = [ctx->library newFunctionWithName:@"kernel_mul_mat_f16_f32"]; - ctx->pipeline_mul_mat_f16_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_mul_mat_f16_f32 error:nil]; - fprintf(stderr, "%s: loaded kernel_mul_mat_f16_f32: %p\n", __func__, (void *) ctx->pipeline_mul_mat_f16_f32); - - ctx->function_rope = [ctx->library newFunctionWithName:@"kernel_rope"]; - ctx->pipeline_rope = [ctx->device newComputePipelineStateWithFunction:ctx->function_rope error:nil]; - fprintf(stderr, "%s: loaded kernel_rope: %p\n", __func__, (void *) ctx->pipeline_rope); - - ctx->function_cpy_f32_f16 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f16"]; - ctx->pipeline_cpy_f32_f16 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f16 error:nil]; - fprintf(stderr, "%s: loaded kernel_cpy_f32_f16: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f16); - - ctx->function_cpy_f32_f32 = [ctx->library newFunctionWithName:@"kernel_cpy_f32_f32"]; - ctx->pipeline_cpy_f32_f32 = [ctx->device newComputePipelineStateWithFunction:ctx->function_cpy_f32_f32 error:nil]; - fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32); +#undef GGML_METAL_ADD_KERNEL } return ctx; } -void ggml_mtl_free(struct ggml_mtl_context * ctx) { +void ggml_metal_free(struct ggml_metal_context * ctx) { fprintf(stderr, "%s: deallocating\n", __func__); free(ctx); } -// get data / eval buffer + offset -static id ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { +// finds the Metal buffer that contains the tensor data on the GPU device +// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the +// Metal buffer based on the host memory pointer +// +static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); for (int i = 0; i < ctx->n_buffers; ++i) { @@ -221,64 +165,75 @@ static id ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct g //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); - return ctx->buffers[i].mtl; + return ctx->buffers[i].metal; } } fprintf(stderr, "%s: error: buffer is nil\n", __func__); - GGML_ASSERT(false); return nil; } -void ggml_mtl_add_buffer( - struct ggml_mtl_context * ctx, +bool ggml_metal_add_buffer( + struct ggml_metal_context * ctx, const char * name, void * data, size_t size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { fprintf(stderr, "%s: too many buffers\n", __func__); - return; + return false; } if (data) { + // verify 
+        // verify that the buffer does not overlap with any of the existing buffers
+        for (int i = 0; i < ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
+
+            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+                fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
+                return false;
+            }
+        }
+
         ctx->buffers[ctx->n_buffers].name = name;
         ctx->buffers[ctx->n_buffers].data = data;
         ctx->buffers[ctx->n_buffers].size = size;

-        ctx->buffers[ctx->n_buffers].mtl = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];

         ++ctx->n_buffers;

-        fprintf(stderr, "%s: allocated '%16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
+        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
     }
+
+    return true;
 }

-void ggml_mtl_set_tensor(
-        struct ggml_mtl_context * ctx,
+void ggml_metal_set_tensor(
+        struct ggml_metal_context * ctx,
         struct ggml_tensor * t) {
-    mtl_printf("%s: set input for tensor '%s'\n", __func__, t->name);
+    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);

     size_t offs;
-    id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, t, &offs);
+    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);

     memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
 }

-void ggml_mtl_get_tensor(
-        struct ggml_mtl_context * ctx,
+void ggml_metal_get_tensor(
+        struct ggml_metal_context * ctx,
         struct ggml_tensor * t) {
-    mtl_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
+    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);

     size_t offs;
-    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, t, &offs);
+    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);

     memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
 }

-int ggml_mtl_graph_compute(
-        struct ggml_mtl_context * ctx,
+void ggml_metal_graph_compute(
+        struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
-    mtl_printf("%s: evaluating graph\n", __func__);
+    metal_printf("%s: evaluating graph\n", __func__);

     size_t offs_src0 = 0;
     size_t offs_src1 = 0;
@@ -288,7 +243,7 @@ int ggml_mtl_graph_compute(
     id<MTLComputeCommandEncoder> encoder = nil;

     for (int i = 0; i < gf->n_nodes; ++i) {
-        //mtl_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+        //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

         struct ggml_tensor * src0 = gf->nodes[i]->src0;
         struct ggml_tensor * src1 = gf->nodes[i]->src1;
@@ -307,12 +262,12 @@ int ggml_mtl_graph_compute(
         const int64_t  ne10 = src1 ? src1->ne[0] : 0;
         const int64_t  ne11 = src1 ? src1->ne[1] : 0;
         const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-        //const int64_t ne13 = src1 ? src1->ne[3] : 0;
+        const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);

         const uint64_t nb10 = src1 ? src1->nb[0] : 0;
         const uint64_t nb11 = src1 ? src1->nb[1] : 0;
         const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-        //const uint64_t nb13 = src1 ? src1->nb[3] : 0;
+        const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);

         const int64_t  ne0 = dst ? dst->ne[0] : 0;
         const int64_t  ne1 = dst ? dst->ne[1] : 0;
@@ -328,21 +283,21 @@ int ggml_mtl_graph_compute(
         const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
         const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
-        id<MTLBuffer> id_src0 = src0 ? ggml_mtl_get_buffer(ctx, src0, &offs_src0) : nil;
-        id<MTLBuffer> id_src1 = src1 ? ggml_mtl_get_buffer(ctx, src1, &offs_src1) : nil;
-        id<MTLBuffer> id_dst  = dst  ? ggml_mtl_get_buffer(ctx, dst,  &offs_dst)  : nil;
+        id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
+        id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
+        id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;

-        //mtl_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+        //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
         //if (src0) {
-        //    mtl_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+        //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
         //            ggml_is_contiguous(src0), src0->name);
         //}
         //if (src1) {
-        //    mtl_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+        //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
         //            ggml_is_contiguous(src1), src1->name);
         //}
         //if (dst) {
-        //    mtl_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
+        //    metal_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
         //            dst->name);
         //}

@@ -472,6 +427,8 @@ int ggml_mtl_graph_compute(
                 } break;
             case GGML_OP_MUL_MAT:
                 {
+                    // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
+
                     GGML_ASSERT(ne00 == ne10);
                     GGML_ASSERT(ne02 == ne12);

@@ -503,6 +460,7 @@ int ggml_mtl_graph_compute(
                     // we need to do ne02 multiplications
                     // TODO: is there a way to do this in parallel - currently very slow ..
+                    // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
                     for (int64_t i02 = 0; i02 < ne02; ++i02) {
                         size_t offs_src0_cur = offs_src0 + i02*nb02;
                         size_t offs_src1_cur = offs_src1 + i02*nb12;
@@ -578,10 +536,7 @@ int ggml_mtl_graph_compute(
                 switch (src0->type) {
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
-                    default: {
-                        // not implemented
-                        fprintf(stderr, "%s: node %3d, op = %8s, type = %8s not implemented\n", __func__, i, ggml_op_name(dst->op), ggml_type_name(src0->type));
-                    }
+                    default: GGML_ASSERT(false && "not implemented");
                 }

                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -626,10 +581,6 @@ int ggml_mtl_graph_compute(
                 const int n_dims = ((int32_t *) src1->data)[1];
                 const int mode   = ((int32_t *) src1->data)[2];

-                //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03);
-                //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3);
-                //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode);
-
                 const int n_past = ((int32_t *)(src1->data))[0];

                 [encoder setComputePipelineState:ctx->pipeline_rope];
@@ -665,12 +616,6 @@ int ggml_mtl_graph_compute(

                 const int nth = 32;

-                //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", ne00, ne01, ne02, ne03);
-                //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", nb00, nb01, nb02, nb03);
-                //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3);
-                //mtl_printf("cpy: %lld x %lld x %lld x %lld\n", nb0, nb1, nb2, nb3);
-                //mtl_printf("cpy: %s -> %s\n", ggml_type_name(src0t), ggml_type_name(dstt));
-
                 switch (src0t) {
                     case GGML_TYPE_F32:
                         {
@@ -707,7 +652,6 @@ int ggml_mtl_graph_compute(
             default:
                 fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                 GGML_ASSERT(false);
-                return -1;
         }
     }

@@ -721,8 +665,8 @@ int ggml_mtl_graph_compute(
     {
         const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
-        mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
-    }
+        UNUSED(time_elapsed);

-    return 0;
+        metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
+    }
 }
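For reference, the two kernel macros introduced in ggml-metal.m above are purely mechanical shorthand for the per-kernel boilerplate that the deleted lines spelled out by hand. For example, GGML_METAL_DECL_KERNEL(add) and GGML_METAL_ADD_KERNEL(add) expand (modulo whitespace) to:

    // inside struct ggml_metal_context:
    id<MTLFunction>             function_add;
    id<MTLComputePipelineState> pipeline_add;

    // inside ggml_metal_init():
    ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
    ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
    fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_add", (void *) ctx->pipeline_add);

The only behavioral differences from the deleted code are the unified log format and the dropped MTLFunctionConstantValues argument that soft_max and diag_mask_inf previously passed to newFunctionWithName:constantValues:error:.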
diff --git a/ggml-mtl.metal b/ggml-metal.metal
similarity index 97%
rename from ggml-mtl.metal
rename to ggml-metal.metal
index 53f7f7448..4bedc8ea4 100644
--- a/ggml-mtl.metal
+++ b/ggml-metal.metal
@@ -98,19 +98,6 @@ kernel void kernel_soft_max(
     device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
     device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

-    //float max = 0.0f;
-    //for (int i = 0; i < ne00; i++) {
-    //    max = MAX(max, psrc0[i]);
-    //}
-    //float sum = 0.0f;
-    //for (int i = 0; i < ne00; i++) {
-    //    pdst[i] = exp(psrc0[i] - max);
-    //    sum += pdst[i];
-    //}
-    //for (int i = 0; i < ne00; i++) {
-    //    pdst[i] /= sum;
-    //}
-
     // parallel max
     buf[tpitg[0]] = -INFINITY;
     for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
diff --git a/ggml-mtl.h b/ggml-mtl.h
deleted file mode 100644
index cab71e386..000000000
--- a/ggml-mtl.h
+++ /dev/null
@@ -1,44 +0,0 @@
-#pragma once
-
-#include <stddef.h>
-
-#define GGML_METAL_MAX_BUFFERS 16
-
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_mtl_context;
-
-struct ggml_mtl_context * ggml_mtl_init(void);
-
-void ggml_mtl_free(struct ggml_mtl_context * ctx);
-
-void ggml_mtl_add_buffer(
-        struct ggml_mtl_context * ctx,
-                     const char * name,
-                           void * data,
-                         size_t   size);
-
-// set data from host memory into the device
-void ggml_mtl_set_tensor(
-        struct ggml_mtl_context * ctx,
-        struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_mtl_get_tensor(
-        struct ggml_mtl_context * ctx,
-        struct ggml_tensor * t);
-
-// return 0 on success
-int ggml_mtl_graph_compute(
-        struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf);
-
-#ifdef __cplusplus
-}
-#endif
-
diff --git a/llama.cpp b/llama.cpp
index 455402a4e..4b22b215a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -17,7 +17,7 @@
 #endif

 #ifdef GGML_USE_METAL
-#include "ggml-mtl.h"
+#include "ggml-metal.h"
 #endif

 #include <array>
@@ -243,7 +243,7 @@ struct llama_context {
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

 #ifdef GGML_USE_METAL
-    ggml_mtl_context * mtl_ctx = NULL;
+    ggml_metal_context * ctx_metal = NULL;
 #endif

     int buf_last = 0;
@@ -1256,8 +1256,8 @@ static bool llama_eval_internal(
         memcpy(embd->data, tokens, N*ggml_element_size(embd));

 #ifdef GGML_USE_METAL
-    if (lctx.mtl_ctx) {
-        ggml_mtl_set_tensor(lctx.mtl_ctx, embd);
+    if (lctx.ctx_metal) {
+        ggml_metal_set_tensor(lctx.ctx_metal, embd);
     }
 #endif

@@ -1448,11 +1448,25 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, cur);

 #ifdef GGML_USE_METAL
-    if (lctx.mtl_ctx) {
-        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf);
-        ggml_mtl_get_tensor(lctx.mtl_ctx, cur);
+    if (lctx.ctx_metal && N == 1) {
+        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
+        // IMPORTANT:
+        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fall back to vanilla
+        // ggml_graph_compute(). It uses Apple's Accelerate CBLAS API, which takes advantage of the ANE or the AMX
+        // coprocessor.
+        //
+        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
+        // But for now, we have focused only on Matrix x Vector Metal multiplication.
+        //
         ggml_graph_compute(ctx0, &gf);
+
+        if (lctx.ctx_metal) {
+            // We need to sync the CPU KV cache with the GPU KV cache
+            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
+            ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
+        }
     }
 #else
     ggml_graph_compute(ctx0, &gf);
@@ -2318,12 +2332,7 @@ struct llama_context * llama_init_from_file(
         ctx->embedding.resize(hparams.n_embd);
     }

-#ifdef GGML_USE_METAL
-    // when using Metal, we don't need the extra buffer for intermediate dequantization
-    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)/100);
-#else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
-#endif

     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
@@ -2333,19 +2342,19 @@ struct llama_context * llama_init_from_file(
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
         if (params.use_mmap) {
-            ctx->mtl_ctx = ggml_mtl_init();
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+            ctx->ctx_metal = ggml_metal_init();
+            ggml_metal_add_buffer(ctx->ctx_metal, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
         } else {
-            ctx->mtl_ctx = ggml_mtl_init();
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
-            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+            ctx->ctx_metal = ggml_metal_init();
+            ggml_metal_add_buffer(ctx->ctx_metal, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
+            ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
         }
     }
 #endif
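One behavioral note on the API change above: ggml_metal_add_buffer now returns bool (false when GGML_METAL_MAX_BUFFERS is exceeded or when the new buffer overlaps an already-mapped one), yet the llama.cpp call sites still discard the result. A caller that prefers to fail fast could wrap it along these lines (a hypothetical helper, not part of this diff):

    // hypothetical fail-fast wrapper; needs <stdio.h>, <stdlib.h> and "ggml-metal.h"
    static void llama_metal_add_buffer_checked(struct ggml_metal_context * ctx,
                                               const char * name, void * data, size_t size) {
        if (!ggml_metal_add_buffer(ctx, name, data, size)) {
            fprintf(stderr, "failed to map '%s' buffer to the Metal device\n", name);
            abort();
        }
    }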