metal : reusing llama.cpp logging

2023-09-12 23:28:04 +02:00 · 2023-09-12 23:28:04 +02:00 · 84471ffc73
commit 84471ffc73
parent 89e89599fd
4 changed files with 80 additions and 71 deletions
--- a/6
+++ b/6
@ -427,7 +427,7 @@ endif
 endif # LLAMA_METAL
 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h llama.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_METAL
@ -551,7 +551,7 @@ speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o co
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ifdef LLAMA_METAL
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
+metal: examples/metal/metal.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif
@ -573,7 +573,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
+vdot: pocs/vdot/vdot.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -11,11 +11,12 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 // TODO: temporary - reuse llama.cpp logging
 #ifdef GGML_METAL_NDEBUG
-#define metal_printf(...)
+#define LLAMA_LOG_INFO(...)
 #define LLAMA_LOG_WARN(...)
 #define LLAMA_LOG_ERROR(...)
 #else
-#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
+#import "llama.h"
 #endif
 #define UNUSED(x) (void)(x)
@ -118,7 +119,7 @@ static NSString * const msl_library_source = @"see metal.metal";
@end
 struct ggml_metal_context * ggml_metal_init(int n_cb) {
-    metal_printf("%s: allocating\n", __func__);
+    LLAMA_LOG_INFO("%s: allocating\n", __func__);
    id <MTLDevice> device;
    NSString * s;
@ -128,14 +129,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    NSArray * devices = MTLCopyAllDevices();
    for (device in devices) {
        s = [device name];
-        metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
+        LLAMA_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
    }
 #endif
    // Pick and show default Metal device
    device = MTLCreateSystemDefaultDevice();
    s = [device name];
-    metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
+    LLAMA_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
    // Configure context
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
@ -162,7 +163,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
        if (error) {
-            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
@ -176,11 +177,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
        NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-        metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
+        LLAMA_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
        if (error) {
-            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
@ -192,7 +193,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
 #endif
        if (error) {
-            metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
+            LLAMA_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
@ -204,11 +205,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
-        metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
+        LLAMA_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
                (int) ctx->pipeline_##name.threadExecutionWidth); \
        if (error) { \
-            metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+          LLAMA_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }
@ -264,13 +265,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
    }
-    metal_printf("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    LLAMA_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 #if TARGET_OS_OSX
-    metal_printf("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    LLAMA_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    if (ctx->device.maxTransferRate != 0) {
-        metal_printf("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
    } else {
-        metal_printf("%s: maxTransferRate               = built-in GPU\n", __func__);
+        LLAMA_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
    }
 #endif
@ -278,7 +279,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 }
 void ggml_metal_free(struct ggml_metal_context * ctx) {
-    metal_printf("%s: deallocating\n", __func__);
+    LLAMA_LOG_INFO("%s: deallocating\n", __func__);
 #define GGML_METAL_DEL_KERNEL(name) \
    [ctx->function_##name release]; \
    [ctx->pipeline_##name release];
@ -350,7 +351,7 @@ void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
-        metal_printf("%s: error: posix_memalign failed\n", __func__);
+        LLAMA_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }
@ -378,7 +379,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 // Metal buffer based on the host memory pointer
 //
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+    //LLAMA_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
    const int64_t tsize = ggml_nbytes(t);
@ -389,13 +390,13 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
            *offs = (size_t) ioffs;
-            //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+            //LLAMA_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
            return ctx->buffers[i].metal;
        }
    }
-    metal_printf("%s: error: buffer is nil\n", __func__);
+    LLAMA_LOG_ERROR("%s: error: buffer is nil\n", __func__);
    return nil;
 }
@ -407,7 +408,7 @@ bool ggml_metal_add_buffer(
                         size_t   size,
                         size_t   max_size) {
    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
-        metal_printf("%s: too many buffers\n", __func__);
+        LLAMA_LOG_ERROR("%s: error: too many buffers\n", __func__);
        return false;
    }
@ -417,7 +418,7 @@ bool ggml_metal_add_buffer(
            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
-                metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
+                LLAMA_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
                return false;
            }
        }
@ -438,11 +439,11 @@ bool ggml_metal_add_buffer(
            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                LLAMA_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
                return false;
            }
-            metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
            ++ctx->n_buffers;
        } else {
@ -462,13 +463,13 @@ bool ggml_metal_add_buffer(
                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    LLAMA_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                    return false;
                }
-                metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                LLAMA_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
                if (i + size_step < size) {
-                    metal_printf("\n");
+                    LLAMA_LOG_INFO("\n");
                }
                ++ctx->n_buffers;
@ -476,17 +477,17 @@ bool ggml_metal_add_buffer(
        }
 #if TARGET_OS_OSX
-        metal_printf(", (%8.2f / %8.2f)",
+        LLAMA_LOG_INFO(", (%8.2f / %8.2f)",
                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
-            metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
+            LLAMA_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
        } else {
-            metal_printf("\n");
+            LLAMA_LOG_INFO("\n");
        }
 #else
-        metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
 #endif
    }
@ -599,7 +600,7 @@ void ggml_metal_graph_find_concurrency(
    }
    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
-        metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
+        LLAMA_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
    }
 }
@ -653,7 +654,7 @@ void ggml_metal_graph_compute(
                    continue;
                }
-                //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+                //LLAMA_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
@ -697,17 +698,17 @@ void ggml_metal_graph_compute(
                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
-                //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+                //LLAMA_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
                //if (src0) {
-                //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+                //    LLAMA_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
                //            ggml_is_contiguous(src0), src0->name);
                //}
                //if (src1) {
-                //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+                //    LLAMA_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
                //            ggml_is_contiguous(src1), src1->name);
                //}
                //if (dst) {
-                //    metal_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+                //    LLAMA_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
                //            dst->name);
                //}
@ -813,7 +814,7 @@ void ggml_metal_graph_compute(
                                } break;
                            default:
                                {
-                                    metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                    LLAMA_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                                    GGML_ASSERT(false);
                                }
                        } break;
@ -993,7 +994,7 @@ void ggml_metal_graph_compute(
                                        } break;
                                    default:
                                        {
-                                            metal_printf("Asserting on type %d\n",(int)src0t);
+                                            LLAMA_LOG_ERROR("Asserting on type %d\n",(int)src0t);
                                            GGML_ASSERT(false && "not implemented");
                                        }
                                };
@ -1234,7 +1235,7 @@ void ggml_metal_graph_compute(
                        } break;
                    default:
                        {
-                            metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                            LLAMA_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                            GGML_ASSERT(false);
                        }
                }
@ -1259,7 +1260,7 @@ void ggml_metal_graph_compute(
        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
-            metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            LLAMA_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
            GGML_ASSERT(false);
        }
    }
--- a/llama.cpp
+++ b/llama.cpp
@ -76,28 +76,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 #ifdef __GNUC__
 #ifdef __MINGW32__
 #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 #else
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 //
 // logging
 //
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
 static void llama_log_internal        (llama_log_level level, const char* format, ...);
 static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 //
 // helpers
 //
@ -6366,7 +6344,7 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) {
    g_state.log_callback_user_data = user_data;
 }
-static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+void llama_log_v(llama_log_level level, const char * format, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
@ -6383,14 +6361,14 @@ static void llama_log_internal_v(llama_log_level level, const char * format, va_
    va_end(args_copy);
 }
-static void llama_log_internal(llama_log_level level, const char * format, ...) {
+void llama_log(llama_log_level level, const char * format, ...) {
    va_list args;
    va_start(args, format);
-    llama_log_internal_v(level, format, args);
+    llama_log_v(level, format, args);
    va_end(args);
 }
-static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
--- a/llama.h
+++ b/llama.h
@ -532,6 +532,36 @@ extern "C" {
 }
 #endif
 #ifdef __GNUC__
 #ifdef __MINGW32__
 #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 #else
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 //
 // logging
 //
 #ifdef __cplusplus
 extern "C" {
 #endif
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
 void llama_log                 (enum llama_log_level level, const char* format, ...);
 void llama_log_callback_default(enum llama_log_level level, const char * text, void * user_data);
 #ifdef __cplusplus
 }
 #endif
 #define LLAMA_LOG_INFO(...)  llama_log(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL