From 95b6e5212f5e4e1419de1d833d7f8d788f9f2227 Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:33:27 -0700 Subject: [PATCH 1/7] added `struct` to llama_dump_timing_info_yaml's `llama_context` (#2857) fixes C compat. --- llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.h b/llama.h index b38d3be20..6e5e1df63 100644 --- a/llama.h +++ b/llama.h @@ -521,7 +521,7 @@ extern "C" { // If this is not called, or NULL is supplied, everything is output on stderr. LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); - LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx); + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); #ifdef __cplusplus } From 611363ac791435497e66278dfe31ac8a4e11fa4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 29 Aug 2023 10:50:30 +0300 Subject: [PATCH 2/7] scripts : add pipefail --- scripts/qnt-all.sh | 1 + scripts/run-all-perf.sh | 1 + scripts/run-all-ppl.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh index 1376e4194..b4c2a159e 100755 --- a/scripts/qnt-all.sh +++ b/scripts/qnt-all.sh @@ -20,6 +20,7 @@ fi model="$1" out="../tmp/results-${model}" +set -o pipefail set -e mkdir -p ${out} diff --git a/scripts/run-all-perf.sh b/scripts/run-all-perf.sh index 7391e3dd5..6384e364d 100755 --- a/scripts/run-all-perf.sh +++ b/scripts/run-all-perf.sh @@ -20,6 +20,7 @@ fi model="$1" out="../tmp/results-${model}" +set -o pipefail set -e mkdir -p ${out} diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh index f643ca3ae..e04d61d7f 100755 --- a/scripts/run-all-ppl.sh +++ b/scripts/run-all-ppl.sh @@ -17,6 +17,7 @@ if [ ! -z "$3" ]; then args="$3" fi +set -o pipefail set -e model="$1" From 3a007648f230ea37d6cca5e63850f04ebb12d2cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 29 Aug 2023 11:33:46 +0300 Subject: [PATCH 3/7] metal : add option to disable debug logs (close #2764) --- CMakeLists.txt | 2 +- Makefile | 2 +- ggml-metal.m | 71 +++++++++++++++++++++++--------------------------- 3 files changed, 35 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba008bcc6..1eae2d670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,7 +301,7 @@ if (LLAMA_METAL) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - add_compile_definitions(GGML_METAL_NDEBUG) + #add_compile_definitions(GGML_METAL_NDEBUG) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/Makefile b/Makefile index e60821dd5..a64374e7d 100644 --- a/Makefile +++ b/Makefile @@ -305,7 +305,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif # LLAMA_HIPBLAS ifdef LLAMA_METAL - CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG + CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG CXXFLAGS += -DGGML_USE_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJS += ggml-metal.o diff --git a/ggml-metal.m b/ggml-metal.m index ad2ee8cf5..e929c4b07 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -11,6 +11,7 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG #define metal_printf(...) 
#else @@ -113,7 +114,7 @@ static NSString * const msl_library_source = @"see metal.metal"; @end struct ggml_metal_context * ggml_metal_init(int n_cb) { - fprintf(stderr, "%s: allocating\n", __func__); + metal_printf("%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); @@ -132,7 +133,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -146,11 +147,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); + metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -162,7 +163,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -174,11 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -230,19 +231,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); if (ctx->device.maxTransferRate != 0) { - fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); + metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); } return ctx; } void ggml_metal_free(struct ggml_metal_context * ctx) { - fprintf(stderr, "%s: deallocating\n", __func__); + metal_printf("%s: deallocating\n", __func__); #define GGML_METAL_DEL_KERNEL(name) \ [ctx->function_##name release]; \ [ctx->pipeline_##name release]; @@ -311,7 +312,7 @@ void * ggml_metal_host_malloc(size_t n) { void * data = NULL; const int result = posix_memalign((void **) &data, getpagesize(), n); if (result != 0) { - fprintf(stderr, "%s: error: posix_memalign failed\n", __func__); + metal_printf("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -339,7 +340,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -350,13 +351,13 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - fprintf(stderr, "%s: error: buffer is nil\n", __func__); + metal_printf("%s: error: buffer is nil\n", __func__); return nil; } @@ -368,7 +369,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - fprintf(stderr, "%s: too many buffers\n", __func__); + metal_printf("%s: too many buffers\n", __func__); return false; } @@ -378,7 +379,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -399,11 +400,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, 
size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -423,27 +424,27 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - fprintf(stderr, "\n"); + metal_printf("\n"); } ++ctx->n_buffers; } } - fprintf(stderr, ", (%8.2f / %8.2f)", + metal_printf(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); + metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); } else { - fprintf(stderr, "\n"); + metal_printf("\n"); } } @@ -453,8 +454,6 @@ bool ggml_metal_add_buffer( void ggml_metal_set_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: set input for tensor '%s'\n", __func__, t->name); - size_t offs; id id_dst = ggml_metal_get_buffer(ctx, t, &offs); @@ -464,8 +463,6 @@ void ggml_metal_set_tensor( void ggml_metal_get_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name); - size_t offs; id id_src = ggml_metal_get_buffer(ctx, t, &offs); @@ -560,15 +557,13 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); + metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); } } void ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { - metal_printf("%s: evaluating graph\n", __func__); - @autoreleasepool { // if there is ctx->concur_list, dispatch concurrently @@ -616,7 +611,7 @@ void ggml_metal_graph_compute( continue; } - metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -764,7 +759,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -923,7 +918,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "Asserting on type %d\n",(int)src0t); + metal_printf("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1161,7 +1156,7 @@ void ggml_metal_graph_compute( } break; default: { - 
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1186,7 +1181,7 @@ void ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); + metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } } From d4b5e16c32ba9c5fa6bbd035e80a99c113050cde Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 29 Aug 2023 04:42:41 -0400 Subject: [PATCH 4/7] make : fix clang tests build, add missing examples (#2859) * make : do not pass headers to the compiler This fixes building tests with clang. * make : add missing examples * make : fix build-info.h dependencies --- Makefile | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index a64374e7d..02ba3e36d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1 @@ -356,7 +356,7 @@ OBJS += ggml-alloc.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common.o: common/common.cpp common/common.h +common.o: common/common.cpp common/common.h build-info.h $(CXX) $(CXXFLAGS) -c $< -o $@ console.o: common/console.cpp common/console.h @@ -369,7 +369,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS) + rm -vf *.o *.so *.dll benchmark-matmult build-info.h $(BUILD_TARGETS) $(TEST_TARGETS) # # Examples @@ -409,18 +409,33 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. 
-lembdinput -gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) +gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS) +train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS) +convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +beam_search: examples/beam_search/beam_search.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' +BUILD_TARGETS += metal +endif + +ifdef LLAMA_METAL +metal: examples/metal/metal.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +endif + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! cmp -s $@.tmp $@; then \ @@ -443,34 +458,34 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) 
$(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) From 74e0caeb82fc9db77fa2cc93070bb919a9a935dd Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Tue, 29 Aug 2023 17:30:10 +0800 Subject: [PATCH 5/7] readme : add react-native binding (#2869) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bf3eb0b76..8d54a558d 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,7 @@ as the main playground for developing new features for the [ggml](https://github - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) +- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) **UI:** From bcce96ba4dd95482824700c4ce2455fe8c49055a Mon Sep 17 00:00:00 2001 From: jameswu2014 <545426914@qq.com> Date: Tue, 29 Aug 2023 17:48:41 +0800 Subject: [PATCH 6/7] convert.py : fix baichuan7B support (#2870) * [Fix]: convert.py support baichuan7B * convert.py : fix trailing whitespaces --------- Co-authored-by: Georgi Gerganov --- convert.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/convert.py b/convert.py index a15e6ccd2..3f0a1c932 100755 --- a/convert.py +++ b/convert.py @@ -469,7 +469,7 @@ class UnquantizedTensor(Tensor): def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head)) def part(self, n_part: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 @@ -952,9 +952,10 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) + tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] else: break From 53885d7256909ec3e2176cdc2477f3986c15ec69 Mon Sep 17 00:00:00 2001 From: maddes8cht <55592906+maddes8cht@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:51:02 +0200 Subject: [PATCH 7/7] py : fix "usage" messages (#2873) 
convert-to-gguf python scripts --- convert-falcon-hf-to-gguf.py | 2 +- convert-gptneox-hf-to-gguf.py | 2 +- convert-llama-7b-pth-to-gguf.py | 2 +- convert-llama-hf-to-gguf.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 411cbf682..168bcf17f 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -48,7 +48,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") sys.exit(1) diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index 6eeff5bb1..d9c42d76b 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -50,7 +50,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") sys.exit(1) diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py index f103f5f61..2ab082383 100755 --- a/convert-llama-7b-pth-to-gguf.py +++ b/convert-llama-7b-pth-to-gguf.py @@ -32,7 +32,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py index 08fde238b..b00810dbb 100755 --- a/convert-llama-hf-to-gguf.py +++ b/convert-llama-hf-to-gguf.py @@ -44,7 +44,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16")
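
Background note on the first patch in this series: llama.h is consumed from both C and C++ through its extern "C" block. C++ allows a struct tag such as llama_context to be used directly as a type name, but C does not, so the original prototype did not compile as C; adding the `struct` keyword restores C compatibility. The sketch below is illustrative only — the type and function names mirror llama.h, but the snippet itself is not part of the patch series.

    #include <stdio.h>

    struct llama_context;   /* opaque type, forward-declared as in llama.h */

    /* In C++ the bare tag name is accepted as a type:
     *   void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx);
     * A C compiler only knows the type as "struct llama_context", so the
     * header must spell out the keyword — which is what PATCH 1/7 adds: */
    void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);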