diff --git a/Makefile b/Makefile
index 5bc806bd1..ae239b581 100644
--- a/Makefile
+++ b/Makefile
@@ -139,10 +139,10 @@ NOAVX2_BUILD =
 OPENBLAS_NOAVX2_BUILD =
 
 ifeq ($(OS),Windows_NT)
-	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas.dll $(LDFLAGS)
-	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/OpenCL.lib lib/clblast.lib -shared -o koboldcpp_clblast.dll $(LDFLAGS)
-	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
-	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o -shared -o koboldcpp_noavx2.dll $(LDFLAGS)
+	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@ $(LDFLAGS)
+	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@ $(LDFLAGS)
+	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@ $(LDFLAGS)
+	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@ $(LDFLAGS)
 else
 	ifndef LLAMA_OPENBLAS
 	ifndef LLAMA_CLBLAST
@@ -166,101 +166,101 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: llamalib llamalib_noavx2 llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
-simple: llamalib llamalib_noavx2
-dev: llamalib_openblas
+default: koboldcpp.dll koboldcpp_noavx2.dll koboldcpp_openblas.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll
+simple: koboldcpp.dll koboldcpp_noavx2.dll
+dev: koboldcpp_openblas.dll
 
 #
 # Build library
 #
 
 ggml.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c ggml.c -o ggml.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
 
 ggml_openblas.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -c $< -o $@
 
 ggml_noavx2.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -c ggml.c -o ggml_noavx2.o
+	$(CC) $(CFLAGS) -c $< -o $@
 
 ggml_openblas_noavx2.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas_noavx2.o
+	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c $< -o $@
 
 ggml_clblast.o: ggml.c ggml.h
-	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c ggml.c -o ggml_clblast.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c $< -o $@
 
 ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
-	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c otherarch/ggml_v1.c -o ggml_v1.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
 
 ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
-	$(CC) $(CFLAGS) $(BONUSCFLAGS1) -c otherarch/ggml_v1.c -o ggml_v1_noavx2.o
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) -c $< -o $@
 
 llama.o: llama.cpp llama.h llama_util.h
-	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
-	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 expose.o: expose.cpp expose.h
-	$(CXX) $(CXXFLAGS) -c expose.cpp -o expose.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-llama_adapter.o:
-	$(CXX) $(CXXFLAGS) -c llama_adapter.cpp -o llama_adapter.o
+llama_adapter.o: llama_adapter.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-gpttype_adapter.o:
-	$(CXX) $(CXXFLAGS) -c gpttype_adapter.cpp -o gpttype_adapter.o
+gpttype_adapter.o: gpttype_adapter.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding benchmark-q4_0-matmult main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll gptj.exe gpt2.exe
 
 main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-llamalib: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
-	$(CXX) $(CXXFLAGS) ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o -shared -o koboldcpp.dll $(LDFLAGS)
+koboldcpp.dll: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
+	$(CXX) $(CXXFLAGS) $^ -shared -o $@ $(LDFLAGS)
 
-llamalib_openblas: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_openblas.dll: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_BUILD)
-
-llamalib_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+
+koboldcpp_noavx2.dll: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(NOAVX2_BUILD)
 
-llamalib_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_openblas_noavx2.dll: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_NOAVX2_BUILD)
 
-llamalib_clblast: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_clblast.dll: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(CLBLAST_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize_llama $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize_gptj: ggml.o llama.o
-	$(CXX) $(CXXFLAGS) otherarch/gptj_quantize.cpp ggml.o llama.o -o quantize_gptj $(LDFLAGS)
+quantize_gptj: ggml.o llama.o otherarch/gptj_quantize.cpp
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize_gpt2: ggml.o llama.o
-	$(CXX) $(CXXFLAGS) otherarch/gpt2_quantize.cpp ggml.o llama.o -o quantize_gpt2 $(LDFLAGS)
+quantize_gpt2: ggml.o llama.o otherarch/gpt2_quantize.cpp
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 #
 # Tests
 #
 
-benchmark: ggml.o
-	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
+benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
+	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
 	./benchmark-q4_0-matmult
 
 .PHONY: tests
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index c786fe208..050300931 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -16,9 +16,6 @@
 #include
 #include
 
-static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
-static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
-
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
     bool verbose = false;
@@ -224,7 +221,7 @@ int main(int argc, char ** argv) {
                 break;
             }
             int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                 // find match
             }
             if (j < GGML_TYPE_COUNT) {
@@ -279,7 +276,7 @@ int main(int argc, char ** argv) {
             continue;
         }
         if (params.verbose) {
-            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
         }
         if (kv_tensor.second->type == GGML_TYPE_F16) {
             is_f16 = true;
@@ -304,13 +301,14 @@ int main(int argc, char ** argv) {
 
     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const ggml_type type = (ggml_type) i;
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
         quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             if (params.verbose) {
-                printf("testing %s ...\n", type_strs[i]);
+                printf("testing %s ...\n", ggml_type_name(type));
             }
 
             error_stats global_stats {};
@@ -322,7 +320,7 @@ int main(int argc, char ** argv) {
                 if (params.verbose) {
                     printf("  %s ...\n", kv_tensor.first.c_str());
                 }
-                std::string layer_name { type_strs[i] };
+                std::string layer_name { ggml_type_name(type) };
                 layer_name += "::" + kv_tensor.first;
                 test_roundtrip_on_layer(
                         layer_name,
@@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
                 );
             }
 
-            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
         }
     }
 
diff --git a/ggml.c b/ggml.c
index 6f67be22f..c72084093 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2671,6 +2671,18 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
 };
 static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
+
+static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_F32]  = "f32",
+    [GGML_TYPE_F16]  = "f16",
+    [GGML_TYPE_Q4_0] = "q4_0",
+    [GGML_TYPE_Q4_1] = "q4_1",
+    [GGML_TYPE_I8]   = "i8",
+    [GGML_TYPE_I16]  = "i16",
+    [GGML_TYPE_I32]  = "i32",
+};
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated");
+
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
@@ -2712,9 +2724,12 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
 
     "FLASH_ATTN",
     "FLASH_FF",
+
+    "MAP_UNARY",
+    "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2757,9 +2772,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
     "flash_attn(x)",
     "flash_ff(x)",
+
+    "f(x)",
+    "f(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");
+static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -2889,6 +2907,11 @@ float ggml_type_sizef(enum ggml_type type) {
     return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
 }
 
+const char * ggml_type_name(enum ggml_type type) {
+    return GGML_TYPE_NAME[type];
+}
+
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return GGML_TYPE_SIZE[tensor->type];
 }
@@ -4907,6 +4930,90 @@ struct ggml_tensor * ggml_flash_ff(
     return result;
 }
 
+// ggml_map_unary
+
+struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_MAP_UNARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_binary
+
+struct ggml_tensor * ggml_map_binary_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun,
+        bool inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b));
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op = GGML_OP_MAP_BINARY;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_binary_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 void ggml_set_param(
@@ -8875,6 +8982,111 @@ static void ggml_compute_forward_flash_ff(
     }
 }
 
+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_unary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+    assert(src1->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        fun(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+
+static void ggml_compute_forward_map_binary(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 /////////////////////////////////
 
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
@@ -9024,6 +9236,18 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
             } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9283,6 +9507,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // not supported
             } break;
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+            {
+                GGML_ASSERT(false); // not supported
+            } break;
         case GGML_OP_NONE:
             {
                 // nop
@@ -9775,6 +10004,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         work_size = MAX(work_size, cur);
                     } break;
+                case GGML_OP_MAP_UNARY:
+                case GGML_OP_MAP_BINARY:
+                    {
+                        node->n_tasks = 1;
+                    } break;
                 case GGML_OP_NONE:
                     {
                         node->n_tasks = 1;
diff --git a/ggml.h b/ggml.h
index c06c09e06..617298a95 100644
--- a/ggml.h
+++ b/ggml.h
@@ -253,6 +253,9 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
 
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
+
     GGML_OP_COUNT,
 };
 
@@ -351,6 +354,8 @@ int ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
+const char * ggml_type_name(enum ggml_type type);
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
 struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -652,6 +657,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor * c0,
         struct ggml_tensor * c1);
 
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun);
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun);
+
 //
 // automatic differentiation
 //
diff --git a/llama.cpp b/llama.cpp
index 1eba38d4c..2c931480a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -269,16 +269,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return ret;
 }
 
-static const char * llama_format_type(enum ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32: return "f32";
-        case GGML_TYPE_F16: return "f16";
-        case GGML_TYPE_Q4_0: return "q4_0";
-        case GGML_TYPE_Q4_1: return "q4_1";
-        default: LLAMA_ASSERT(false);
-    }
-}
-
 static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
     size_t size = ggml_type_size(type);
     for (uint32_t dim : ne) {
@@ -1589,7 +1579,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         printf("[%zu/%zu] %36s - %s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
-               llama_format_type(tensor.type));
+               ggml_type_name(tensor.type));
 
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
@@ -1622,7 +1612,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                 }
             } else {
-                throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
+                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
             }
 
             printf("quantizing .. ");
diff --git a/requirements.txt b/requirements.txt
index f3944951a..6c32cbd04 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 numpy==1.24
-sentencepiece==0.1.97
+sentencepiece==0.1.98
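
Reviewer note, not part of the patch above: the new `ggml_map_unary_f32` / `ggml_map_binary_f32` entry points let a caller run an arbitrary element-wise f32 callback as a regular graph node. A minimal standalone sketch of how the API is meant to be used follows; the callbacks (`op_double`, `op_max`), tensor sizes, and arena size are made up for illustration and are not defined anywhere in this diff.

```c
#include <stdio.h>
#include "ggml.h"

// Custom element-wise callbacks matching the new typedefs in ggml.h.
static void op_double(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; i++) {
        dst[i] = 2.0f * src[i];                 // dst = 2 * src
    }
}

static void op_max(const int n, float * dst, const float * a, const float * b) {
    for (int i = 0; i < n; i++) {
        dst[i] = a[i] > b[i] ? a[i] : b[i];     // element-wise max
    }
}

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16 * 1024 * 1024,         // scratch arena for tensors and the graph
        .mem_buffer = NULL,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    for (int i = 0; i < 4; i++) {
        ggml_set_f32_1d(a, i, (float) i);       // a = {0, 1, 2, 3}
        ggml_set_f32_1d(b, i, 1.5f);            // b = {1.5, 1.5, 1.5, 1.5}
    }

    // y = max(2*a, b), built out of the two new map ops
    struct ggml_tensor * y = ggml_map_binary_f32(ctx, ggml_map_unary_f32(ctx, a, op_double), b, op_max);

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute(ctx, &gf);

    printf("result type: %s\n", ggml_type_name(y->type));    // "f32", via the new helper
    for (int i = 0; i < 4; i++) {
        printf("y[%d] = %.1f\n", i, ggml_get_f32_1d(y, i));  // expect 1.5, 2.0, 4.0, 6.0
    }

    ggml_free(ctx);
    return 0;
}
```

Per the patch, backward passes for these ops are intentionally unsupported (`ggml_compute_backward` asserts on them) and they always run single-threaded (`n_tasks = 1`), so they are meant for custom forward-only transforms rather than trainable graph pieces.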
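One design choice worth calling out for review: the callback does not travel through a dedicated field on `ggml_tensor`. Instead, `ggml_map_*_impl_f32` stores the raw function pointer in the data of a small `GGML_TYPE_I32` tensor hung off `opt[0]`, and `ggml_compute_forward` reads it back before dispatching. The following self-contained sketch of that pointer round trip is plain C with no ggml dependency; the names are mine, not from the patch.

```c
#include <assert.h>
#include <stdint.h>

// Same shape as the new typedef in ggml.h.
typedef void (*unary_op_f32_t)(const int, float *, const float *);

static void negate(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; i++) {
        dst[i] = -src[i];
    }
}

int main(void) {
    // Stand-in for the opt[0] tensor's data: enough int32 slots to hold one pointer,
    // mirroring ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)).
    // The explicit alignment covers strict-alignment targets; ggml gets this from GGML_MEM_ALIGN.
    _Alignas(void (*)(void)) int32_t storage[sizeof(void *) / sizeof(int32_t)];
    void * data = storage;

    // Store, as in ggml_map_unary_impl_f32 ...
    *((void (**)(void)) data) = (void (*)(void)) negate;

    // ... and load, as in ggml_compute_forward, case GGML_OP_MAP_UNARY.
    const unary_op_f32_t fun = *((unary_op_f32_t *) data);

    float src[2] = { 1.0f, -2.0f };
    float dst[2];
    fun(2, dst, src);
    assert(dst[0] == -1.0f && dst[1] == 2.0f);

    return 0;
}
```

The implicit assumption the patch makes, and this sketch inherits, is that a function pointer fits in `sizeof(void *)` bytes, which holds on the platforms ggml currently targets.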