Merge branch 'ggerganov:master' into master
commit af058cf820
7 changed files with 466 additions and 81 deletions
@@ -5,12 +5,15 @@
 [](https://github.com/ggerganov/llama.cpp/actions)
 [](https://opensource.org/licenses/MIT)
 
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- New roadmap: https://github.com/users/ggerganov/projects/7
+- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
 - p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
-- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
 
 <details>
 <summary>Table of Contents</summary>
build.zig (79 changes)
@@ -1,61 +1,58 @@
 const std = @import("std");
 
+// Zig Version: 0.11.0-dev.3379+629f0d23b
 pub fn build(b: *std.build.Builder) void {
     const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
+    const optimize = b.standardOptimizeOption(.{});
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.linkLibC();
     lib.linkLibCpp();
     lib.addIncludePath(".");
-    lib.addIncludePath("examples");
+    lib.addIncludePath("./examples");
     lib.addCSourceFiles(&.{
         "ggml.c",
     }, &.{"-std=c11"});
     lib.addCSourceFiles(&.{
         "llama.cpp",
     }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
-    }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
+    b.installArtifact(lib);
+
+    const examples = .{
+        "main",
+        "baby-llama",
+        "embedding",
+        // "metal",
+        "perplexity",
+        "quantize",
+        "quantize-stats",
+        "save-load-state",
+        // "server",
+        "simple",
+        "train-text-from-scratch",
+    };
+
+    inline for (examples) |example_name| {
+        const exe = b.addExecutable(.{
+            .name = example_name,
+            .target = target,
+            .optimize = optimize,
+        });
+        exe.addIncludePath(".");
+        exe.addIncludePath("./examples");
+        exe.addCSourceFiles(&.{
+            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
+            "examples/common.cpp",
+        }, &.{"-std=c++11"});
+        exe.linkLibrary(lib);
+        b.installArtifact(exe);
+
+        const run_cmd = b.addRunArtifact(exe);
+        run_cmd.step.dependOn(b.getInstallStep());
+        if (b.args) |args| run_cmd.addArgs(args);
+
+        const run_step = b.step("run_" ++ example_name, "Run the app");
+        run_step.dependOn(&run_cmd.step);
+    }
 }
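Note on the rewritten build script: with the example list and the inline loop, each example gets its own installed artifact and a dedicated run step named `run_<example>` (from `"run_" ++ example_name`). Assuming a Zig nightly matching the version noted at the top of the file, something like `zig build run_main -- <args>` should build and run the main example, with trailing arguments forwarded through `b.args`.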
@@ -374,10 +374,10 @@ struct llama_server_context {
             result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
         } else {
             // Temperature sampling
+            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
             llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
             llama_sample_typical(ctx, &candidates_p, typical_p, 1);
             llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
             llama_sample_temperature(ctx, &candidates_p, temp);
             result.tok = llama_sample_token(ctx, &candidates_p);
         }
@@ -2635,7 +2635,7 @@ void ggml_cuda_free_scratch() {
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+        || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
 
     switch (tensor->op) {
ggml.c (371 changes)
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _GNU_SOURCE
+#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
 
@@ -131,6 +131,34 @@ typedef void* thread_ret_t;
 #define GGML_MEM_ALIGN 16
 #endif
 
+//
+// logging
+//
+
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+//
+// end of logging block
+//
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -144,6 +172,17 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
     if (result != 0) {
         // Handle allocation failure
+        const char *error_desc = "unknown allocation error";
+        switch (result) {
+            case EINVAL:
+                error_desc = "invalid alignment value";
+                break;
+            case ENOMEM:
+                error_desc = "insufficient memory";
+                break;
+        }
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
+            __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
     return aligned_memory;
@@ -3530,30 +3569,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
     *s = 1.f/(*s);
 }
 
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
 //
 // data types
 //
@@ -3713,11 +3728,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "MAP_UNARY",
     "MAP_BINARY",
 
+    "MAP_CUSTOM1",
+    "MAP_CUSTOM2",
+    "MAP_CUSTOM3",
+
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3785,11 +3804,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x)",
     "f(x,y)",
 
+    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",
+
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
+static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -7094,9 +7117,14 @@ struct ggml_tensor * ggml_map_unary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_UNARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7136,9 +7164,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32(
         is_node = true;
     }
 
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
     struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
     *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_load(ctx);
 
     result->op = GGML_OP_MAP_BINARY;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
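In both ggml_map_unary_impl_f32 and ggml_map_binary_impl_f32 the result tensor is now created before ggml_scratch_save, while the small I32 tensor that stores the callback address is allocated between ggml_scratch_save and ggml_scratch_load. The apparent intent (not stated in the diff) is to keep the function-pointer holder out of the scratch buffer, so it is not overwritten when scratch memory is reused, while the output tensor is allocated exactly as before.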
@@ -7165,6 +7198,150 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
+// ggml_map_custom1
+
+struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_f32_t fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_f32_t fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_scratch_save(ctx);
+
+    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
+    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
+
+    ggml_scratch_load(ctx);
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+    result->opt[0] = addr_tensor;
+    result->opt[1] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_f32_t fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
+}
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
@@ -14621,6 +14798,114 @@ static void ggml_compute_forward_map_binary(
     }
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a);
+}
+
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b);
+}
+
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
+// ggml_compute_forward_map_custom3
+
+static void ggml_compute_forward_map_custom3_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    assert(params->ith == 0);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
+
+static void ggml_compute_forward_map_custom3(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        const struct ggml_tensor * c,
+        struct ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    switch (a->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_cross_entropy_loss
 
 static void ggml_compute_forward_cross_entropy_loss_f32(
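For orientation, here is a minimal sketch (not part of the diff) of a callback in the ggml_custom2_op_f32_t shape consumed by the dispatchers above: the destination tensor comes first and the callback fills dst->data itself, since the forward functions simply invoke fun(dst, a, b) on a single thread. The callback name and operation are hypothetical.

```c
#include "ggml.h"

// Hypothetical callback: element-wise maximum of two same-shape F32 tensors,
// written against the ggml_custom2_op_f32_t signature introduced in this change.
static void my_elementwise_max(struct ggml_tensor * dst,
                               const struct ggml_tensor * a,
                               const struct ggml_tensor * b) {
    const int64_t n = ggml_nelements(a);
    const float * xa = (const float *) a->data;
    const float * xb = (const float *) b->data;
    float       * y  = (float *)       dst->data;
    for (int64_t i = 0; i < n; ++i) {
        y[i] = xa[i] > xb[i] ? xa[i] : xb[i];
    }
}
```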
@@ -14911,7 +15196,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     if (skip_cpu) {
         return;
     }
-    GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
@@ -15158,6 +15443,24 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
             }
             break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data);
+                ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun);
+            }
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
             {
                 ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor);
@@ -15964,6 +16267,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_WIN_UNPART:
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM3:
             {
                 GGML_ASSERT(false); // not supported
             } break;
@@ -16605,6 +16911,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1:
+            case GGML_OP_MAP_CUSTOM2:
+            case GGML_OP_MAP_CUSTOM3:
                 {
                     node->n_tasks = 1;
                 } break;
ggml.h (58 changes)
@@ -345,6 +345,10 @@ extern "C" {
     GGML_OP_MAP_UNARY,
     GGML_OP_MAP_BINARY,
 
+    GGML_OP_MAP_CUSTOM1,
+    GGML_OP_MAP_CUSTOM2,
+    GGML_OP_MAP_CUSTOM3,
+
     GGML_OP_CROSS_ENTROPY_LOSS,
     GGML_OP_CROSS_ENTROPY_LOSS_BACK,
 
@@ -1167,21 +1171,73 @@ extern "C" {
             int h0,
             int w);
 
-    // Mapping operations
+    // custom operators
 
     typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
     typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             ggml_unary_op_f32_t fun);
 
+    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_unary_op_f32_t fun);
+
     GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             ggml_binary_op_f32_t fun);
 
+    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_binary_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_f32_t fun);
+
     // loss function
 
     GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
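As a usage sketch of the new public API (assumptions: an existing `ggml_context * ctx` and an F32 input tensor `x` created elsewhere; the callback name and body are hypothetical, not part of this commit):

```c
#include "ggml.h"
#include <math.h>

// Hypothetical unary callback: element-wise tanh. ggml_map_custom1_f32 creates
// dst as a duplicate of the input, so the callback only has to fill dst->data.
static void my_tanh(struct ggml_tensor * dst, const struct ggml_tensor * src) {
    const int64_t n = ggml_nelements(src);
    const float * xs = (const float *) src->data;
    float       * yd = (float *)       dst->data;
    for (int64_t i = 0; i < n; ++i) {
        yd[i] = tanhf(xs[i]);
    }
}

// During graph construction (ctx and x assumed to exist):
//     struct ggml_tensor * y = ggml_map_custom1_f32(ctx, x, my_tanh);
// y is then evaluated together with the rest of the graph by ggml_graph_compute.
```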
@@ -1,3 +1,4 @@
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #include "ggml.h"
 
 #include <math.h>
@@ -5,6 +6,10 @@
 #include <stdlib.h>
 #include <assert.h>
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 #define MAX_NARGS 3
 
 #undef MIN
@@ -197,8 +202,23 @@ bool check_gradient(
         float max_error_abs,
         float max_error_rel) {
 
+    static int n_threads = -1;
+    if (n_threads < 0) {
+        n_threads = GGML_DEFAULT_N_THREADS;
+
+        const char *env = getenv("GGML_N_THREADS");
+        if (env) {
+            n_threads = atoi(env);
+        }
+
+        printf("GGML_N_THREADS = %d\n", n_threads);
+    }
+
     struct ggml_cgraph gf = ggml_build_forward (f);
+    gf.n_threads = n_threads;
+
     struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+    gb.n_threads = n_threads;
+
     ggml_graph_compute(ctx0, &gf);
     ggml_graph_reset  (&gf);
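With this change the thread count used by check_gradient can presumably be overridden at runtime by setting the GGML_N_THREADS environment variable before running the test binary; when it is unset, GGML_DEFAULT_N_THREADS is used.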