diff --git a/.devops/tools.sh b/.devops/tools.sh index 860a7e891..efdd6663c 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -26,6 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then ./quantize "$i" "${i/f16/q4_0}" q4_0 fi done +elif [[ $arg1 == '--server' || $arg1 == '-s' ]]; then + ./server $arg2 else echo "Unknown command: $arg1" echo "Available commands: " @@ -37,4 +39,6 @@ else echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" + echo " --server (-s): Run a model on the server" + echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080" fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 54c091413..9f9c55a67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -273,7 +273,7 @@ if (LLAMA_CUBLAS) if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) if (LLAMA_CUDA_DMMV_F16) - set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics else() set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics endif() diff --git a/README.md b/README.md index 63457b6ae..476cc438b 100644 --- a/README.md +++ b/README.md @@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options. - Using `Zig`: ```bash - zig build -Drelease-fast + zig build -Doptimize=ReleaseFast ``` ### Metal Build diff --git a/examples/common.cpp b/examples/common.cpp index fad16887d..fd551c9cb 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -236,6 +236,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.mirostat_tau = std::stof(argv[i]); + } else if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_negative_prompt = argv[i]; + } else if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_scale = std::stof(argv[i]); + } else if (arg == "--cfg-smooth-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cfg_smooth_factor = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -469,6 +487,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + fprintf(stderr, " --cfg-negative-prompt PROMPT \n"); + fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n"); + fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); @@ -535,7 +557,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s return res; } -std::tuple llama_init_from_gpt_params(const gpt_params & params) { +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; @@ -551,6 +573,12 @@ std::tuple llama_init_from_gpt_par lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; + return lparams; +} + +std::tuple llama_init_from_gpt_params(const gpt_params & params) { + auto lparams = llama_context_params_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); diff --git a/examples/common.h b/examples/common.h index 96f2228f8..6315df961 100644 --- a/examples/common.h +++ b/examples/common.h @@ -48,6 +48,12 @@ struct gpt_params { float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate + // Classifier-Free Guidance + // https://arxiv.org/abs/2306.17806 + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // How strong is guidance + float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits + std::string model = "models/7B/ggml-model.bin"; // model path std::string model_alias = "unknown"; // model alias std::string prompt = ""; @@ -99,6 +105,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s // std::tuple llama_init_from_gpt_params(const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); // // Console utils diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 07d8fc6ac..2248c2458 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -109,10 +109,16 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; + llama_context * ctx_guidance = NULL; g_ctx = &ctx; // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (params.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; @@ -183,15 +189,28 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector embd_inp; - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { embd_inp = ::llama_tokenize(ctx, params.prompt, true); } else { embd_inp = session_tokens; } + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + params.cfg_negative_prompt.insert(0, 1, ' '); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true); + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + } + const int n_ctx = llama_n_ctx(ctx); if ((int) embd_inp.size() > n_ctx - 4) { @@ -258,6 +277,16 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) embd_inp.size(); i++) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); } + + if (ctx_guidance) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i])); + } + } + if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { @@ -334,11 +363,13 @@ int main(int argc, char ** argv) { int n_remain = params.n_predict; int n_consumed = 0; int n_session_consumed = 0; + int n_past_guidance = 0; // the first thing we will do is to output the prompt, so set color accordingly console_set_color(con_st, CONSOLE_COLOR_PROMPT); std::vector embd; + std::vector embd_guidance; // do one empty run to warm up the model { @@ -367,11 +398,12 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() > n_ctx) { + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { const int n_left = n_past - params.n_keep; // always keep the first token - BOS n_past = std::max(1, params.n_keep); + n_past_guidance = std::max(1, params.n_keep + guidance_offset); // insert n_left/2 tokens at the start of embd from last_n_tokens embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); @@ -412,6 +444,48 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always + + if (ctx_guidance) { + int input_size = 0; + llama_token* input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + //fprintf(stderr, "\n---------------------\n"); + //for (int i = 0; i < (int) embd_guidance.size(); i++) { + //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //} + //fprintf(stderr, "\n---------------------\n"); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -431,6 +505,7 @@ int main(int argc, char ** argv) { } embd.clear(); + embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token @@ -473,6 +548,10 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor); + } + // Apply penalties float nl_logit = logits[llama_token_nl()]; auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); @@ -668,6 +747,7 @@ int main(int argc, char ** argv) { } llama_print_timings(ctx); + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index b96fdcdc4..afbb4a777 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1354,17 +1354,9 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { } } - if (t->src0) { - expand(g, t->src0); - } - - if (t->src1) { - expand(g, t->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (t->opt[i]) { - expand(g, t->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (t->src[i]) { + expand(g, t->src[i]); } } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d05ffd723..436ba870f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -268,6 +268,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 +#define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_CPY_BLOCK_SIZE 32 #define CUDA_SCALE_BLOCK_SIZE 256 @@ -322,6 +323,20 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co dst[i] = x[i] * y[i%ky]; } +static const float GELU_COEF_A = 0.044715f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); +} + static __global__ void silu_f32(const float * x, float * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -331,16 +346,46 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) { dst[i] = x[i] / (1.0f + expf(-x[i])); } +static __global__ void norm_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + const float eps = 1e-5f; + + float mean = 0.0f; + float var = 0.0f; + + for (int col = tid; col < ncols; col += WARP_SIZE) { + const float xi = x[row*ncols + col]; + mean += xi; + var += xi * xi; + } + + // sum up partial sums +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + mean += __shfl_xor_sync(0xffffffff, mean, mask, 32); + var += __shfl_xor_sync(0xffffffff, var, mask, 32); + } + + mean /= ncols; + var = var / ncols - mean * mean; + const float inv_var = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += WARP_SIZE) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var; + } +} + static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) { const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; - const float eps = 1e-6; + const float eps = 1e-6f; float tmp = 0.0f; // partial sum for thread in warp - for (int i = 0; i < ncols; i += WARP_SIZE) { - const int col = i + tid; + for (int col = tid; col < ncols; col += WARP_SIZE) { const float xi = x[row*ncols + col]; tmp += xi * xi; } @@ -352,10 +397,9 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol } const float mean = tmp / ncols; - const float scale = 1.0f / sqrtf(mean + eps); + const float scale = rsqrtf(mean + eps); - for (int i = 0; i < ncols; i += WARP_SIZE) { - const int col = i + tid; + for (int col = tid; col < ncols; col += WARP_SIZE) { dst[row*ncols + col] = scale * x[row*ncols + col]; } } @@ -1285,7 +1329,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __ } static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; int vi; @@ -1306,11 +1350,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]); @@ -1331,11 +1375,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; int qs; @@ -1366,11 +1410,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]); @@ -1400,11 +1444,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics +#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; int vi; @@ -1419,7 +1463,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 600 +#endif // __CUDA_ARCH__ >= 610 } template @@ -1760,11 +1804,22 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in mul_f32<<>>(x, y, dst, kx, ky); } +static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_f32<<>>(x, dst, k); +} + static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; silu_f32<<>>(x, dst, k); } +static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + const dim3 block_dims(WARP_SIZE, 1, 1); + norm_f32<<>>(x, dst, ncols); +} + static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % WARP_SIZE == 0); const dim3 block_dims(WARP_SIZE, 1, 1); @@ -2293,16 +2348,21 @@ inline void ggml_cuda_op_add( GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); - const int64_t ne0 = src0->ne[0]; + // TODO: support broadcasting + GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1)); + + const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; + const int64_t ne10 = src1->ne[0]; + // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main); + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main); + add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); } else { GGML_ASSERT(false); } @@ -2321,10 +2381,9 @@ inline void ggml_cuda_op_mul( GGML_ASSERT(src0_ddf_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); const int64_t ne00 = src0->ne[0]; - const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -2333,7 +2392,7 @@ inline void ggml_cuda_op_mul( float * src0_ddf_i01 = src0_ddf_i + i01*ne00; float * src1_ddf_i01 = src1_ddf_i + i11*ne10; - float * dst_ddf_i01 = dst_ddf_i + i01*ne00; + float * dst_ddf_i01 = dst_ddf_i + i01*ne00; // compute mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main); @@ -2344,6 +2403,28 @@ inline void ggml_cuda_op_mul( (void) i02; } +inline void ggml_cuda_op_gelu( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + inline void ggml_cuda_op_silu( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -2366,6 +2447,28 @@ inline void ggml_cuda_op_silu( (void) i1; } +inline void ggml_cuda_op_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + inline void ggml_cuda_op_rms_norm( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -2412,7 +2515,7 @@ inline void ggml_cuda_op_mul_mat_vec( src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented; + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) { @@ -2981,11 +3084,21 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten } +void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true); +} + void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true); } +void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true); +} + void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); @@ -3216,7 +3329,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { } - cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; @@ -3256,36 +3369,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo } // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src0->op; + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { - ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace); + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); } } - if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace); + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace); } tensor->backend = GGML_BACKEND_GPU; struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); - const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) || + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || force_inplace; const size_t size = ggml_nbytes(tensor); CUDA_CHECK(cudaSetDevice(g_main_device)); - if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra; + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->opt[0]->data, sizeof(size_t)); + memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); } extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra; + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; extra->data_device[g_main_device] = src1_ddv; } else if (scratch) { @@ -3356,8 +3469,8 @@ void ggml_cuda_free_scratch() { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); switch (tensor->op) { case GGML_OP_ADD: @@ -3372,12 +3485,24 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_mul; break; + case GGML_OP_GELU: + if (!any_on_device) { + return false; + } + func = ggml_cuda_gelu; + break; case GGML_OP_SILU: if (!any_on_device) { return false; } func = ggml_cuda_silu; break; + case GGML_OP_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cuda_norm; + break; case GGML_OP_RMS_NORM: if (!any_on_device) { return false; @@ -3385,7 +3510,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { return false; } func = ggml_cuda_mul_mat; @@ -3439,6 +3564,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return true; } - func(tensor->src0, tensor->src1, tensor); + func(tensor->src[0], tensor->src[1], tensor); return true; } diff --git a/ggml-metal.m b/ggml-metal.m index 6473644c2..02dc9beb9 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -393,8 +393,8 @@ void ggml_metal_graph_compute( for (int i = node_start; i < node_end; ++i) { metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; struct ggml_tensor * dst = gf->nodes[i]; const int64_t ne00 = src0 ? src0->ne[0] : 0; @@ -739,7 +739,10 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { + if (src0t == GGML_TYPE_Q4_0) { + [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } + else if (src0t == GGML_TYPE_Q4_1) { [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } diff --git a/ggml-metal.metal b/ggml-metal.metal index e62fe6842..30d60fa58 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -365,6 +365,10 @@ kernel void kernel_rms_norm( } } +// putting them in the kernel cause a significant performance penalty +#define N_DST 4 // each SIMD group works on 4 rows +#define N_SIMDGROUP 2 // number of SIMD groups in a thread group +#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 kernel void kernel_mul_mat_q4_0_f32( device const void * src0, device const float * src1, @@ -372,64 +376,69 @@ kernel void kernel_mul_mat_q4_0_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK4_0; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_0 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; - const int nth = tptg.x*tptg.y; - const int ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first = 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_0 + first; - - float2 acc = {0.0f, 0.0f}; - - for (int j = 0; j < 4; ++j) { - - acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4); - acc[1] += yl[j] + yl[j+16]; + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); } - sumf += d * (acc[0] - 8.f*acc[1]); + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + float d = qb_curr.d; + float2 acc = {0.0f, 0.0f}; + for (int i = 0; i < 16; i++) { + acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + acc[1] += yl[i] + yl[i+16]; + } + sumf[row] += d * (acc[0] - 8.f*acc[1]); + qb_curr = qb_next; + } } - sum[ith] = sumf; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + } - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + float d = qb_curr.d; + float2 acc = {0.0f, 0.0f}; + for (int i = 0; i < 16; i++) { + acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + acc[1] += yl[i] + yl[i+16]; + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * (acc[0] - 8.f*acc[1]); + } + qb_curr = qb_next; + + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } } } diff --git a/ggml-mpi.c b/ggml-mpi.c index 872e808de..ae176d707 100644 --- a/ggml-mpi.c +++ b/ggml-mpi.c @@ -175,11 +175,11 @@ void ggml_mpi_graph_compute_pre( // attach the input data to all nodes that need it // TODO: not great - should be able to do this without modifying the compute graph (see next TODO below) for (int i = idx_l0; i < idx_l1; i++) { - if (gf->nodes[i]->src0 == gf->nodes[idx_l0]) { - gf->nodes[i]->src0 = inp0; + if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[0] = inp0; } - if (gf->nodes[i]->src1 == gf->nodes[idx_l0]) { - gf->nodes[i]->src1 = inp0; + if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) { + gf->nodes[i]->src[1] = inp0; } } diff --git a/ggml.c b/ggml.c index c10877a76..c137ae658 100644 --- a/ggml.c +++ b/ggml.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef GGML_USE_METAL #include @@ -49,23 +50,23 @@ typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; -static void atomic_store(atomic_int* ptr, LONG val) { +static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } -static LONG atomic_load(atomic_int* ptr) { +static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) @@ -77,7 +78,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void return 0; } -static int pthread_join(pthread_t thread, void* unused) { +static int pthread_join(pthread_t thread, void * unused) { (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } @@ -90,7 +91,7 @@ static int sched_yield (void) { #include #include -typedef void* thread_ret_t; +typedef void * thread_ret_t; #include #include @@ -3786,6 +3787,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CLAMP", "CONV_1D", "CONV_2D", + "POOL_1D", + "POOL_2D", "FLASH_ATTN", "FLASH_FF", @@ -3804,7 +3807,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3864,6 +3867,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "clamp(x)", "conv_1d(x)", "conv_2d(x)", + "pool_1d(x)", + "pool_2d(x)", "flash_attn(x)", "flash_ff(x)", @@ -3882,7 +3887,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); + +static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4161,10 +4168,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { @@ -4584,9 +4590,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.op =*/ GGML_OP_NONE, /*.is_param =*/ false, /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, + /*.src =*/ { NULL }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -4725,7 +4729,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -4777,7 +4781,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; case GGML_TYPE_F32: @@ -5012,8 +5016,8 @@ struct ggml_tensor * ggml_dup_impl( result->op = GGML_OP_DUP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5037,11 +5041,15 @@ struct ggml_tensor * ggml_add_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; - if (a->grad || b->grad) { + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } @@ -5049,8 +5057,8 @@ struct ggml_tensor * ggml_add_impl( result->op = GGML_OP_ADD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5089,8 +5097,8 @@ struct ggml_tensor * ggml_add1_impl( result->op = GGML_OP_ADD1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5147,9 +5155,9 @@ struct ggml_tensor * ggml_acc_impl( result->op = GGML_OP_ACC; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -5195,8 +5203,8 @@ struct ggml_tensor * ggml_sub_impl( result->op = GGML_OP_SUB; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5242,8 +5250,8 @@ struct ggml_tensor * ggml_mul_impl( result->op = GGML_OP_MUL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5285,8 +5293,8 @@ struct ggml_tensor * ggml_div_impl( result->op = GGML_OP_DIV; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5321,8 +5329,8 @@ struct ggml_tensor * ggml_sqr_impl( result->op = GGML_OP_SQR; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5355,8 +5363,8 @@ struct ggml_tensor * ggml_sqrt_impl( result->op = GGML_OP_SQRT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5390,8 +5398,8 @@ struct ggml_tensor * ggml_log_impl( result->op = GGML_OP_LOG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5423,8 +5431,8 @@ struct ggml_tensor * ggml_sum( result->op = GGML_OP_SUM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5450,8 +5458,8 @@ struct ggml_tensor * ggml_sum_rows( result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5473,8 +5481,8 @@ struct ggml_tensor * ggml_mean( result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5497,8 +5505,8 @@ struct ggml_tensor * ggml_argmax( result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5525,8 +5533,8 @@ struct ggml_tensor * ggml_repeat( result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5553,8 +5561,8 @@ struct ggml_tensor * ggml_repeat_back( result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5575,8 +5583,8 @@ struct ggml_tensor * ggml_abs_impl( result->op = GGML_OP_ABS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5610,8 +5618,8 @@ struct ggml_tensor * ggml_sgn_impl( result->op = GGML_OP_SGN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5644,8 +5652,8 @@ struct ggml_tensor * ggml_neg_impl( result->op = GGML_OP_NEG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5678,8 +5686,8 @@ struct ggml_tensor * ggml_step_impl( result->op = GGML_OP_STEP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5712,8 +5720,8 @@ struct ggml_tensor * ggml_tanh_impl( result->op = GGML_OP_TANH; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5746,8 +5754,8 @@ struct ggml_tensor * ggml_elu_impl( result->op = GGML_OP_ELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5780,8 +5788,8 @@ struct ggml_tensor * ggml_relu_impl( result->op = GGML_OP_RELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5814,8 +5822,8 @@ struct ggml_tensor * ggml_gelu_impl( result->op = GGML_OP_GELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5848,8 +5856,8 @@ struct ggml_tensor * ggml_gelu_quick_impl( result->op = GGML_OP_GELU_QUICK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5882,8 +5890,8 @@ struct ggml_tensor * ggml_silu_impl( result->op = GGML_OP_SILU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -5917,8 +5925,8 @@ struct ggml_tensor * ggml_silu_back( result->op = GGML_OP_SILU_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -5940,8 +5948,8 @@ struct ggml_tensor * ggml_norm_impl( result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5972,8 +5980,8 @@ struct ggml_tensor * ggml_rms_norm_impl( result->op = GGML_OP_RMS_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -6005,8 +6013,8 @@ struct ggml_tensor * ggml_rms_norm_back( result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6027,13 +6035,13 @@ struct ggml_tensor * ggml_mul_mat( is_node = true; } - const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6058,8 +6066,8 @@ struct ggml_tensor * ggml_out_prod( result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6084,8 +6092,8 @@ struct ggml_tensor * ggml_scale_impl( result->op = GGML_OP_SCALE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6140,9 +6148,9 @@ struct ggml_tensor * ggml_set_impl( result->op = GGML_OP_SET; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6229,8 +6237,8 @@ struct ggml_tensor * ggml_cpy_impl( result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6266,8 +6274,8 @@ struct ggml_tensor * ggml_cont_impl( result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6310,8 +6318,8 @@ struct ggml_tensor * ggml_reshape( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6335,8 +6343,8 @@ struct ggml_tensor * ggml_reshape_1d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6361,8 +6369,8 @@ struct ggml_tensor * ggml_reshape_2d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6388,8 +6396,8 @@ struct ggml_tensor * ggml_reshape_3d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6417,8 +6425,8 @@ struct ggml_tensor * ggml_reshape_4d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6450,9 +6458,9 @@ struct ggml_tensor * ggml_view_1d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6492,9 +6500,9 @@ struct ggml_tensor * ggml_view_2d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6536,9 +6544,9 @@ struct ggml_tensor * ggml_view_3d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6582,9 +6590,9 @@ struct ggml_tensor * ggml_view_4d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = offs; return result; } @@ -6644,8 +6652,8 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; if (is_node) { ggml_scratch_save(ctx); @@ -6659,7 +6667,7 @@ struct ggml_tensor * ggml_permute( ggml_scratch_load(ctx); - result->opt[0] = b; + result->src[2] = b; } return result; @@ -6687,8 +6695,8 @@ struct ggml_tensor * ggml_transpose( result->op = GGML_OP_TRANSPOSE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6713,8 +6721,8 @@ struct ggml_tensor * ggml_get_rows( result->op = GGML_OP_GET_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6741,9 +6749,9 @@ struct ggml_tensor * ggml_get_rows_back( result->op = GGML_OP_GET_ROWS_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -6765,8 +6773,8 @@ struct ggml_tensor * ggml_diag( result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6798,8 +6806,8 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6846,8 +6854,8 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6882,8 +6890,8 @@ struct ggml_tensor * ggml_soft_max_impl( result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } @@ -6918,8 +6926,8 @@ struct ggml_tensor * ggml_soft_max_back_impl( result->op = GGML_OP_SOFT_MAX_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -6970,8 +6978,8 @@ struct ggml_tensor * ggml_rope_impl( result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7028,8 +7036,8 @@ struct ggml_tensor * ggml_rope_back( result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7067,8 +7075,8 @@ struct ggml_tensor * ggml_alibi( result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7101,8 +7109,8 @@ struct ggml_tensor * ggml_clamp( result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7144,9 +7152,9 @@ GGML_API struct ggml_tensor * ggml_conv_1d( result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -7164,7 +7172,6 @@ struct ggml_tensor* ggml_conv_2d( int d0, int d1) { - GGML_ASSERT(b->ne[3] == 1); GGML_ASSERT(a->ne[2] == b->ne[2]); bool is_node = false; @@ -7176,7 +7183,7 @@ struct ggml_tensor* ggml_conv_2d( const int64_t ne[4] = { ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], 1, + a->ne[3], b->ne[3], }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); @@ -7192,9 +7199,9 @@ struct ggml_tensor* ggml_conv_2d( result->op = GGML_OP_CONV_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; @@ -7211,6 +7218,98 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } + +// ggml_pool_* + +static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { + return (ins + 2 * p - ks) / s + 1; +} + +// ggml_pool_2d + +struct ggml_tensor* ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int s0, + int p0) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + ((int32_t*)c->data)[0] = op; + ((int32_t*)c->data)[1] = k0; + ((int32_t*)c->data)[2] = s0; + ((int32_t*)c->data)[3] = p0; + ggml_scratch_load(ctx); + + result->op = GGML_OP_POOL_1D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = c; + + return result; +} + +// ggml_pool_2d + +struct ggml_tensor* ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1) { + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + ggml_scratch_save(ctx); + struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); + ((int32_t*)c->data)[0] = op; + ((int32_t*)c->data)[1] = k0; + ((int32_t*)c->data)[2] = k1; + ((int32_t*)c->data)[3] = s0; + ((int32_t*)c->data)[4] = s1; + ((int32_t*)c->data)[5] = p0; + ((int32_t*)c->data)[6] = p1; + ggml_scratch_load(ctx); + + result->op = GGML_OP_POOL_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = c; + + return result; +} + // ggml_flash_attn struct ggml_tensor * ggml_flash_attn( @@ -7233,10 +7332,10 @@ struct ggml_tensor * ggml_flash_attn( result->op = GGML_OP_FLASH_ATTN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7264,11 +7363,11 @@ struct ggml_tensor * ggml_flash_ff( result->op = GGML_OP_FLASH_FF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b0; - result->opt[0] = b1; - result->opt[1] = c0; - result->opt[2] = c1; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; return result; } @@ -7328,11 +7427,11 @@ struct ggml_tensor * ggml_flash_attn_back( result->op = GGML_OP_FLASH_ATTN_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = d; - result->opt[2] = ggml_new_i32(ctx, masked ? 1 : 0); + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + result->src[4] = ggml_new_i32(ctx, masked ? 1 : 0); return result; } @@ -7377,9 +7476,9 @@ struct ggml_tensor * ggml_win_part( result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7414,9 +7513,9 @@ struct ggml_tensor * ggml_win_unpart( result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->src[0] = a; + result->src[1] = NULL; + result->src[2] = b; return result; } @@ -7445,8 +7544,8 @@ struct ggml_tensor * ggml_map_unary_impl_f32( result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7492,9 +7591,9 @@ struct ggml_tensor * ggml_map_binary_impl_f32( result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7539,8 +7638,8 @@ struct ggml_tensor * ggml_map_custom1_impl_f32( result->op = GGML_OP_MAP_CUSTOM1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[2] = addr_tensor; return result; } @@ -7584,9 +7683,9 @@ struct ggml_tensor * ggml_map_custom2_impl_f32( result->op = GGML_OP_MAP_CUSTOM2; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; return result; } @@ -7633,10 +7732,10 @@ struct ggml_tensor * ggml_map_custom3_impl_f32( result->op = GGML_OP_MAP_CUSTOM3; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; - result->opt[1] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = addr_tensor; + result->src[3] = c; return result; } @@ -7676,8 +7775,8 @@ struct ggml_tensor * ggml_cross_entropy_loss( result->op = GGML_OP_CROSS_ENTROPY_LOSS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->src[0] = a; + result->src[1] = b; return result; } @@ -7696,9 +7795,9 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->grad = NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } @@ -8299,7 +8398,7 @@ static void ggml_compute_forward_add_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -8324,23 +8423,23 @@ static void ggml_compute_forward_add_f32( if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); #ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - ggml_vec_add_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } @@ -8348,15 +8447,20 @@ static void ggml_compute_forward_add_f32( } else { // src1 is not contiguous for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -10535,7 +10639,6 @@ static void ggml_compute_forward_rms_norm_back( } } - // ggml_compute_forward_mul_mat #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) @@ -10579,17 +10682,17 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - const enum ggml_type type = src0->type; ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); GGML_ASSERT(nb10 == sizeof(float)); @@ -10600,16 +10703,16 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - // nb01 >= nb00 - src0 is not transposed // compute by src0 rows #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } @@ -10619,6 +10722,11 @@ static void ggml_compute_forward_mul_mat( #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + if (params->ith != 0) { return; } @@ -10688,41 +10796,44 @@ static void ggml_compute_forward_mul_mat( return; } - // parallelize by src0 rows using ggml_vec_dot_q + // parallelize by src0 rows + const int64_t dr = (ne01 + nth - 1)/nth; - // total rows in src0 - const int nr = ne01*ne02*ne03; + const int64_t ir10 = dr*ith; + const int64_t ir11 = MIN(ir10 + dr, ne01); - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // src1 rows + const int64_t nr1 = ne11*ne12*ne13; void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); - const int i13 = i03; - const int i12 = i02; + const int64_t ir0 = (ir1/ne11)%(ne02*ne03); + const int64_t i03 = (ir0/(ne02)); + // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2. + // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470: + // GG: this is likely the correct way to broadcast, though need some more thought + // therefore leaving the comments to remind us for now + const int64_t i02 = (i12 / (ne12 / ne02)); + // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon) + // const int64_t i02 = (ir0 - i03*ne02); - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size)); + const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); + const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size; - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); - for (int64_t ic = 0; ic < ne11; ++ic) { - vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); + for (int64_t ir = ir10; ir < ir11; ++ir) { + vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col); } } @@ -11719,7 +11830,7 @@ static void ggml_compute_forward_alibi_f32( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -11730,8 +11841,9 @@ static void ggml_compute_forward_alibi_f32( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(float)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(ne1 + n_past == ne0); + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -11755,7 +11867,7 @@ static void ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = (i-ne0+1) * m_k + src[0]; + pdst[0] = i * m_k + src[0]; } } @@ -11784,7 +11896,7 @@ static void ggml_compute_forward_alibi_f16( const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz const int n = ggml_nrows(src0); @@ -11795,8 +11907,9 @@ static void ggml_compute_forward_alibi_f16( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(ggml_fp16_t)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -11821,7 +11934,7 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); } } } @@ -12905,16 +13018,18 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i12*nb12); - ggml_fp16_t * dst_data = wdata; + for (int i13 = 0; i13 < ne13; i13++) { + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); + ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + } } } } @@ -12941,14 +13056,16 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2); + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ip0; i2 < ip1; i2++) { + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0); + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, + (ggml_fp16_t *) ((char *) src0->data + i2*nb03), + (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); + } } } } @@ -12997,10 +13114,169 @@ static void ggml_compute_forward_conv_2d( if (s0 == src0->ne[0] && s1 == src0->ne[1]) { ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst); - } - else { + } else { GGML_ASSERT(false); // only stride equal to kernel size is supported - }; + } +} + +// ggml_compute_forward_pool_1d_sk_p0 + +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; + + const int64_t rs = dst->ne[0]; + + while (cdata < data_end) { + const float * const srow = (const float *)cdata; + + int j = 0; + + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + + cdata += src->nb[1]; + drow += rs; + } +} + +// ggml_compute_forward_pool_1d + +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* opt0, + struct ggml_tensor* dst) { + GGML_ASSERT(opt0->ne[0] == 4); + const int* opts = (const int*)opt0->data; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_compute_forward_pool_2d_sk_p0 + +static void ggml_compute_forward_pool_2d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + + const int ix = ox * k0; + const int iy = oy * k1; + + for (int ky = 0; ky < k1; ++ky) { + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + switch (op) { + case GGML_OP_POOL_AVG: *out += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + } + } + + cdata += src->nb[2]; + dplane += pa; + } +} + +// ggml_compute_forward_pool_2d + +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(opt0->ne[0] == 7); + const int* opts = (const int*)opt0->data; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + GGML_ASSERT(p0 == 0); + GGML_ASSERT(p1 == 0); // padding not supported + GGML_ASSERT(k0 == s0); + GGML_ASSERT(k1 == s1); // only s = k supported + + ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); } @@ -14567,287 +14843,295 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS switch (tensor->op) { case GGML_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src0, tensor); + ggml_compute_forward_dup(params, tensor->src[0], tensor); } break; case GGML_OP_ADD: { - ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIV: { - ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src0, tensor); + ggml_compute_forward_sqr(params, tensor->src[0], tensor); } break; case GGML_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src0, tensor); + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); } break; case GGML_OP_LOG: { - ggml_compute_forward_log(params, tensor->src0, tensor); + ggml_compute_forward_log(params, tensor->src[0], tensor); } break; case GGML_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src0, tensor); + ggml_compute_forward_sum(params, tensor->src[0], tensor); } break; case GGML_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); } break; case GGML_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src0, tensor); + ggml_compute_forward_mean(params, tensor->src[0], tensor); } break; case GGML_OP_ARGMAX: { - ggml_compute_forward_argmax(params, tensor->src0, tensor); + ggml_compute_forward_argmax(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src0, tensor); + ggml_compute_forward_repeat(params, tensor->src[0], tensor); } break; case GGML_OP_REPEAT_BACK: { - ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); } break; case GGML_OP_ABS: { - ggml_compute_forward_abs(params, tensor->src0, tensor); + ggml_compute_forward_abs(params, tensor->src[0], tensor); } break; case GGML_OP_SGN: { - ggml_compute_forward_sgn(params, tensor->src0, tensor); + ggml_compute_forward_sgn(params, tensor->src[0], tensor); } break; case GGML_OP_NEG: { - ggml_compute_forward_neg(params, tensor->src0, tensor); + ggml_compute_forward_neg(params, tensor->src[0], tensor); } break; case GGML_OP_STEP: { - ggml_compute_forward_step(params, tensor->src0, tensor); + ggml_compute_forward_step(params, tensor->src[0], tensor); } break; case GGML_OP_TANH: { - ggml_compute_forward_tanh(params, tensor->src0, tensor); + ggml_compute_forward_tanh(params, tensor->src[0], tensor); } break; case GGML_OP_ELU: { - ggml_compute_forward_elu(params, tensor->src0, tensor); + ggml_compute_forward_elu(params, tensor->src[0], tensor); } break; case GGML_OP_RELU: { - ggml_compute_forward_relu(params, tensor->src0, tensor); + ggml_compute_forward_relu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU: { - ggml_compute_forward_gelu(params, tensor->src0, tensor); + ggml_compute_forward_gelu(params, tensor->src[0], tensor); } break; case GGML_OP_GELU_QUICK: { - ggml_compute_forward_gelu_quick(params, tensor->src0, tensor); + ggml_compute_forward_gelu_quick(params, tensor->src[0], tensor); } break; case GGML_OP_SILU: { - ggml_compute_forward_silu(params, tensor->src0, tensor); + ggml_compute_forward_silu(params, tensor->src[0], tensor); } break; case GGML_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src0, tensor); + ggml_compute_forward_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src0, tensor); + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); } break; case GGML_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_OUT_PROD: { - ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src0, tensor); + ggml_compute_forward_cpy(params, tensor->src[0], tensor); } break; case GGML_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src0, tensor); + ggml_compute_forward_cont(params, tensor->src[0], tensor); } break; case GGML_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src0, tensor); + ggml_compute_forward_reshape(params, tensor->src[0], tensor); } break; case GGML_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src0); + ggml_compute_forward_view(params, tensor->src[0]); } break; case GGML_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src0); + ggml_compute_forward_permute(params, tensor->src[0]); } break; case GGML_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src0); + ggml_compute_forward_transpose(params, tensor->src[0]); } break; case GGML_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src0, tensor); + ggml_compute_forward_diag(params, tensor->src[0], tensor); } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src0, tensor); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); } break; case GGML_OP_SOFT_MAX_BACK: { - ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CONV_1D: { - ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; + case GGML_OP_POOL_1D: + { + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_POOL_2D: + { + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_FLASH_ATTN: { - const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + const int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); const bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); } break; case GGML_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); } break; case GGML_OP_FLASH_ATTN_BACK: { - int32_t t = ggml_get_i32_1d(tensor->opt[2], 0); + int32_t t = ggml_get_i32_1d(tensor->src[4], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor); } break; case GGML_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun); + const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_CUSTOM2: { - const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM3: { - const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun); + const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun); } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; case GGML_OP_NONE: @@ -14864,8 +15148,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { - struct ggml_tensor * src0 = tensor->src0; - struct ggml_tensor * src1 = tensor->src1; + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; switch (tensor->op) { case GGML_OP_DUP: @@ -14901,12 +15185,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, @@ -15214,12 +15498,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); + GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); + const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; + const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; + const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; + const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; struct ggml_tensor * tensor_grad_view = NULL; @@ -15296,8 +15580,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { size_t offset; - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->opt[0])); - memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); + GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2])); + memcpy(&offset, tensor->src[2]->data, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -15324,7 +15608,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->opt[0]->data; + int32_t * axes = (int32_t *) tensor->src[2]->data; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -15484,18 +15768,26 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_POOL_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_POOL_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_FLASH_ATTN: { struct ggml_tensor * flash_grad = NULL; - if (src0->grad || src1->grad || tensor->opt[0]->grad) { - int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_i32_1d(tensor->src[3], 0); GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; flash_grad = ggml_flash_attn_back(ctx, src0, src1, - tensor->opt[0], + tensor->src[2], tensor->grad, masked); } @@ -15592,7 +15884,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } - struct ggml_tensor * opt0 = tensor->opt[0]; + struct ggml_tensor * opt0 = tensor->src[2]; if (opt0->grad) { struct ggml_tensor * grad_v = NULL; @@ -15708,17 +16000,9 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } - if (node->src0) { - ggml_visit_parents(cgraph, node->src0); - } - - if (node->src1) { - ggml_visit_parents(cgraph, node->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (node->opt[i]) { - ggml_visit_parents(cgraph, node->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (node->src[i]) { + ggml_visit_parents(cgraph, node->src[i]); } } @@ -15954,6 +16238,9 @@ struct ggml_compute_state_shared { // synchronization primitives atomic_int n_active; // num active threads atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; }; struct ggml_compute_state { @@ -15985,6 +16272,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { int node_n = -1; while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -16038,6 +16329,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } else { break; } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } } atomic_store(&state->shared->n_active, n_threads); @@ -16071,7 +16366,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - return 0; + return GGML_EXIT_SUCCESS; } struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { @@ -16110,8 +16405,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16122,8 +16417,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + if (ggml_is_quantized(node->src[0]->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16166,39 +16461,39 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { n_tasks = n_threads; // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); + //const int nr0 = ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); //n_tasks = MIN(n_threads, MAX(1, nr0/128)); //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { + if (node->src[0]->type != GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); } } else #endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + if (node->src[1]->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; } else { cur = 0; } @@ -16242,24 +16537,24 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); size_t cur = 0; - const int nk = node->src0->ne[0]; + const int nk = node->src[0]->ne[0]; - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] ); } else { GGML_ASSERT(false); @@ -16271,16 +16566,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - GGML_ASSERT(node->src1->ne[3] == 1); + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N - - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C const int64_t nk = ne00*ne01; @@ -16290,11 +16583,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); } else { GGML_ASSERT(false); @@ -16302,20 +16595,25 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; case GGML_OP_FLASH_ATTN: { n_tasks = n_threads; size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { + if (node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { + if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } @@ -16328,14 +16626,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16346,15 +16644,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { + if (node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { + if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } @@ -16375,7 +16673,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); work_size = MAX(work_size, cur); } break; @@ -16383,7 +16681,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { { n_tasks = n_threads; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; work_size = MAX(work_size, cur); } break; @@ -16411,7 +16709,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { return cplan; } -void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { +int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); @@ -16437,6 +16735,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); @@ -16460,12 +16760,12 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); // don't leave affinity set on the main thread clear_numa_thread_affinity(); - // join thread pool + // join or kill thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { const int rc = ggml_thread_join(workers[j].thrd, NULL); @@ -16489,6 +16789,8 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) (double) perf_time_us_cur / 1000.0, (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); } + + return compute_status; } void ggml_graph_reset(struct ggml_cgraph * cgraph) { @@ -16593,8 +16895,8 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { ggml_graph_export_leaf(cgraph->leafs[i], fout); GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); } // header @@ -16605,17 +16907,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { for (int i = 0; i < cgraph->n_nodes; ++i) { ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } - - if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); } } @@ -16706,16 +17000,13 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // output the op arguments { - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; } - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { if (args[j]) { int32_t idx = -1; @@ -16933,12 +17224,12 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** const char * ptr_name = ptr; ptr += GGML_MAX_NAME; - const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t); + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; // parse args - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { const int32_t arg_idx = ptr_arg_idx[j]; if (arg_idx == -1) { @@ -16995,11 +17286,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor->nb[j] = nb[j]; } - tensor->src0 = args[0]; - tensor->src1 = args[1]; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - tensor->opt[j] = args[2 + j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; } result.nodes[i] = tensor; @@ -17198,19 +17486,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - if (node->src0) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); } } } @@ -17218,19 +17498,11 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_leafs; i++) { struct ggml_tensor * node = gb->leafs[i]; - if (node->src0) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); - } - - if (node->src1) { - ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); - } - - for (int j = 0; j < GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); } } } diff --git a/ggml.h b/ggml.h index ab84bef68..b88c35bae 100644 --- a/ggml.h +++ b/ggml.h @@ -132,10 +132,10 @@ // { // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); // -// // a[1, 2] = 1.0f; +// // a[2, 1] = 1.0f; // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; // -// // a[2, 0] = 2.0f; +// // a[0, 2] = 2.0f; // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; // // ... @@ -197,12 +197,17 @@ #define GGML_MAX_NODES 4096 #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 +#define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 #define GGML_DEFAULT_N_THREADS 4 + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + #define GGML_UNUSED(x) (void)(x) + #define GGML_ASSERT(x) \ do { \ if (!(x)) { \ @@ -363,6 +368,8 @@ extern "C" { GGML_OP_CLAMP, GGML_OP_CONV_1D, GGML_OP_CONV_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, @@ -414,9 +421,7 @@ extern "C" { bool is_param; struct ggml_tensor * grad; - struct ggml_tensor * src0; - struct ggml_tensor * src1; - struct ggml_tensor * opt[GGML_MAX_OPT]; + struct ggml_tensor * src[GGML_MAX_SRC]; // performance int perf_runs; @@ -444,6 +449,10 @@ extern "C" { // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; }; // computation graph @@ -1166,6 +1175,31 @@ extern "C" { int s, int d); + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor* ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + GGML_API struct ggml_tensor* ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); + GGML_API struct ggml_tensor * ggml_flash_attn( struct ggml_context * ctx, struct ggml_tensor * q, @@ -1305,7 +1339,7 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // same as ggml_graph_compute() but the work data is allocated as a part of the context diff --git a/llama.cpp b/llama.cpp index 08ec21ab6..2d09d6ce7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2167,6 +2167,62 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l } } +static void llama_log_softmax(float * array, size_t size) { + float max_l = *std::max_element(array, array + size); + float sum = 0.f; + for (size_t i = 0; i < size; ++i) { + float p = expf(array[i] - max_l); + sum += p; + array[i] = p; + } + + for (size_t i = 0; i < size; ++i) { + array[i] = logf(array[i] / sum); + } +} + +void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor) { + int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); + + assert(ctx); + auto n_vocab = llama_n_vocab(ctx); + assert(n_vocab == (int)candidates->size); + assert(!candidates->sorted); + + std::vector logits_base; + logits_base.reserve(candidates->size); + for (size_t i = 0; i < candidates->size; ++i) { + logits_base.push_back(candidates->data[i].logit); + } + llama_log_softmax(logits_base.data(), candidates->size); + + float* logits_guidance = llama_get_logits(guidance_ctx); + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_guidance = logits_guidance[i]; + float logit_base = logits_base[i]; + logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance; + } + + llama_log_softmax(logits_guidance, n_vocab); + + for (int i = 0; i < n_vocab; ++i) { + float logit_base = logits_base[i]; + float logit_guidance = logits_guidance[i]; + + candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base; + } + + if (ctx) { + ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + } +} llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { assert(ctx); diff --git a/llama.h b/llama.h index 686463aa2..4596b1ed4 100644 --- a/llama.h +++ b/llama.h @@ -309,6 +309,18 @@ extern "C" { /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 + /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. + /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. + /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. + /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. + LLAMA_API void llama_sample_classifier_free_guidance( + struct llama_context * ctx, + llama_token_data_array * candidates, + struct llama_context * guidance_ctx, + float scale, + float smooth_factor); + /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index da4001ce5..01467bc18 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,7 +10,9 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif #define MAX_NARGS 3 diff --git a/tests/test-opt.c b/tests/test-opt.c index e928a7df7..5531814c4 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -7,7 +7,9 @@ #define MAX_NARGS 2 +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif // // logging