From 9ef07800622e4c371605f9419864d15667c3558f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 30 Jun 2024 20:27:13 +0200 Subject: [PATCH 1/7] Fix new line issue with chat template, disable template when in-prefix/suffix is set (#8203) * preserve new line llama_chat_format_single * disable chat template if in-prefix/suffix is set * remove redundant change --- common/common.cpp | 16 +++++++++++++--- common/common.h | 1 + examples/main/main.cpp | 11 +++++++---- tests/test-chat-template.cpp | 4 ++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6a00d25be..5a0d0ee03 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1014,16 +1014,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; + params.enable_chat_template = false; return true; } if (arg == "--in-prefix") { CHECK_ARG params.input_prefix = argv[i]; + params.enable_chat_template = false; return true; } if (arg == "--in-suffix") { CHECK_ARG params.input_suffix = argv[i]; + params.enable_chat_template = false; return true; } if (arg == "--spm-infill") { @@ -1406,7 +1409,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "halt generation at PROMPT, return control in interactive mode\n" "can be specified more than once for multiple prompts" }); options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" }); options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); @@ -2668,12 +2671,19 @@ std::string llama_chat_format_single(const struct llama_model * model, const std::vector & past_msg, const llama_chat_msg & new_msg, bool add_ass) { + std::ostringstream ss; auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); + // if the past_msg ends with a newline, we must preserve it in the formatted version + if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { + ss << "\n"; + }; + // format chat with new_msg chat_new.push_back(new_msg); auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); - auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); - return formatted; + // get the diff part + ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return ss.str(); } std::string llama_chat_format_example(const struct llama_model * model, diff --git a/common/common.h b/common/common.h index d6cb814b9..627b7ed85 100644 --- a/common/common.h +++ b/common/common.h @@ -200,6 +200,7 @@ struct gpt_params { std::string public_path = ""; std::string chat_template = ""; std::string system_prompt = ""; + bool enable_chat_template = true; std::vector api_keys; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1114073b8..d512953b9 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -261,7 +261,7 @@ int main(int argc, char ** argv) { std::vector embd_inp; { - auto prompt = params.conversation + auto prompt = (params.conversation && params.enable_chat_template) ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { @@ -810,7 +810,9 @@ int main(int argc, char ** argv) { is_antiprompt = true; } - chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + if (params.enable_chat_template) { + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); + } is_interacting = true; printf("\n"); } @@ -872,12 +874,13 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } - std::string user_inp = params.conversation + bool format_chat = params.conversation && params.enable_chat_template; + std::string user_inp = format_chat ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index b154038b2..03f536910 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -142,9 +142,9 @@ int main(void) { std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n"; return output; }; - assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); + assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); assert(fmt_single("llama2") == "[INST] How are you [/INST]"); - assert(fmt_single("gemma") == "user\nHow are you\nmodel\n"); + assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); return 0; From d0a7145ba99ed3a8bc3145aa785b5c86ffe65020 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jul 2024 02:09:34 +0300 Subject: [PATCH 2/7] flake.lock: Update (#8218) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 79bb3f63f..973ff4e56 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1718895438, - "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=", + "lastModified": 1719506693, + "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3", + "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a", "type": "github" }, "original": { From 197fe6c1d7bec6718ce901f0141b2725240f298c Mon Sep 17 00:00:00 2001 From: zhentaoyu Date: Mon, 1 Jul 2024 19:39:06 +0800 Subject: [PATCH 3/7] [SYCL] Update SYCL-Rope op and Refactor (#8157) * align with rope.cu and move sycl-op to a single file --- ggml/src/ggml-sycl.cpp | 305 +-------------------------------- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/rope.cpp | 275 +++++++++++++++++++++++++++++ ggml/src/ggml-sycl/rope.hpp | 22 +++ 4 files changed, 300 insertions(+), 303 deletions(-) create mode 100644 ggml/src/ggml-sycl/rope.cpp create mode 100644 ggml/src/ggml-sycl/rope.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4a668a2c3..30d8a5b33 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -978,114 +978,6 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, cpy_blck(cx + x_offset, cdst + dst_offset); } -static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); - return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); -} - -struct rope_corr_dims { - float v[4]; -}; - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static void rope_yarn( - float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta -) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); - } - *cos_theta = sycl::cos(theta) * mscale; - *sin_theta = sycl::sin(theta) * mscale; -} - -// rope == RoPE == rotary positional embedding -template -static void rope( - const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - float ext_factor, float attn_factor, rope_corr_dims corr_dims -, - const sycl::nd_item<3> &item_ct1) { - const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - - if (col >= ncols) { - return; - } - - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int i = row*ncols + col; - const int i2 = row/p_delta_rows; - - const int p = has_pos ? pos[i2] : 0; - const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols); - - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + 1]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + 1] = x0*sin_theta + x1*cos_theta; -} - -template -static void rope_neox( - const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, - const float * freq_factors, const sycl::nd_item<3> &item_ct1) { - const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - - if (col >= ncols) { - return; - } - - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int ib = col / n_dims; - const int ic = col % n_dims; - - if (ib > 0) { - const int i = row*ncols + ib*n_dims + ic; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - - return; - } - - const int i = row*ncols + ib*n_dims + ic/2; - const int i2 = row/p_delta_rows; - - float cur_rot = inv_ndims * ic - ib; - - const int p = has_pos ? pos[i2] : 0; - const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f; - - const float theta_base = - p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor; - - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims/2]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; -} - static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(1); @@ -2241,110 +2133,6 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min, }); } -template -static void rope_sycl(const T *x, T *dst, int ncols, int nrows, - const int32_t *pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, - rope_corr_dims corr_dims, queue_ptr stream) { - GGML_ASSERT(ncols % 2 == 0); - const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); - const sycl::range<3> block_nums(1, num_blocks_x, nrows); - if (pos == nullptr) { - /* - DPCT1049:40: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope(x, dst, ncols, pos, freq_scale, p_delta_rows, - freq_base, ext_factor, attn_factor, corr_dims, - item_ct1); - }); - } else { - /* - DPCT1049:41: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope(x, dst, ncols, pos, freq_scale, p_delta_rows, - freq_base, ext_factor, attn_factor, corr_dims, - item_ct1); - }); - } -} - -template -static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows, - const int32_t *pos, float freq_scale, - int p_delta_rows, float freq_base, float ext_factor, - float attn_factor, rope_corr_dims corr_dims, - const float * freq_factors, queue_ptr stream) { - GGML_ASSERT(ncols % 2 == 0); - const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); - const sycl::range<3> block_nums(1, num_blocks_x, nrows); - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float inv_ndims = -1.0f / n_dims; - - if (pos == nullptr) { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - if (freq_factors == nullptr) { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, - item_ct1); - }); - } else { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, - item_ct1); - }); - } - } else { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - if (freq_factors == nullptr) { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1); - }); - } else { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1); - }); - } - } -} - static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, const int nrows, queue_ptr stream) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); @@ -3461,97 +3249,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - const ggml_tensor * src2 = dst->src[2]; - - GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - GGML_ASSERT(src0->type == dst->type); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t nrows = ggml_nrows(src0); - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - //const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - // RoPE alteration for extended context - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - const float * freq_factors = nullptr; - const int32_t * pos = nullptr; - if ((mode & 1) == 0) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(src1->ne[0] == ne2); - pos = (const int32_t *) src1_dd; - } - - const bool is_neox = mode & 2; - -#pragma message("TODO: update rope NORM mode to match NEOX mode") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") - - if (is_neox) { - pos = (const int32_t *) src1_dd; - - if (src2 != nullptr) { - freq_factors = (const float *) src2->data; - } - } else { - GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox"); - } - - rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); - - // compute - if (is_neox) { - if (src0->type == GGML_TYPE_F32) { - rope_neox_sycl( - (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, - ne00, n_dims, nrows, pos, freq_scale, ne01, - freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, main_stream); - } else { - GGML_ASSERT(false); - } - } else { - if (src0->type == GGML_TYPE_F32) { - rope_sycl( - (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, - nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream); - } else { - GGML_ASSERT(false); - } - } - - (void) src1; - (void) dst; - (void) src1_dd; -} - static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, @@ -6241,7 +5938,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: + return true; case GGML_OP_ROPE: + return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 2d37e271f..d5a63cd71 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -19,5 +19,6 @@ #include "dmmv.hpp" #include "mmq.hpp" #include "mmvq.hpp" +#include "rope.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp new file mode 100644 index 000000000..eabf1693e --- /dev/null +++ b/ggml/src/ggml-sycl/rope.cpp @@ -0,0 +1,275 @@ +#include "rope.hpp" + +struct rope_corr_dims { + float v[2]; +}; + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); + return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); + } + *cos_theta = sycl::cos(theta) * mscale; + *sin_theta = sycl::sin(theta) * mscale; +} + +template +static void rope_norm( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, + const sycl::nd_item<3> &item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (i0 >= ne0) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0; + const int i2 = row/p_delta_rows; + + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static void rope_neox( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, + const sycl::nd_item<3> &item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (i0 >= ne0) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0/2; + const int i2 = row/p_delta_rows; + + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +template +static void rope_norm_sycl( + const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nr); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + if (freq_factors == nullptr) { + /* + DPCT1049:40: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, + ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } else { + /* + DPCT1049:41: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, + ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } +} + +template +static void rope_neox_sycl( + const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { + GGML_ASSERT(ne0 % 2 == 0); + const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nr); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + if (freq_factors == nullptr) { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } else { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, freq_factors, + item_ct1); + }); + } +} + +void ggml_sycl_op_rope( + ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) { + const ggml_tensor * src2 = dst->src[2]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t nr = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const bool is_neox = mode & 2; + + const int32_t * pos = (const int32_t *) src1_dd; + + const float * freq_factors = nullptr; + if (src2 != nullptr) { + freq_factors = (const float *) src2->data; + } + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_neox) { + if (src0->type == GGML_TYPE_F32) { + rope_neox_sycl( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_sycl( + (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_norm_sycl( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_norm_sycl( + (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, main_stream + ); + } else { + GGML_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp new file mode 100644 index 000000000..00354c313 --- /dev/null +++ b/ggml/src/ggml-sycl/rope.hpp @@ -0,0 +1,22 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_ROPE_HPP +#define GGML_SYCL_ROPE_HPP + +#include "common.hpp" + +void ggml_sycl_op_rope( + ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); + +#endif // GGML_SYCL_ROPE_HPP From 694c59cb42d1ebd6a7d912ca65d3d7363e0f14c9 Mon Sep 17 00:00:00 2001 From: iacore <74560659+iacore@users.noreply.github.com> Date: Mon, 1 Jul 2024 11:40:58 +0000 Subject: [PATCH 4/7] Document BERT support. (#8205) * Update README.md document BERT support * Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 99b16f6e2..153d837e3 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well. - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) +- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) From 257f8e41e24b5bbfc27d9e907189a3e0cdb650d4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jul 2024 14:46:18 +0300 Subject: [PATCH 5/7] nix : remove OpenCL remnants (#8235) * nix : remove OpenCL remnants * minor : remove parentheses --- .devops/nix/package.nix | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 4ee0d62cb..b75d7ff9e 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -17,18 +17,15 @@ rocmPackages, vulkan-headers, vulkan-loader, - clblast, useBlas ? builtins.all (x: !x) [ useCuda useMetalKit - useOpenCL useRocm useVulkan ] && blas.meta.available, useCuda ? config.cudaSupport, - useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, + useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin, useMpi ? false, # Increases the runtime closure size by ~700M - useOpenCL ? false, useRocm ? config.rocmSupport, useVulkan ? false, llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake @@ -56,7 +53,6 @@ let ++ lib.optionals useCuda [ "CUDA" ] ++ lib.optionals useMetalKit [ "MetalKit" ] ++ lib.optionals useMpi [ "MPI" ] - ++ lib.optionals useOpenCL [ "OpenCL" ] ++ lib.optionals useRocm [ "ROCm" ] ++ lib.optionals useVulkan [ "Vulkan" ]; @@ -198,7 +194,6 @@ effectiveStdenv.mkDerivation ( optionals effectiveStdenv.isDarwin darwinBuildInputs ++ optionals useCuda cudaBuildInputs ++ optionals useMpi [ mpi ] - ++ optionals useOpenCL [ clblast ] ++ optionals useRocm rocmBuildInputs ++ optionals useBlas [ blas ] ++ optionals useVulkan vulkanBuildInputs; @@ -210,7 +205,6 @@ effectiveStdenv.mkDerivation ( (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) - (cmakeBool "GGML_CLBLAST" useOpenCL) (cmakeBool "GGML_CUDA" useCuda) (cmakeBool "GGML_HIPBLAS" useRocm) (cmakeBool "GGML_METAL" useMetalKit) @@ -254,7 +248,6 @@ effectiveStdenv.mkDerivation ( useCuda useMetalKit useMpi - useOpenCL useRocm useVulkan ; @@ -281,7 +274,7 @@ effectiveStdenv.mkDerivation ( # Configurations we don't want even the CI to evaluate. Results in the # "unsupported platform" messages. This is mostly a no-op, because # cudaPackages would've refused to evaluate anyway. - badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; + badPlatforms = optionals useCuda lib.platforms.darwin; # Configurations that are known to result in build failures. Can be # overridden by importing Nixpkgs with `allowBroken = true`. From 3840b6f593751a0ba636bfda73b630cd6c29d7b5 Mon Sep 17 00:00:00 2001 From: Michael Francis Date: Mon, 1 Jul 2024 07:47:04 -0400 Subject: [PATCH 6/7] nix : enable curl (#8043) Co-authored-by: Georgi Gerganov --- .devops/nix/package.nix | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index b75d7ff9e..49e9b7528 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -17,6 +17,7 @@ rocmPackages, vulkan-headers, vulkan-loader, + curl, useBlas ? builtins.all (x: !x) [ useCuda useMetalKit @@ -27,6 +28,7 @@ useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin, useMpi ? false, # Increases the runtime closure size by ~700M useRocm ? config.rocmSupport, + enableCurl ? true, useVulkan ? false, llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake @@ -196,13 +198,15 @@ effectiveStdenv.mkDerivation ( ++ optionals useMpi [ mpi ] ++ optionals useRocm rocmBuildInputs ++ optionals useBlas [ blas ] - ++ optionals useVulkan vulkanBuildInputs; + ++ optionals useVulkan vulkanBuildInputs + ++ optionals enableCurl [ curl ]; cmakeFlags = [ (cmakeBool "LLAMA_BUILD_SERVER" true) (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) + (cmakeBool "LLAMA_CURL" enableCurl) (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) From 0ddeff10230b88f1fa9866bbe5fe0d71ba2323a0 Mon Sep 17 00:00:00 2001 From: Roni Date: Mon, 1 Jul 2024 14:48:16 +0200 Subject: [PATCH 7/7] readme : update tool list (#8209) * Added gppm to Tool list in README * Update README.md --------- Co-authored-by: Georgi Gerganov --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 153d837e3..c136d4a5c 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: **Tools:** - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML +[crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption ---