Optimize RWKV6 Operator Naming and Implement Multi-core CPU/ SYCL Acceleration (#10133)
* rwkv6: rename to wkv6 * rwkv6: support avx2 avx512 armv8 armv9 * rwkv6: update cuda file name * rwkv6: rename params * wkv on sycl * sycl: add some ops * sycl: Enhance OP support judgment * wkv6: drop armv9 and tranfer to GGML style ggml-ci * sync : ggml * update the function to use appropriate types * fix define error * Update ggml/src/ggml-cpu.c * add appropriate asserts * move element-wise functions outside * put the declaration outside the loop * rewrite to be more inline with the common pattern for distributing threads * use recommended way GGML_TENSOR_LOCALS --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Diego Devesa <slarengh@gmail.com> Co-authored-by: Plamen Minev <pacominev@gmail.com> Co-authored-by: Yuri Khrustalev <ykhrustalev@users.noreply.github.com> Co-authored-by: Meng, Hengyu <airdldl@163.com>
This commit is contained in:
parent
5c333e0140
commit
3bcd40b3c5
22 changed files with 1977 additions and 1027 deletions
|
@ -7011,7 +7011,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
|||
{LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV}},
|
||||
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
|
||||
{LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
|
@ -7127,7 +7127,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|||
ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
|
||||
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
{
|
||||
// FIXME
|
||||
const int64_t S = 123;
|
||||
|
@ -7140,7 +7140,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|||
ggml_tensor * tf = w;
|
||||
ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
|
||||
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
|
||||
op_tensor = ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
|
||||
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
|
||||
|
@ -10083,7 +10083,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
|
|||
v = ggml_transpose(ctx, v);
|
||||
r = ggml_transpose(ctx, r);
|
||||
|
||||
struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
|
||||
struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
|
||||
cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
|
||||
*wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue