ggml : add SSM Metal kernels (#8546)
* ggml : add ggml_ssm_conv metal impl * ggml : add ssm_scan metal impl ggml-ci
This commit is contained in:
parent
879275ac98
commit
fc18425b6a
4 changed files with 303 additions and 2 deletions
|
@ -667,6 +667,127 @@ kernel void kernel_diag_mask_inf_8(
|
|||
}
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_conv_f32
|
||||
// TODO: optimize
|
||||
kernel void kernel_ssm_conv_f32(
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i3 = tgpig.z;
|
||||
|
||||
const int64_t nc = ne10;
|
||||
const int64_t ncs = ne00;
|
||||
const int64_t nr = ne01;
|
||||
const int64_t n_t = ne1;
|
||||
const int64_t n_s = ne2;
|
||||
|
||||
device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02);
|
||||
device const float * c = (device const float *) ((device const char *) src1 + ir*nb11);
|
||||
device float * x = (device float *) ((device char *) dst + ir*nb0 + i2*nb1 + i3*nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
sumf += s[i0] * c[i0];
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32
|
||||
// TODO: optimize
|
||||
kernel void kernel_ssm_scan_f32(
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device const void * src2,
|
||||
device const void * src3,
|
||||
device const void * src4,
|
||||
device const void * src5,
|
||||
device float * dst,
|
||||
constant int64_t & d_state,
|
||||
constant int64_t & d_inner,
|
||||
constant int64_t & n_seq_tokens,
|
||||
constant int64_t & n_seqs,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb20,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb30,
|
||||
constant uint64_t & nb31,
|
||||
constant uint64_t & nb40,
|
||||
constant uint64_t & nb41,
|
||||
constant uint64_t & nb42,
|
||||
constant uint64_t & nb50,
|
||||
constant uint64_t & nb51,
|
||||
constant uint64_t & nb52,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i3 = tgpig.y;
|
||||
|
||||
const int64_t nc = d_state;
|
||||
const int64_t nr = d_inner;
|
||||
const int64_t n_t = n_seq_tokens;
|
||||
const int64_t n_s = n_seqs;
|
||||
|
||||
for (int64_t i2 = 0; i2 < n_t; ++i2) {
|
||||
device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02);
|
||||
device const float * x = (device const float *) ((device const char *) src1 + ir*nb10 + i2*nb11 + i3*nb12);
|
||||
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*nb21 + i3*nb22);
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*nb31);
|
||||
device const float * B = (device const float *) ((device const char *) src4 + i2*nb41 + i3*nb42);
|
||||
device const float * C = (device const float *) ((device const char *) src5 + i2*nb51 + i3*nb52);
|
||||
device float * y = (device float *) ((device char *) dst + ir*nb10 + i2*nb11 + i3*nb12); // TODO: do not use src1 strides
|
||||
device float * s = (device float *) ((device char *) dst + ir*nb01 + i3*nb02 + nb13);
|
||||
|
||||
if (i2 > 0) {
|
||||
s0 = s;
|
||||
}
|
||||
|
||||
// i1 == 0
|
||||
float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
|
||||
float x_dt = x[0] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
int64_t i = i0;
|
||||
float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt);
|
||||
sumf += state * C[i0];
|
||||
s[i] = state;
|
||||
}
|
||||
|
||||
y[0] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_norm(
|
||||
device const void * src0,
|
||||
device float * dst,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue