Merge branch 'master' into gg/flash-attn

This commit is contained in:
Georgi Gerganov 2024-03-04 20:42:48 +02:00
commit e307882c34
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
17 changed files with 995 additions and 272 deletions

View file

@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
}
}
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
constant float & start,
constant float & step,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
device float * dst_ptr = (device float *) dst;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = start + step * i0;
}
}
kernel void kernel_timestep_embedding_f32(
device const char * src0,
device char * dst,
constant uint64_t & nb1,
constant int & dim,
constant int & max_period,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
int i = tgpig.x;
device float * embed_data = (device float *)(dst + i*nb1);
int half_ = dim / 2;
for (int j = tpitg.x; j < half_; j += ntg.x) {
float timestep = ((device float *)src0)[i];
float freq = (float)exp(-log((float)max_period) * j / half_);
float arg = timestep * freq;
embed_data[j ] = cos(arg);
embed_data[j + half_] = sin(arg);
}
if (dim % 2 != 0 && tpitg.x == 0) {
embed_data[dim] = 0.f;
}
}
// bitonic sort implementation following the CUDA kernels as reference
typedef void (argsort_t)(
device const float * x,