add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arrange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embbeding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
leejet 2024-03-03 20:23:52 +08:00 committed by Georgi Gerganov
parent 82f3e668ad
commit 7d43c585dc
No known key found for this signature in database
GPG key ID: BF970631944C16B7
6 changed files with 550 additions and 52 deletions

View file

@ -1959,6 +1959,49 @@ kernel void kernel_pad_f32(
}
}
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
constant float & start,
constant float & step,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
device float * dst_ptr = (device float *) dst;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
dst_ptr[i0] = start + step * i0;
}
}
kernel void kernel_timestep_embedding_f32(
device const char * src0,
device char * dst,
constant uint64_t & nb1,
constant int & dim,
constant int & max_period,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
int i = tgpig.x;
device float * embed_data = (device float *)(dst + i*nb1);
int half_ = dim / 2;
for (int j = tpitg.x; j < half_; j += ntg.x) {
float timestep = ((device float *)src0)[i];
float freq = (float)exp(-log((float)max_period) * j / half_);
float arg = timestep * freq;
embed_data[j ] = cos(arg);
embed_data[j + half_] = sin(arg);
}
if (dim % 2 != 0 && tpitg.x == 0) {
embed_data[dim] = 0.f;
}
}
// bitonic sort implementation following the CUDA kernels as reference
typedef void (argsort_t)(
device const float * x,