Merge branch 'ggerganov:master' into master

Riccardo Orlando 2024-12-05 13:15:24 +01:00 committed by GitHub
commit b717db8eff
12 changed files with 459 additions and 8 deletions

View file

@@ -17,7 +17,7 @@
#
# python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp

View file

@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
}
// Get only the program name from the full path
auto pos = filename.find_last_of('/');
auto pos = filename.find_last_of("/\\");
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
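For context, a minimal standalone C++ sketch (not part of this commit) of why searching for both separators matters once the program can be launched with a Windows-style path:

    // sketch: extract the program name from argv[0] for both Unix and Windows paths
    #include <cassert>
    #include <string>

    static std::string program_name(std::string filename) {
        // find_last_of("/\\") matches either '/' or '\\', so "C:\\tools\\app.exe"
        // and "/usr/bin/app" are both handled
        const auto pos = filename.find_last_of("/\\");
        if (pos != std::string::npos) {
            filename = filename.substr(pos + 1);
        }
        return filename;
    }

    int main() {
        assert(program_name("/usr/bin/app")       == "app");
        assert(program_name("C:\\tools\\app.exe") == "app.exe");
        assert(program_name("app")                == "app"); // no separator: unchanged
        return 0;
    }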

View file

@@ -921,6 +921,8 @@ struct server_context {
slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2);
slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0);
if (slot.params.sampling.dry_base < 1.0f) {
slot.params.sampling.dry_base = defaults.sampling.dry_base;
@@ -2322,10 +2324,29 @@ struct server_context {
continue;
}
// determine the max draft that fits the current slot state
int n_draft_max = slot.params.speculative.n_max;
// note: n_past is not yet increased for the `id` token sampled above
// also, need to leave space for 1 extra token to allow context shifts
n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2);
if (slot.n_remaining > 0) {
n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
}
SLT_DBG(slot, "max possible draft: %d\n", n_draft_max);
if (n_draft_max < slot.params.speculative.n_min) {
SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min);
continue;
}
llama_token id = slot.sampled;
struct common_speculative_params params_spec;
params_spec.n_draft = slot.params.speculative.n_max;
params_spec.n_draft = n_draft_max;
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
params_spec.p_min = slot.params.speculative.p_min;
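A small worked example (illustrative numbers only; the field names mirror the snippet above) of how the draft budget is clamped when the slot is close to its context limit:

    // sketch: how n_draft_max is derived from the slot state (illustrative values)
    #include <algorithm>
    #include <cstdio>

    int main() {
        const int n_ctx       = 64;   // slot context size (as in the new server test)
        const int n_past      = 57;   // tokens already in the context
        const int n_remaining = -1;   // < 0 means no explicit n_predict limit (assumption)
        const int n_max       = 16;   // speculative.n_max
        const int n_min       = 2;    // speculative.n_min (clamped to >= 2 above)

        // leave room for the sampled token that is not yet counted in n_past,
        // plus one extra token so a context shift remains possible
        int n_draft_max = std::min(n_max, n_ctx - n_past - 2);
        if (n_remaining > 0) {
            n_draft_max = std::min(n_draft_max, n_remaining - 1);
        }

        printf("n_draft_max = %d\n", n_draft_max);                            // 5
        printf("speculate   = %s\n", n_draft_max >= n_min ? "yes" : "no");    // yes
        return 0;
    }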
@@ -2333,6 +2354,8 @@ struct server_context {
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
continue;
}
@@ -2344,6 +2367,8 @@ struct server_context {
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
}
SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
llama_decode(ctx, slot.batch_spec);
// the accepted tokens from the speculation
@@ -2372,7 +2397,7 @@ struct server_context {
}
}
SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
}
}
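For illustration, a standalone sketch of what the `accepted %d/%d` log line counts, assuming `ids` holds the token sampled from the verification batch plus every accepted draft token:

    // sketch: accepted draft tokens = ids.size() - 1 (assumed semantics of `ids`)
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> draft = { 101, 102, 103, 104 }; // 4 drafted tokens (illustrative ids)
        std::vector<int> ids   = { 101, 102, 777 };      // 2 drafts matched, then sampling diverged
        // the last entry of `ids` is the regularly sampled token, so it is not counted as a draft
        printf("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); // 2/4
        return 0;
    }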

View file

@@ -82,6 +82,37 @@ def test_different_draft_min_draft_max():
last_content = res.body["content"]
def test_slot_ctx_not_exceeded():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
def test_with_ctx_shift():
global server
server.n_ctx = 64
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
"temperature": 0.0,
"top_k": 1,
"n_predict": 64,
"speculative.p_min": 0.0,
})
assert res.status_code == 200
assert len(res.body["content"]) > 0
assert res.body["tokens_predicted"] == 64
assert res.body["truncated"] == True
@pytest.mark.parametrize("n_slots,n_requests", [
(1, 2),
(2, 2),

View file

@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1695,6 +1696,13 @@ extern "C" {
int p2,
int p3);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
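A minimal usage sketch of the new API (assuming ggml.h is available; error handling omitted) showing that only the first dimension grows by p0 + p1:

    // sketch: calling ggml_pad_reflect_1d on a 1-D tensor
    #include "ggml.h"
    #include <cassert>

    int main() {
        struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        // [a, b, c, d] padded with p0 = 1, p1 = 1 -> [b, a, b, c, d, c]
        struct ggml_tensor * a   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, 1, 1);

        // only ne[0] grows; p0 and p1 must each be smaller than a->ne[0]
        assert(out->ne[0] == 6);
        assert(out->ne[1] == a->ne[1]);

        ggml_free(ctx);
        return 0;
    }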

View file

@@ -1374,7 +1374,10 @@ struct ggml_compute_state {
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
@@ -8248,6 +8251,77 @@ static void ggml_compute_forward_set_f32(
}
}
static void ggml_compute_forward_set_i32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// view src0 and dst with these strides and data offset in bytes during set
// nb0 is implicitly element_size because src0 and dst are contiguous
size_t nb1 = ((int32_t *) dst->op_params)[0];
size_t nb2 = ((int32_t *) dst->op_params)[1];
size_t nb3 = ((int32_t *) dst->op_params)[2];
size_t offset = ((int32_t *) dst->op_params)[3];
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
if (params->ith == 0) {
// memcpy needs to be synchronized across threads to avoid race conditions.
// => do it in INIT phase
memcpy(
((char *) dst->data),
((char *) src0->data),
ggml_nbytes(dst));
}
ggml_barrier(params->threadpool);
}
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(src1);
const int nc = src1->ne[0];
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
// src0 and dst as viewed during set
const size_t nb0 = ggml_element_size(src0);
const int im0 = (ne10 == 0 ? 0 : ne10-1);
const int im1 = (ne11 == 0 ? 0 : ne11-1);
const int im2 = (ne12 == 0 ? 0 : ne12-1);
const int im3 = (ne13 == 0 ? 0 : ne13-1);
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
GGML_ASSERT(nb10 == sizeof(int32_t));
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int ir = ir0; ir < ir1; ++ir) {
// src0 and dst are viewed with shape of src1 and offset
// => same indices
const int i3 = ir/(ne12*ne11);
const int i2 = (ir - i3*ne12*ne11)/ne11;
const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
ggml_vec_cpy_i32(nc,
(int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
(int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
}
}
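A small standalone check (illustrative sizes, not tied to any real tensor) of the row-to-(i1, i2, i3) unravelling and the destination byte offset used in the loop above:

    // sketch: mapping a src1 row index to coordinates and a byte offset in dst
    #include <cassert>
    #include <cstddef>

    int main() {
        const int    ne11 = 3, ne12 = 2;              // rows of src1 along dims 1 and 2
        const size_t nb1 = 16, nb2 = 48, nb3 = 96;    // view strides taken from op_params
        const size_t offset = 8;                      // byte offset of the view inside dst

        const int ir = 7;                             // 8th row of src1 overall
        const int i3 = ir/(ne12*ne11);                // -> 1
        const int i2 = (ir - i3*ne12*ne11)/ne11;      // -> 0
        const int i1 = (ir - i3*ne12*ne11 - i2*ne11); // -> 1

        // byte offset where this row is written inside dst
        const size_t dst_off = i3*nb3 + i2*nb2 + i1*nb1 + offset;
        assert(i3 == 1 && i2 == 0 && i1 == 1);
        assert(dst_off == 96 + 0 + 16 + 8);
        return 0;
    }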
static void ggml_compute_forward_set(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -8259,6 +8333,10 @@ static void ggml_compute_forward_set(
{
ggml_compute_forward_set_f32(params, dst);
} break;
case GGML_TYPE_I32:
{
ggml_compute_forward_set_i32(params, dst);
} break;
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
@@ -10439,6 +10517,40 @@ static void ggml_compute_forward_pad(
}
}
// ggml_compute_forward_pad_reflect_1d
static void ggml_compute_forward_pad_reflect_1d(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int ith = params->ith;
const int nth = params->nth;
const int32_t * opts = (const int32_t *) dst->op_params;
const int p0 = opts[0];
const int p1 = opts[1];
GGML_TENSOR_UNARY_OP_LOCALS
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0);
float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; }
for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
}
}
}
}
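For reference, a standalone re-implementation of the same left/right mirroring on a plain array (not the ggml code path), showing the result for a small input:

    // sketch: reflection padding of [1, 2, 3, 4] with p0 = 2, p1 = 1
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
        const std::vector<float> src = {1, 2, 3, 4};   // ne00 = 4
        const int p0 = 2, p1 = 1;                      // ne0 = ne00 + p0 + p1 = 7

        std::vector<float> dst(src.size() + p0 + p1);
        float * left  = dst.data() + p0;                   // start of the copied source row
        float * right = dst.data() + dst.size() - p1 - 1;  // last copied source element

        std::copy(src.begin(), src.end(), left);
        for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0];  } // mirror around the first element
        for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } // mirror around the last element

        assert((dst == std::vector<float>{3, 2, 1, 2, 3, 4, 3}));
        return 0;
    }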
// ggml_compute_forward_arange
@@ -12535,6 +12647,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
case GGML_OP_PAD_REFLECT_1D:
{
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12993,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:

View file

@@ -102,6 +102,21 @@ typedef struct {
uint64_t nb3;
} ggml_metal_kargs_cpy;
typedef struct {
int64_t ne10;
int64_t ne11;
int64_t ne12;
uint64_t nb10;
uint64_t nb11;
uint64_t nb12;
uint64_t nb13;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
uint64_t offs;
bool inplace;
} ggml_metal_kargs_set;
typedef struct {
int32_t ne00;
int32_t ne01;

View file

@@ -310,6 +310,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -371,6 +372,8 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
GGML_METAL_KERNEL_TYPE_SET_I32,
GGML_METAL_KERNEL_TYPE_SET_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,
@@ -877,6 +880,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -938,6 +942,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat);
@@ -1099,6 +1105,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -1156,6 +1163,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return false;
};
}
case GGML_OP_SET:
{
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_I32:
return true;
default:
return false;
};
}
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_GET_ROWS:
{
@@ -3258,6 +3275,38 @@ static void ggml_metal_encode_node(
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_PAD_REFLECT_1D:
{
GGML_ASSERT(src0->type == GGML_TYPE_F32);
const int32_t p0 = ((const int32_t *)(dst->op_params))[0];
const int32_t p1 = ((const int32_t *)(dst->op_params))[1];
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14];
[encoder setBytes:&p0 length:sizeof(p0) atIndex:15];
[encoder setBytes:&p1 length:sizeof(p1) atIndex:16];
const int nth = MIN(1024, ne0);
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
@@ -3789,6 +3838,68 @@ static void ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_SET:
{
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
// src0 and dst as viewed during set
const size_t dst_nb0 = ggml_element_size(src0);
const size_t dst_nb1 = ((int32_t *) dst->op_params)[0];
const size_t dst_nb2 = ((int32_t *) dst->op_params)[1];
const size_t dst_nb3 = ((int32_t *) dst->op_params)[2];
const size_t offset = ((int32_t *) dst->op_params)[3];
const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
if (!inplace) {
memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst));
}
const int im0 = (ne10 == 0 ? 0 : ne10-1);
const int im1 = (ne11 == 0 ? 0 : ne11-1);
const int im2 = (ne12 == 0 ? 0 : ne12-1);
const int im3 = (ne13 == 0 ? 0 : ne13-1);
GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst));
id<MTLComputePipelineState> pipeline = nil;
switch (src0t) {
case GGML_TYPE_F32:
GGML_ASSERT(nb10 == sizeof(float));
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break;
case GGML_TYPE_I32:
GGML_ASSERT(nb10 == sizeof(int32_t));
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break;
default: GGML_ABORT("fatal error");
}
ggml_metal_kargs_set args = {
/*.ne10 =*/ ne10,
/*.ne11 =*/ ne11,
/*.ne12 =*/ ne12,
/*.nb10 =*/ nb10,
/*.nb11 =*/ nb11,
/*.nb12 =*/ nb12,
/*.nb13 =*/ nb13,
/*.nb1 =*/ dst_nb1,
/*.nb2 =*/ dst_nb2,
/*.nb3 =*/ dst_nb3,
/*.offs =*/ offset,
/*.inplace =*/ inplace,
};
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10);
[encoder setComputePipelineState:pipeline];
[encoder setBytes:&args length:sizeof(args) atIndex:0];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_POOL_2D:
{
GGML_ASSERT(ggml_is_contiguous(src0));

View file

@@ -2897,6 +2897,53 @@ kernel void kernel_pad_f32(
}
}
kernel void kernel_pad_reflect_1d_f32(
device const char * src0,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant int64_t & ne0,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int32_t & p0,
constant int32_t & p1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;
const int64_t i03 = i3;
const int64_t i02 = i2;
const int64_t i01 = i1;
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
if (i0 < p0) {
dst_ptr[i0] = src0_ptr[p0 - i0];
} else if (i0 < ne0 - p1) {
dst_ptr[i0] = src0_ptr[i0 - p0];
} else {
dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
}
}
}
}
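A host-side C++ sketch (illustrative values; ne0 = ne00 + p0 + p1 as implied by ggml_pad_reflect_1d) that checks the three index regions used by the kernel above:

    // sketch: left mirror, plain copy, and right mirror index math from the Metal kernel
    #include <cassert>

    int main() {
        const int ne00 = 4, p0 = 2, p1 = 1;
        const int ne0  = ne00 + p0 + p1;
        const float src[4] = {1, 2, 3, 4};

        float dst[7];
        for (int i0 = 0; i0 < ne0; i0++) {
            if (i0 < p0) {
                dst[i0] = src[p0 - i0];                                       // left mirror
            } else if (i0 < ne0 - p1) {
                dst[i0] = src[i0 - p0];                                       // plain copy
            } else {
                dst[i0] = src[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];   // right mirror
            }
        }

        const float expected[7] = {3, 2, 1, 2, 3, 4, 3};
        for (int i = 0; i < 7; i++) {
            assert(dst[i] == expected[i]);
        }
        return 0;
    }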
kernel void kernel_arange_f32(
device char * dst,
constant int64_t & ne0,
@@ -3880,6 +3927,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_
#undef FA_TYPES
template<typename T>
kernel void kernel_set(
constant ggml_metal_kargs_set & args,
device const char * src0,
device const char * src1,
device char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i13 = tgpig[2];
const int i12 = tgpig[1];
const int i11 = tgpig[0];
const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10;
const int64_t i3 = n / (args.ne12*args.ne11*args.ne10);
const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10);
const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10;
device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs);
for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) {
device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10);
dst_data[i10] = (T) src[0];
}
}
typedef decltype(kernel_set<float>) kernel_set_t;
template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set<float>;
template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set<int32_t>;
template<typename T0, typename T1>
kernel void kernel_cpy(
constant ggml_metal_kargs_cpy & args,

View file

@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4097,6 +4099,37 @@ struct ggml_tensor * ggml_pad(
return result;
}
// ggml_pad_reflect_1d
struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1) {
GGML_ASSERT(p0 >= 0);
GGML_ASSERT(p1 >= 0);
GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
GGML_ASSERT(ggml_is_contiguous(a));
GGML_ASSERT(a->type == GGML_TYPE_F32);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
a->ne[0] + p0 + p1,
a->ne[1],
a->ne[2],
a->ne[3]);
int32_t params[] = { p0, p1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_PAD_REFLECT_1D;
result->src[0] = a;
return result;
}
// ggml_arange
struct ggml_tensor * ggml_arange(

View file

@@ -1 +1 @@
b903ffe79daf18c0aaacbebe44a7b93a6b8d0982
74d66b63eaf207a24f3e93bb922aba131cbf2906

View file

@@ -2697,6 +2697,33 @@ struct test_pad : public test_case {
}
};
// GGML_OP_PAD_REFLECT_1D
struct test_pad_reflect_1d : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne_a;
const int pad_0;
const int pad_1;
std::string vars() override {
return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
}
test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {512, 34, 2, 1},
int pad_0 = 10, int pad_1 = 9)
: type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data());
ggml_set_name(a, "a");
ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1);
ggml_set_name(out, "out");
return out;
}
};
// GGML_OP_ARANGE
struct test_arange : public test_case {
const ggml_type type;
@@ -3494,6 +3521,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
}
for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim));
}
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
for (ggml_type type_dst : all_types) {
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
@@ -3816,6 +3847,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
test_cases.emplace_back(new test_acc());
test_cases.emplace_back(new test_pad());
test_cases.emplace_back(new test_pad_reflect_1d());
test_cases.emplace_back(new test_arange());
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu());