Merge branch 'ggerganov:master' into master
commit b717db8eff
12 changed files with 459 additions and 8 deletions
@@ -17,7 +17,7 @@
#
# python3 convert_hf_to_gguf_update.py <huggingface_token>
#
# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
# - Update llama.cpp with the new pre-tokenizer if necessary
#
# TODO: generate tokenizer tests for llama.cpp

@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
    }

    // Get only the program name from the full path
    auto pos = filename.find_last_of('/');
    auto pos = filename.find_last_of("/\\");
    if (pos != std::string::npos) {
        filename = filename.substr(pos+1);
    }

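The one-line change above swaps find_last_of('/') for find_last_of("/\\") so the basename is extracted correctly from Windows-style paths as well as POSIX ones. A minimal standalone sketch of the same idea (the program_name helper is made up for illustration):

    #include <iostream>
    #include <string>

    // Hypothetical helper mirroring the change above: strip everything up to the
    // last '/' or '\\' so both POSIX and Windows paths yield just the program name.
    static std::string program_name(std::string filename) {
        const auto pos = filename.find_last_of("/\\");
        if (pos != std::string::npos) {
            filename = filename.substr(pos + 1);
        }
        return filename;
    }

    int main() {
        std::cout << program_name("/usr/local/bin/llama-cli") << "\n"; // llama-cli
        std::cout << program_name("C:\\tools\\llama-cli.exe") << "\n"; // llama-cli.exe
        return 0;
    }
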
@@ -921,6 +921,8 @@ struct server_context {
        slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

        slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
        slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2);
        slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0);

        if (slot.params.sampling.dry_base < 1.0f) {
            slot.params.sampling.dry_base = defaults.sampling.dry_base;

@@ -2322,10 +2324,29 @@ struct server_context {
                continue;
            }

            // determine the max draft that fits the current slot state
            int n_draft_max = slot.params.speculative.n_max;

            // note: n_past is not yet increased for the `id` token sampled above
            // also, need to leave space for 1 extra token to allow context shifts
            n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2);

            if (slot.n_remaining > 0) {
                n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
            }

            SLT_DBG(slot, "max possible draft: %d\n", n_draft_max);

            if (n_draft_max < slot.params.speculative.n_min) {
                SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min);

                continue;
            }

            llama_token id = slot.sampled;

            struct common_speculative_params params_spec;
            params_spec.n_draft = slot.params.speculative.n_max;
            params_spec.n_draft = n_draft_max;
            params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
            params_spec.p_min = slot.params.speculative.p_min;

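The clamp added above bounds the speculative draft by the free context space (reserving one slot for the just-sampled token and one more so a context shift stays possible) and by the remaining generation budget. A small standalone sketch of that arithmetic, using made-up example values rather than real slot state:

    #include <algorithm>
    #include <cstdio>

    int main() {
        // Illustrative values only (not taken from the diff): a 64-token context,
        // 57 tokens already in the KV cache, 8 tokens left to generate, n_max = 16.
        const int n_ctx       = 64;
        const int n_past      = 57;
        const int n_remaining = 8;
        const int n_max       = 16;
        const int n_min       = 2;

        int n_draft_max = n_max;
        // leave room for the just-sampled token plus one extra token for context shifts
        n_draft_max = std::min(n_draft_max, n_ctx - n_past - 2);
        if (n_remaining > 0) {
            n_draft_max = std::min(n_draft_max, n_remaining - 1);
        }

        if (n_draft_max < n_min) {
            printf("draft too small (%d < %d) - would skip speculative decoding\n", n_draft_max, n_min);
        } else {
            printf("max possible draft: %d\n", n_draft_max); // prints 5 with these values
        }
        return 0;
    }
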
@@ -2333,6 +2354,8 @@ struct server_context {

            // ignore small drafts
            if (slot.params.speculative.n_min > (int) draft.size()) {
                SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);

                continue;
            }

@@ -2344,6 +2367,8 @@ struct server_context {
                common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
            }

            SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);

            llama_decode(ctx, slot.batch_spec);

            // the accepted tokens from the speculation

@@ -2372,7 +2397,7 @@ struct server_context {
                }
            }

            SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
            SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
        }
    }

@@ -82,6 +82,37 @@ def test_different_draft_min_draft_max():
    last_content = res.body["content"]


def test_slot_ctx_not_exceeded():
    global server
    server.n_ctx = 64
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "Hello " * 56,
        "temperature": 0.0,
        "top_k": 1,
        "speculative.p_min": 0.0,
    })
    assert res.status_code == 200
    assert len(res.body["content"]) > 0


def test_with_ctx_shift():
    global server
    server.n_ctx = 64
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "Hello " * 56,
        "temperature": 0.0,
        "top_k": 1,
        "n_predict": 64,
        "speculative.p_min": 0.0,
    })
    assert res.status_code == 200
    assert len(res.body["content"]) > 0
    assert res.body["tokens_predicted"] == 64
    assert res.body["truncated"] == True


@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 2),
    (2, 2),

@@ -499,6 +499,7 @@ extern "C" {
        GGML_OP_POOL_2D_BACK,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_PAD_REFLECT_1D,
        GGML_OP_ARANGE,
        GGML_OP_TIMESTEP_EMBEDDING,
        GGML_OP_ARGSORT,

@@ -1695,6 +1696,13 @@ extern "C" {
            int                   p2,
            int                   p3);

    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
    GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   p0,
            int                   p1);

    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
    // timesteps: [N,]
    // return: [N, dim]

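The header comment above gives the intended semantics: the first dimension is extended by mirroring around its first and last elements. A standalone sketch of that mapping, not using the ggml API, and assuming p0 and p1 are smaller than the row length (the same constraint the implementation asserts):

    #include <cstdio>
    #include <vector>

    // Reference reflection padding of a 1D row: mirror around the first and last
    // elements, so {a, b, c, d} with p0 = 1, p1 = 1 becomes {b, a, b, c, d, c}.
    static std::vector<float> pad_reflect_1d_ref(const std::vector<float> & src, int p0, int p1) {
        const int n = (int) src.size();
        std::vector<float> dst(n + p0 + p1);
        for (int i = 0; i < (int) dst.size(); ++i) {
            int j = i - p0;                    // index into the unpadded row
            if (j < 0)  j = -j;                // reflect around element 0
            if (j >= n) j = 2*(n - 1) - j;     // reflect around element n-1
            dst[i] = src[j];
        }
        return dst;
    }

    int main() {
        const std::vector<float> a = {1, 2, 3, 4};
        for (float v : pad_reflect_1d_ref(a, 1, 1)) {
            printf("%g ", v);                  // prints: 2 1 2 3 4 3
        }
        printf("\n");
        return 0;
    }
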
@@ -1374,7 +1374,10 @@ struct ggml_compute_state {

inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }

@@ -8248,6 +8251,77 @@ static void ggml_compute_forward_set_f32(
        }
    }
}

static void ggml_compute_forward_set_i32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(ggml_are_same_shape(src0, dst));
    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));

    // view src0 and dst with these strides and data offset inbytes during set
    // nb0 is implicitly element_size because src0 and dst are contiguous
    size_t nb1     = ((int32_t *) dst->op_params)[0];
    size_t nb2     = ((int32_t *) dst->op_params)[1];
    size_t nb3     = ((int32_t *) dst->op_params)[2];
    size_t offset  = ((int32_t *) dst->op_params)[3];
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    if (!inplace) {
        if (params->ith == 0) {
            // memcpy needs to be synchronized across threads to avoid race conditions.
            // => do it in INIT phase
            memcpy(
                ((char *)  dst->data),
                ((char *) src0->data),
                ggml_nbytes(dst));
        }
        ggml_barrier(params->threadpool);
    }

    const int ith = params->ith;
    const int nth = params->nth;

    const int nr = ggml_nrows(src1);
    const int nc = src1->ne[0];

    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)

    // src0 and dst as viewed during set
    const size_t nb0 = ggml_element_size(src0);

    const int im0 = (ne10 == 0 ? 0 : ne10-1);
    const int im1 = (ne11 == 0 ? 0 : ne11-1);
    const int im2 = (ne12 == 0 ? 0 : ne12-1);
    const int im3 = (ne13 == 0 ? 0 : ne13-1);

    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));

    GGML_ASSERT(nb10 == sizeof(int32_t));

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 and dst are viewed with shape of src1 and offset
        // => same indices
        const int i3 = ir/(ne12*ne11);
        const int i2 = (ir - i3*ne12*ne11)/ne11;
        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);

        ggml_vec_cpy_i32(nc,
                (int32_t *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
                (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
    }
}

static void ggml_compute_forward_set(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

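The row loop above uses ggml's usual work-splitting pattern: the rows of src1 are divided into equal-sized blocks, one block per thread, with the last thread taking whatever is left. A self-contained sketch of that partitioning with made-up thread and row counts:

    #include <algorithm>
    #include <cstdio>

    int main() {
        // Illustrative values only: 10 rows split across 4 threads.
        const int nr  = 10;
        const int nth = 4;

        // rows per thread, rounded up so all rows are covered
        const int dr = (nr + nth - 1)/nth;

        for (int ith = 0; ith < nth; ++ith) {
            // row range handled by thread ith (half-open interval [ir0, ir1))
            const int ir0 = dr*ith;
            const int ir1 = std::min(ir0 + dr, nr);
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }
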
@@ -8259,6 +8333,10 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_set_f32(params, dst);
            } break;
        case GGML_TYPE_I32:
            {
                ggml_compute_forward_set_i32(params, dst);
            } break;
        case GGML_TYPE_F16:
        case GGML_TYPE_BF16:
        case GGML_TYPE_Q4_0:

@@ -10439,6 +10517,40 @@ static void ggml_compute_forward_pad(
        }
    }
}

// ggml_compute_forward_pad_reflect_1d

static void ggml_compute_forward_pad_reflect_1d(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int ith = params->ith;
    const int nth = params->nth;

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int p0 = opts[0];
    const int p1 = opts[1];

    GGML_TENSOR_UNARY_OP_LOCALS

    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
                float * left  = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 +         p0*nb0);
                float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);

                ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));

                for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0];   }
                for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
            }
        }
    }
}

// ggml_compute_forward_arange

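The CPU path fills each destination row in two steps: it copies the source row into the middle of the row, then mirrors already-written values outward through the left/right pointers (hence the negative indices). A standalone sketch of the same strategy on a plain float array, with made-up sizes:

    #include <cstdio>
    #include <cstring>

    int main() {
        // Illustrative values only: one row of 4 floats, padded by p0 = 2 on the
        // left and p1 = 1 on the right (both must be smaller than the row length).
        const int ne00 = 4, p0 = 2, p1 = 1;
        const float src[ne00] = {1, 2, 3, 4};

        const int ne0 = ne00 + p0 + p1;
        float dst[ne0];

        // step 1: copy the source row into the middle of the destination
        float * left  = dst + p0;           // first element of the copied row
        float * right = dst + ne0 - p1 - 1; // last element of the copied row
        memcpy(left, src, ne00*sizeof(float));

        // step 2: mirror outward, reading values that were just written
        for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0];   }
        for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }

        for (int i0 = 0; i0 < ne0; i0++) {
            printf("%g ", dst[i0]);         // prints: 3 2 1 2 3 4 3
        }
        printf("\n");
        return 0;
    }
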
@@ -12535,6 +12647,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_pad(params, tensor);
            } break;
        case GGML_OP_PAD_REFLECT_1D:
            {
                ggml_compute_forward_pad_reflect_1d(params, tensor);
            } break;
        case GGML_OP_ARANGE:
            {
                ggml_compute_forward_arange(params, tensor);

@@ -12877,6 +12993,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:

@@ -102,6 +102,21 @@ typedef struct {
    uint64_t nb3;
} ggml_metal_kargs_cpy;

typedef struct {
    int64_t  ne10;
    int64_t  ne11;
    int64_t  ne12;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
    uint64_t offs;
    bool     inplace;
} ggml_metal_kargs_set;

typedef struct {
    int32_t ne00;
    int32_t ne01;

@@ -310,6 +310,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
    GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
    GGML_METAL_KERNEL_TYPE_PAD_F32,
    GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
    GGML_METAL_KERNEL_TYPE_ARANGE_F32,
    GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,

@@ -371,6 +372,8 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256,
    GGML_METAL_KERNEL_TYPE_SET_I32,
    GGML_METAL_KERNEL_TYPE_SET_F32,
    GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
    GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
    GGML_METAL_KERNEL_TYPE_CPY_F32_BF16,

@@ -877,6 +880,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);

@@ -938,6 +942,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat);

@@ -1099,6 +1105,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_POOL_2D:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:

@@ -1156,6 +1163,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                    return false;
                };
            }
        case GGML_OP_SET:
            {
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_I32:
                        return true;
                    default:
                        return false;
                };
            }
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_GET_ROWS:
            {

@@ -3258,6 +3275,38 @@ static void ggml_metal_encode_node(

            const int nth = MIN(1024, ne0);

            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
        } break;
    case GGML_OP_PAD_REFLECT_1D:
        {
            GGML_ASSERT(src0->type == GGML_TYPE_F32);

            const int32_t p0 = ((const int32_t *)(dst->op_params))[0];
            const int32_t p1 = ((const int32_t *)(dst->op_params))[1];

            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline;

            [encoder setComputePipelineState:pipeline];
            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
            [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
            [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6];
            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
            [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11];
            [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12];
            [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13];
            [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14];
            [encoder setBytes:&p0 length:sizeof(p0) atIndex:15];
            [encoder setBytes:&p1 length:sizeof(p1) atIndex:16];

            const int nth = MIN(1024, ne0);

            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
        } break;
    case GGML_OP_ARANGE:

@@ -3789,6 +3838,68 @@ static void ggml_metal_encode_node(

            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
        } break;
    case GGML_OP_SET:
        {
            GGML_ASSERT(ggml_are_same_shape(src0, dst));
            GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));

            // src0 and dst as viewed during set
            const size_t dst_nb0 = ggml_element_size(src0);

            const size_t dst_nb1 = ((int32_t *) dst->op_params)[0];
            const size_t dst_nb2 = ((int32_t *) dst->op_params)[1];
            const size_t dst_nb3 = ((int32_t *) dst->op_params)[2];
            const size_t offset  = ((int32_t *) dst->op_params)[3];
            const bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

            if (!inplace) {
                memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst));
            }

            const int im0 = (ne10 == 0 ? 0 : ne10-1);
            const int im1 = (ne11 == 0 ? 0 : ne11-1);
            const int im2 = (ne12 == 0 ? 0 : ne12-1);
            const int im3 = (ne13 == 0 ? 0 : ne13-1);

            GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst));

            id<MTLComputePipelineState> pipeline = nil;

            switch (src0t) {
                case GGML_TYPE_F32:
                    GGML_ASSERT(nb10 == sizeof(float));
                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break;
                case GGML_TYPE_I32:
                    GGML_ASSERT(nb10 == sizeof(int32_t));
                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break;
                default: GGML_ABORT("fatal error");
            }

            ggml_metal_kargs_set args = {
                /*.ne10    =*/ ne10,
                /*.ne11    =*/ ne11,
                /*.ne12    =*/ ne12,
                /*.nb10    =*/ nb10,
                /*.nb11    =*/ nb11,
                /*.nb12    =*/ nb12,
                /*.nb13    =*/ nb13,
                /*.nb1     =*/ dst_nb1,
                /*.nb2     =*/ dst_nb2,
                /*.nb3     =*/ dst_nb3,
                /*.offs    =*/ offset,
                /*.inplace =*/ inplace,
            };

            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10);

            [encoder setComputePipelineState:pipeline];
            [encoder setBytes:&args length:sizeof(args) atIndex:0];
            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
            [encoder setBuffer:id_dst offset:offs_dst atIndex:3];

            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
        } break;
    case GGML_OP_POOL_2D:
        {
            GGML_ASSERT(ggml_is_contiguous(src0));

@@ -2897,6 +2897,53 @@ kernel void kernel_pad_f32(
    }
}

kernel void kernel_pad_reflect_1d_f32(
    device  const char * src0,
    device        char * dst,
    constant   int64_t & ne00,
    constant   int64_t & ne01,
    constant   int64_t & ne02,
    constant   int64_t & ne03,
    constant   int64_t & ne0,
    constant  uint64_t & nb00,
    constant  uint64_t & nb01,
    constant  uint64_t & nb02,
    constant  uint64_t & nb03,
    constant  uint64_t & nb0,
    constant  uint64_t & nb1,
    constant  uint64_t & nb2,
    constant  uint64_t & nb3,
    constant   int32_t & p0,
    constant   int32_t & p1,
    uint3 tgpig[[threadgroup_position_in_grid]],
    uint3  tgpg[[threadgroups_per_grid]],
    uint3 tpitg[[thread_position_in_threadgroup]],
    uint3   ntg[[threads_per_threadgroup]]) {

    const int64_t i3 = tgpig.z;
    const int64_t i2 = tgpig.y;
    const int64_t i1 = tgpig.x;

    const int64_t i03 = i3;
    const int64_t i02 = i2;
    const int64_t i01 = i1;

    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);

    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
            if (i0 < p0) {
                dst_ptr[i0] = src0_ptr[p0 - i0];
            } else if (i0 < ne0 - p1) {
                dst_ptr[i0] = src0_ptr[i0 - p0];
            } else {
                dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
            }
        }
    }
}

kernel void kernel_arange_f32(
    device        char * dst,
    constant   int64_t & ne0,

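Unlike the CPU path, the Metal kernel computes each destination element's source index directly; the last branch handles the right padding region. A small host-side check, with example sizes only, that the expression used there matches the plain reflect-around-the-last-element rule 2*(ne00 - 1) - (i0 - p0):

    #include <cassert>
    #include <cstdio>

    int main() {
        // Illustrative sizes only: a row of ne00 = 6 elements padded by p0 = 2, p1 = 3.
        const int ne00 = 6, p0 = 2, p1 = 3;
        const int ne0  = ne00 + p0 + p1;

        // Only the right padding region i0 in [ne0 - p1, ne0) uses the long expression.
        for (int i0 = ne0 - p1; i0 < ne0; i0++) {
            const int kernel_idx  = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
            const int reflect_idx = 2*(ne00 - 1) - (i0 - p0);
            assert(kernel_idx == reflect_idx);
            printf("i0 = %2d -> src index %d\n", i0, kernel_idx);
        }
        return 0;
    }
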
@@ -3880,6 +3927,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_

#undef FA_TYPES

template<typename T>
kernel void kernel_set(
        constant ggml_metal_kargs_set & args,
        device   const char * src0,
        device   const char * src1,
        device         char * dst,
        uint3   tgpig[[threadgroup_position_in_grid]],
        ushort3 tpitg[[thread_position_in_threadgroup]],
        ushort3   ntg[[threads_per_threadgroup]]) {
    const int i13 = tgpig[2];
    const int i12 = tgpig[1];
    const int i11 = tgpig[0];

    const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10;

    const int64_t i3 = n / (args.ne12*args.ne11*args.ne10);
    const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10);
    const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10;

    device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs);

    for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) {
        device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10);
        dst_data[i10] = (T) src[0];
    }
}

typedef decltype(kernel_set<float>) kernel_set_t;

template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set<float>;
template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set<int32_t>;

template<typename T0, typename T1>
kernel void kernel_cpy(
        constant ggml_metal_kargs_cpy & args,

@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "POOL_2D_BACK",
    "UPSCALE",
    "PAD",
    "PAD_REFLECT_1D",
    "ARANGE",
    "TIMESTEP_EMBEDDING",
    "ARGSORT",

@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "OPT_STEP_ADAMW",
};

static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",

@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "pool_2d_back(x)",
    "upscale(x)",
    "pad(x)",
    "pad_reflect_1d(x)",
    "arange(start, stop, step)",
    "timestep_embedding(timesteps, dim, max_period)",
    "argsort(x)",

@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "adamw(x)",
};

static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4097,6 +4099,37 @@ struct ggml_tensor * ggml_pad(
    return result;
}

// ggml_pad_reflect_1d

struct ggml_tensor * ggml_pad_reflect_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   p0,
        int                   p1) {
    GGML_ASSERT(p0 >= 0);
    GGML_ASSERT(p1 >= 0);

    GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded

    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(a->type == GGML_TYPE_F32);

    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
            a->ne[0] + p0 + p1,
            a->ne[1],
            a->ne[2],
            a->ne[3]);

    int32_t params[] = { p0, p1 };
    ggml_set_op_params(result, params, sizeof(params));

    result->op     = GGML_OP_PAD_REFLECT_1D;
    result->src[0] = a;

    return result;
}

// ggml_arange

struct ggml_tensor * ggml_arange(

@@ -1 +1 @@
b903ffe79daf18c0aaacbebe44a7b93a6b8d0982
74d66b63eaf207a24f3e93bb922aba131cbf2906

@@ -2697,6 +2697,33 @@ struct test_pad : public test_case {
    }
};

// GGML_OP_PAD_REFLECT_1D
struct test_pad_reflect_1d : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne_a;
    const int pad_0;
    const int pad_1;

    std::string vars() override {
        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
    }

    test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {512, 34, 2, 1},
            int pad_0 = 10, int pad_1 = 9)
        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data());
        ggml_set_name(a, "a");

        ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1);
        ggml_set_name(out, "out");

        return out;
    }
};

// GGML_OP_ARANGE
struct test_arange : public test_case {
    const ggml_type type;

@@ -3494,6 +3521,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim));
    }

    for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) {
        test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim));
    }

    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
        for (ggml_type type_dst : all_types) {
            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));

@@ -3816,6 +3847,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_pad_reflect_1d());
    test_cases.emplace_back(new test_arange());
    test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());