Merge branch 'master' into concedo_experimental

This commit is contained in:
Concedo 2024-01-03 22:46:49 +08:00
commit d37c94bcd9
8 changed files with 249 additions and 17 deletions

View file

@ -95,6 +95,15 @@ export async function* llama(prompt, params = {}, config = {}) {
break;
}
}
if (result.error) {
result.error = JSON.parse(result.error);
if (result.error.content.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error: ${result.error.content}`);
}
}
if (result.error) {
result.error = JSON.parse(result.error);
console.error(`llama.cpp error: ${result.error.content}`);

View file

@ -10042,14 +10042,19 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
}
return false;
} break;
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_REPEAT:
case GGML_OP_DUP:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
@ -10066,7 +10071,6 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_CONCAT:
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:

View file

@ -87,6 +87,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
GGML_METAL_DECL_KERNEL(get_rows_i32);
GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(group_norm);
GGML_METAL_DECL_KERNEL(norm);
@ -377,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
GGML_METAL_ADD_KERNEL(get_rows_i32);
GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(group_norm);
GGML_METAL_ADD_KERNEL(norm);
@ -499,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
GGML_METAL_DEL_KERNEL(get_rows_i32);
GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(group_norm);
GGML_METAL_DEL_KERNEL(norm);
@ -1978,6 +1981,7 @@ void ggml_metal_graph_compute(
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
default: GGML_ASSERT(false && "not implemented");
}

View file

@ -3829,6 +3829,35 @@ kernel void kernel_get_rows_f16(
}
}
kernel void kernel_get_rows_i32(
device const void * src0,
device const char * src1,
device int32_t * dst,
constant int64_t & ne00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb1,
constant uint64_t & nb2,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg [[threads_per_threadgroup]]) {
const int64_t i10 = tgpig.x;
const int64_t i11 = tgpig.y;
const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
const int64_t i02 = i11;
for (int ind = tiitg; ind < ne00; ind += tptg.x) {
((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
}
}
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
#define BLOCK_SIZE_K 32

166
ggml.c
View file

@ -4766,8 +4766,11 @@ struct ggml_tensor * ggml_get_rows(
}
// TODO: implement non F32 return
//struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
enum ggml_type type = GGML_TYPE_F32;
if (a->type == GGML_TYPE_I32) {
type = a->type;
}
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
result->op = GGML_OP_GET_ROWS;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -6938,14 +6941,165 @@ static void ggml_compute_forward_dup_f32(
}
}
// A simplified version of ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy.
static void ggml_compute_forward_dup_bytes(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
if (src0->type == dst->type &&
ne00 == ne0 &&
nb00 == type_size && nb0 == type_size) {
// copy by rows
const size_t rs = ne00 * type_size;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ir0; i01 < ir1; i01++) {
memcpy(
((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
rs);
}
}
}
return;
}
if (ggml_is_contiguous(dst)) {
size_t id = 0;
char * dst_ptr = (char *) dst->data;
const size_t rs = ne00 * type_size;
if (nb00 == type_size) {
// src0 is contigous on first dimension, copy by rows
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, rs);
id += rs;
}
id += rs * (ne01 - ir1);
}
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
id += rs * ir0;
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
memcpy(dst_ptr + id, src0_ptr, type_size);
id += type_size;
}
}
id += rs * (ne01 - ir1);
}
}
}
return;
}
// dst counters
int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
memcpy(dst_ptr, src0_ptr, type_size);
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
}
static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
if (src0->type == dst->type) {
ggml_compute_forward_dup_bytes(params, src0, dst);
return;
}
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -8404,10 +8558,12 @@ static void ggml_compute_forward_repeat(
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
case GGML_TYPE_I16:
{
ggml_compute_forward_repeat_f16(params, src0, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_repeat_f32(params, src0, dst);
} break;
@ -8550,6 +8706,7 @@ static void ggml_compute_forward_concat(
struct ggml_tensor* dst) {
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_concat_f32(params, src0, src1, dst);
} break;
@ -10674,6 +10831,7 @@ static void ggml_compute_forward_get_rows(
ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
} break;

View file

@ -27,7 +27,7 @@ echo "Syncing ggml changes since commit $lc"
cd $SRC_GGML
git log --oneline $lc..HEAD
git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
git log --oneline $lc..HEAD --reverse | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
if [ ! -s $SRC_LLAMA/ggml-commits ]; then
rm -v $SRC_LLAMA/ggml-commits
@ -87,7 +87,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-impl.h -> ggml-impl.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-metal.metal -> ggml-metal.metal
# src/ggml-mpi.h -> ggml-mpi.h
# src/ggml-mpi.c -> ggml-mpi.c
# src/ggml-opencl.cpp -> ggml-opencl.cpp
@ -114,7 +113,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \

View file

@ -1 +1 @@
df098ea908764cba4a4889a1cbe7b026b2d31a14
3fd01e00e40583ccd4b393a7c6502d6a4455a1d5

View file

@ -58,6 +58,9 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
int64_t hist[16];
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else {
GGML_ASSERT(false);
}
@ -87,8 +90,13 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tv.push_back(*(float *) &buf[i]);
} else if (t->type == GGML_TYPE_I32) {
tv.push_back((float)*(int32_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I16) {
tv.push_back((float)*(int16_t *) &buf[i]);
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt.to_float(&buf[i], vq.data(), bs);
std::vector<float> vq(ggml_blck_size(t->type));
tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ASSERT(false);
@ -661,17 +669,26 @@ struct test_repeat : public test_case {
struct test_dup : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const std::array<int64_t, 4> permute;
bool _use_permute;
std::string vars() override {
return VARS_TO_STR2(type, ne);
std::string v = VARS_TO_STR2(type, ne);
if (_use_permute) v += "," + VAR_TO_STR(permute);
return v;
}
test_dup(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1})
: type(type), ne(ne) {}
std::array<int64_t, 4> ne = {10, 10, 10, 1},
std::array<int64_t, 4> permute = {0, 0, 0, 0})
: type(type), ne(ne), permute(permute),
_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
if (_use_permute) {
src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
}
ggml_tensor * out = ggml_dup(ctx, src);
return out;
}
@ -1450,14 +1467,26 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
}
}
for (int b : {1, 7}) {
for (bool v : {false, true}) {
test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
}
}
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
test_cases.emplace_back(new test_dup());
test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
for (ggml_type type : all_types) {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
@ -1565,7 +1594,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_alibi());
test_cases.emplace_back(new test_im2col());
test_cases.emplace_back(new test_concat());
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));