Merge branch 'master' into concedo_experimental

# Conflicts:
#	tests/test-grad0.c
#	tests/test-opt.c
This commit is contained in:
Concedo 2023-07-24 20:51:28 +08:00
commit 8a9b40840b
5 changed files with 472 additions and 487 deletions

View file

@ -3954,18 +3954,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
} }
func = ggml_cuda_mul; func = ggml_cuda_mul;
break; break;
case GGML_OP_GELU: case GGML_OP_UNARY:
if (!any_on_device) { switch (ggml_get_unary_op(tensor)) {
return false; case GGML_UNARY_OP_GELU:
} if (!any_on_device) {
func = ggml_cuda_gelu; return false;
break; }
case GGML_OP_SILU: func = ggml_cuda_gelu;
if (!any_on_device) { break;
return false; case GGML_UNARY_OP_SILU:
} if (!any_on_device) {
func = ggml_cuda_silu; return false;
break; }
func = ggml_cuda_silu;
break;
default:
return false;
} break;
case GGML_OP_NORM: case GGML_OP_NORM:
if (!any_on_device) { if (!any_on_device) {
return false; return false;

View file

@ -519,48 +519,56 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_SILU: case GGML_OP_UNARY:
{ switch (ggml_get_unary_op(gf->nodes[i])) {
if (encoder == nil) { case GGML_UNARY_OP_SILU:
encoder = [command_buffer computeCommandEncoder]; {
} if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setComputePipelineState:ctx->pipeline_silu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_RELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_GELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
} break; } break;
case GGML_OP_RELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_relu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_GELU:
{
if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder];
}
[encoder setComputePipelineState:ctx->pipeline_gelu];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
if (encoder == nil) { if (encoder == nil) {
@ -979,8 +987,10 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
default: default:
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); {
GGML_ASSERT(false); fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
} }
} }

775
ggml.c

File diff suppressed because it is too large Load diff

60
ggml.h
View file

@ -329,16 +329,6 @@ extern "C" {
GGML_OP_ARGMAX, GGML_OP_ARGMAX,
GGML_OP_REPEAT, GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK, GGML_OP_REPEAT_BACK,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,
GGML_OP_STEP,
GGML_OP_TANH,
GGML_OP_ELU,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_GELU_QUICK,
GGML_OP_SILU,
GGML_OP_SILU_BACK, GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM, GGML_OP_RMS_NORM,
@ -377,6 +367,8 @@ extern "C" {
GGML_OP_WIN_PART, GGML_OP_WIN_PART,
GGML_OP_WIN_UNPART, GGML_OP_WIN_UNPART,
GGML_OP_UNARY,
GGML_OP_MAP_UNARY, GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY, GGML_OP_MAP_BINARY,
@ -390,6 +382,18 @@ extern "C" {
GGML_OP_COUNT, GGML_OP_COUNT,
}; };
enum ggml_unary_op {
GGML_UNARY_OP_ABS,
GGML_UNARY_OP_SGN,
GGML_UNARY_OP_NEG,
GGML_UNARY_OP_STEP,
GGML_UNARY_OP_TANH,
GGML_UNARY_OP_ELU,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
};
// ggml object // ggml object
struct ggml_object { struct ggml_object {
@ -534,6 +538,7 @@ extern "C" {
GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@ -557,6 +562,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@ -616,9 +622,11 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
// //
// operations on tensors with backpropagation // operations on tensors with backpropagation
@ -628,6 +636,11 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_dup_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_add( GGML_API struct ggml_tensor * ggml_add(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -951,11 +964,22 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// a -> b, in-place, return view(b)
GGML_API struct ggml_tensor * ggml_cpy_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// make contiguous // make contiguous
GGML_API struct ggml_tensor * ggml_cont( GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// make contiguous, in-place
GGML_API struct ggml_tensor * ggml_cont_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return view(a), b specifies the new shape // return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape( GGML_API struct ggml_tensor * ggml_reshape(
@ -1267,6 +1291,16 @@ extern "C" {
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
GGML_API struct ggml_tensor * ggml_unary(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
GGML_API struct ggml_tensor * ggml_unary_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
GGML_API struct ggml_tensor * ggml_map_unary_f32( GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,

View file

@ -3297,8 +3297,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#else #else
int8_t aux8[QK_K];
uint8_t aux8[QK_K];
int16_t aux16[16]; int16_t aux16[16];
float sums [8]; float sums [8];
memset(sums, 0, 8*sizeof(float)); memset(sums, 0, 8*sizeof(float));
@ -3308,7 +3307,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict q4 = x[i].qs;
const uint8_t * restrict hm = x[i].qh; const uint8_t * restrict hm = x[i].qh;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
uint8_t * restrict a = aux8; int8_t * restrict a = aux8;
for (int l = 0; l < 32; ++l) { for (int l = 0; l < 32; ++l) {
a[l+ 0] = q4[l] & 0xF; a[l+ 0] = q4[l] & 0xF;
a[l+32] = q4[l] >> 4; a[l+32] = q4[l] >> 4;