Merge branch 'master' into compilade/batch-splits

commit 702e1995a1 by Francis Couture-Harpin, 2024-08-14 20:46:28 -04:00
134 changed files with 6827 additions and 2166 deletions

View file

@ -129,13 +129,13 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)

View file

@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family

View file

@ -244,6 +244,8 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGML_ROPE_TYPE_NEOX 2
#define GGUF_MAGIC "GGUF"
#define GGUF_VERSION 3
@ -1140,16 +1142,17 @@ extern "C" {
// group normalize along ne0*ne1*n_groups
// used in stable-diffusion
// TODO: eps is hardcoded to 1e-6 for now
GGML_API struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups);
int n_groups,
float eps);
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups);
int n_groups,
float eps);
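Call sites must now pass eps explicitly instead of relying on the value hardcoded inside the op; a minimal call-site sketch (ctx, a, and the group count are placeholders, 1e-6f is the value that was previously hardcoded):

    // sketch: pass the formerly hardcoded default explicitly
    struct ggml_tensor * out = ggml_group_norm(ctx, a, /*n_groups=*/32, /*eps=*/1e-6f);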
// a - x
// b - dy
@ -1452,11 +1455,10 @@ extern "C" {
struct ggml_tensor * b);
// rotary position embedding
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
// if mode & 2 == 1, GPT-NeoX style
// if (mode & 1) - skip n_past elements (NOT SUPPORTED)
// if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
//
// b is an int32 vector with size a->ne[2], it contains the positions
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1473,6 +1475,7 @@ extern "C" {
int mode);
// custom RoPE
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,

View file

@ -602,6 +602,10 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
endif()
if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()
if (GGML_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE)
endif()

View file

@ -16,6 +16,8 @@
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#elif defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define UNUSED GGML_UNUSED

View file

@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
}
// an async copy would normally happen after all the queued operations on both backends are completed
// sync src, set_async dst
if (ggml_backend_buffer_is_host(src->buffer)) {
ggml_backend_synchronize(backend_src);
ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
} else {
ggml_backend_synchronize(backend_src);
ggml_backend_tensor_copy(src, dst);
ggml_backend_synchronize(backend_dst);
}
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
ggml_backend_synchronize(backend_src);
ggml_backend_synchronize(backend_dst);
ggml_backend_tensor_copy(src, dst);
}
// events
@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
}
}
}

View file

@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
const void* src,
void* dst) {
GGML_ASSERT(tensor->op == GGML_OP_NONE);
int64_t n_elems = ggml_nelements(tensor);
int64_t groups = n_elems / QK4_0;
@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
*/
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
const ggml_tensor* tensor, void* src, void* dst) {
GGML_ASSERT(tensor->op == GGML_OP_NONE);
int64_t n_elems = ggml_nelements(tensor);
int64_t groups = n_elems / QK4_0;
@ -898,11 +896,10 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
* @param size Size of the data to be copied, in bytes.
*/
GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data,
ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
size_t offset, size_t size) {
// GGML_ASSERT(size == ggml_nbytes(tensor));
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
ggml_backend_cann_buffer_context *ctx =
(ggml_backend_cann_buffer_context *)buffer->context;
ggml_cann_set_device(ctx->device);
// TODO: refer to cann (#6017); it uses the thread's default stream.
@ -910,22 +907,21 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
// Why aclrtSynchronizeDevice?
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset,
size, ACL_MEMCPY_HOST_TO_DEVICE));
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
} else {
void* transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, (const char*)data + offset,
transform_buffer);
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void* check_buffer = malloc(size);
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) ==
0);
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size,
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
free(transform_buffer);
}
@ -947,21 +943,20 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
size_t offset, size_t size) {
GGML_ASSERT(size == ggml_nbytes(tensor));
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
ggml_cann_set_device(ctx->device);
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size,
ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST));
} else {
void* transform_buffer = malloc(size);
ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size,
ACL_CHECK(aclrtMemcpy(transform_buffer, size,
(char*)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST));
ggml_backend_cann_transform_back(tensor, transform_buffer,
(char*)data + offset);
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
}
@ -1450,42 +1445,41 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
* @param size Size of the data to copy in bytes.
*/
GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
ggml_tensor* tensor,
const void* data,
ggml_tensor *tensor,
const void *data,
size_t offset,
size_t size) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_backend_cann_context *cann_ctx =
(ggml_backend_cann_context *)backend->context;
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpyAsync(
tensor->data, size, (const char*)data + offset, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data,
size, ACL_MEMCPY_HOST_TO_DEVICE,
cann_ctx->stream()));
} else {
void* transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, (const char*)data + offset,
transform_buffer);
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void* check_buffer = malloc(size);
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size));
GGML_ASSERT(memcmp(data, check_buffer, size));
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE,
cann_ctx->stream()));
ACL_CHECK(aclrtMemcpyAsync(
(char *)tensor->data + offset, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
free(transform_buffer);
}
}
GGML_CALL static void ggml_backend_cann_get_tensor_async(
ggml_backend_t backend, const ggml_tensor* tensor, void* data,
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
size_t offset, size_t size) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_backend_cann_context *cann_ctx =
(ggml_backend_cann_context *)backend->context;
ggml_backend_buffer_t buf =
tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@ -1493,17 +1487,16 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
"unsupported buffer type");
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data,
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset,
size, ACL_MEMCPY_DEVICE_TO_HOST,
cann_ctx->stream()));
} else {
void* transform_buffer = malloc(size);
ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size,
ACL_MEMCPY_DEVICE_TO_HOST,
cann_ctx->stream()));
void *transform_buffer = malloc(size);
ACL_CHECK(aclrtMemcpyAsync(
transform_buffer, size, (char *)tensor->data + offset, size,
ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream()));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
ggml_backend_cann_transform_back(tensor, transform_buffer,
(char*)data + offset);
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
}
@ -1666,10 +1659,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
}
case GGML_OP_MUL_MAT: {
switch (op->src[0]->type) {
// case GGML_TYPE_Q4_0:
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q8_0:
// TODO: fix me
// Current groupsize should not be greater than k-1 in
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
case GGML_TYPE_Q4_0:
return true;
default:
return false;
@ -1694,6 +1690,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
return true;
default:
return false;

View file

@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
return ACL_INT16;
case GGML_TYPE_I32:
return ACL_INT32;
case GGML_TYPE_Q4_0:
return ACL_INT4;
case GGML_TYPE_Q8_0:
return ACL_INT8;
default:
return ACL_DT_UNDEFINED;
}
@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
return false;
}
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format,
size_t offset) {
int64_t tmp_ne[GGML_MAX_DIMS * 2];
int64_t tmp_stride[GGML_MAX_DIMS * 2];
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
for (int i = 0; i < dims; i++) {
tmp_stride[i] = nb[i] / type_size;
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
return acl_tensor;
}
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
const ggml_tensor* src1,
int64_t* bcast_src0_ne,

View file

@ -23,6 +23,9 @@
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H
#include <algorithm>
#include <cstring>
#include <aclnn/aclnn_base.h>
#include "common.h"
@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
size_t offset = 0);
/**
* @brief Creates an ACL tensor from provided parameters.
* @brief Template for creating an ACL tensor from the provided parameters.
* The template parameter TYPE should be size_t or float.
*
* @details This function creates an ACL tensor using the provided data pointer,
* data type, dimensions, strides, format, offset, and additional parameters.
@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
template<typename TYPE>
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format = ACL_FORMAT_ND,
size_t offset = 0);
TYPE type_size, int64_t* ne, TYPE* nb,
int64_t dims,
aclFormat format = ACL_FORMAT_ND,
size_t offset = 0) {
int64_t tmp_ne[GGML_MAX_DIMS * 2];
int64_t tmp_stride[GGML_MAX_DIMS * 2];
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
for (int i = 0; i < dims; i++) {
tmp_stride[i] = nb[i] / type_size;
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
return acl_tensor;
}
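Making type_size and nb share the template parameter allows float strides, which the Q4_0 matrix multiplication path relies on because a 4-bit element occupies half a byte; a hedged usage sketch (the helper name, data pointer, and shape are placeholders):

    // sketch: float strides can express the 0.5-byte element size of Q4_0 weights
    static aclTensor* make_q4_0_weight_view(void* data, int64_t rows, int64_t cols) {
        float   elem_size = sizeof(uint8_t) / 2.0f;         // 0.5 bytes per 4-bit element
        int64_t ne[2]     = {cols, rows};                   // placeholder shape, ggml-style order
        float   nb[2]     = {elem_size, elem_size * cols};  // fractional byte strides
        return ggml_cann_create_tensor(data, ACL_INT4, elem_size, ne, nb, 2);
    }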
/**
* @brief Checks if tensors require broadcasting based on their shapes.

View file

@ -464,9 +464,11 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
const float eps = 1e-6f; // TODO: make this a parameter
int n_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
@ -910,6 +912,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_Q4_0) {
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
24, ctx.stream(), src->data, dst->data,
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_F16) {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@ -971,6 +980,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_Q4_0) {
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
24, ctx.stream(), src->data, dst->data,
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
((ggml_tensor*)dst->extra)->ne);
return;
}
if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@ -2463,21 +2479,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
ggml_tensor* dst) {
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
// is regarded as batch. The weight needs to be transposed.
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
size_t weight_elem_size = sizeof(uint8_t);
size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
}
else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
}
else {
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
}
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
// size of one matrix is element_size * height * width.
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
// scale stored at the end of weight. Also need transpose.
GGML_ASSERT(QK4_0 == QK8_0);
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@ -2541,8 +2569,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
input_elem_size, input_ne, input_nb, 2);
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride, ACL_INT8,
weight_elem_size, weight_ne, weight_nb, 2);
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2);
@ -2596,11 +2625,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
case GGML_TYPE_F16:
ggml_cann_mat_mul_fp(ctx, dst);
break;
// case GGML_TYPE_Q4_0:
// ggml_cann_mul_mat_q4_0(ctx, dst);
// break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
ggml_cann_mul_mat_q8_0(ctx, dst);
ggml_cann_mul_mat_quant(ctx, dst, type);
break;
default:
GGML_ABORT("fatal error");
@ -2854,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
beta_slow, corr_dims);
const bool is_neox = mode & 2;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
// init cos/sin cache
ggml_cann_pool_alloc sin_allocator(

View file

@ -9,6 +9,7 @@ file(GLOB SRC_FILES
get_row_q8_0.cpp
quantize_f32_q8_0.cpp
quantize_f16_q8_0.cpp
quantize_float_to_q4_0.cpp
dup.cpp
)
@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

View file

@ -8,6 +8,8 @@
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"

View file

@ -0,0 +1,278 @@
#include "kernel_operator.h"
using namespace AscendC;
#define BUFFER_NUM 2
#define Group_Size 32
template <typename SRC_T>
class QUANTIZE_FLOAT_TO_Q4_0 {
public:
__aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *output_ne_ub) {
// TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
// permute=[0,0,0,0]):
// [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
// input stride of data elements
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
output_ne[i] = output_ne_ub[i];
}
// output stride of data elements
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
}
// scales are saved one by one after the data: [group1_scale, group2_scale, ...]
scale_ne = input_ne;
scale_stride[0] = 1;
scale_stride[1] = input_ne[0] / Group_Size;
for (int i = 2; i < 4; i++) {
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
// split input tensor by rows.
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
dr = nr / op_block_num;
uint64_t tails = nr % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
group_size_in_row = scale_stride[1];
int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
output_ne[3] * sizeof(uint8_t) / 2;
input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
group_size_in_row *
sizeof(half)));
pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
pipe.InitBuffer(output_queue, BUFFER_NUM,
Group_Size * sizeof(int8_t) / 2);
pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
DataCopy(input_local, input_gm[offset], Group_Size);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
// reinterpret-cast Group_Size (32) int4b_t values to Group_Size / 2 int8_t values,
// and use DataCopyPad to avoid the 32-bit alignment requirement.
LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
LocalTensor<int8_t> output_int8_local =
output_local.ReinterpretCast<int8_t>();
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
output_queue.FreeTensor(output_local);
}
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
LocalTensor<float> input_local) {
DataCopy(cast_local, input_local, Group_Size);
}
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
LocalTensor<half> input_local) {
Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
}
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
const int64_t i1 =
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
const int64_t input_offset = i1 * input_stride[1] +
i2 * input_stride[2] +
i3 * input_stride[3] + Group_Size * group;
// output_offset is stride for output_gm which datatype is int8_t and
// divided by 2 is needed for int4b_t.
const int64_t output_offset = (i1 * output_stride[1] +
i2 * output_stride[2] +
i3 * output_stride[3] +
Group_Size * group) / 2;
copy_in(input_offset);
LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
LocalTensor<float> min_local = min_queue.AllocTensor<float>();
LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
LocalTensor<half> half_local = half_queue.AllocTensor<half>();
input_to_cast(cast_local, input_local);
ReduceMax(max_local, cast_local, work_local, Group_Size);
ReduceMin(min_local, cast_local, work_local, Group_Size);
const float max_value = max_local.GetValue(0);
const float min_value = min_local.GetValue(0);
float d = max_value;
if (min_value < 0 && (-1 * min_value) > max_value) {
d = min_value;
}
d = d / (-8);
if (d != 0) {
Muls(cast_local, cast_local, 1.0f / d, Group_Size);
}
// range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
float scalar = 8.5f;
Adds(cast_local, cast_local, scalar, Group_Size);
Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
scalar = 15.0f;
Mins(cast_local, cast_local, scalar, Group_Size);
scalar = -8.0f;
Adds(cast_local, cast_local, scalar, Group_Size);
// float->half->int4b
Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
output_queue.EnQue(output_local);
copy_out(output_offset);
input_queue.FreeTensor(input_local);
work_queue.FreeTensor(work_local);
max_queue.FreeTensor(max_local);
min_queue.FreeTensor(min_local);
int8_queue.FreeTensor(int8_local);
half_queue.FreeTensor(half_local);
cast_queue.FreeTensor(cast_local);
return (half)d;
}
__aicore__ inline void calculate() {
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
uint32_t scale_local_offset = 0;
uint32_t scale_global_offset = 0;
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
half scale = calculate_group(i, j);
scale_local.SetValue(scale_local_offset++, scale);
// Copy Group_Size/2 length data each time.
if (scale_local_offset == Group_Size / 2) {
scale_local_offset = 0;
// TODO: OPTIMIZE ME
pipe_barrier(PIPE_ALL);
DataCopy(scale_gm[scale_global_offset], scale_local,
Group_Size / 2);
pipe_barrier(PIPE_ALL);
scale_global_offset += Group_Size / 2;
}
}
}
if (scale_local_offset != 0) {
pipe_barrier(PIPE_ALL);
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
DataCopyPad(scale_gm[scale_global_offset], scale_local,
dataCopyParams);
pipe_barrier(PIPE_ALL);
}
scale_queue.FreeTensor(scale_local);
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t *scale_ne;
size_t scale_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t group_size_in_row;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<SRC_T> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int8_t> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
QUANTIZE_FLOAT_TO_Q4_0<half> op;
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
QUANTIZE_FLOAT_TO_Q4_0<float> op;
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
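The per-group math in calculate_group() can be mirrored by a host-side reference; a hedged C sketch under the assumption of 32-element groups (the function name, the nibble packing order, and returning the scale as float are illustrative):

    #include <stdint.h>

    // host-side sketch of one 32-element group, mirroring calculate_group() above
    static void quantize_group_q4_0_ref(const float x[32], int8_t packed[16], float * scale) {
        float max_v = x[0], min_v = x[0];
        for (int i = 1; i < 32; i++) {
            if (x[i] > max_v) max_v = x[i];
            if (x[i] < min_v) min_v = x[i];
        }
        // pick the value with the largest magnitude and map it to -8
        float d = (min_v < 0.0f && -min_v > max_v) ? min_v : max_v;
        d /= -8.0f;
        for (int i = 0; i < 32; i += 2) {
            // range: [-8,8] -> [0.5,16.5] -> floor -> [0,15] -> [-8,7]
            int lo = 0, hi = 0;
            if (d != 0.0f) {
                lo = (int)(x[i + 0] / d + 8.5f); if (lo > 15) { lo = 15; } lo -= 8;
                hi = (int)(x[i + 1] / d + 8.5f); if (hi > 15) { hi = 15; } hi -= 8;
            }
            packed[i / 2] = (int8_t)((lo & 0x0F) | ((hi & 0x0F) << 4)); // nibble order illustrative
        }
        *scale = d; // the kernel stores this as half, after the quantized data
    }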

View file

@ -1501,7 +1501,7 @@ static void ggml_cuda_op_mul_mat(
}
// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@ -2358,33 +2358,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
}
GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (!ggml_backend_buffer_is_cuda(src->buffer)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
// device -> device
// device -> device copy
ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}
if (backend_src != backend_dst) {
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
@ -2393,7 +2395,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
#endif
}
// record event on src stream
// record event on src stream after the copy
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
@ -2405,7 +2407,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
} else {
// src and dst are on the same backend
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
}
return true;
}
@ -2742,11 +2744,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_MUL_MAT_ID:
{
struct ggml_tensor * a = op->src[0];
if (op->op == GGML_OP_MUL_MAT) {
struct ggml_tensor * b = op->src[1];
if (a->ne[3] != b->ne[3]) {
return false;
}
struct ggml_tensor * b = op->src[1];
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
return false;
}
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
return false;
}
switch (a->type) {
case GGML_TYPE_F32:
@ -2877,7 +2880,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
return true;
case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
#else
if (op->src[0]->ne[0] == 128) {
return true;

View file

@ -142,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
}
}
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
static const float eps = 1e-6f;
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
if (group_size < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@ -196,8 +195,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int num_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
}
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View file

@ -226,7 +226,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
const bool is_neox = mode & 2;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const int32_t * pos = (const int32_t *) src1_d;

View file

@ -210,7 +210,7 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_COUNT
};
struct ggml_metal_context {
struct ggml_backend_metal_context {
int n_cb;
id<MTLDevice> device;
@ -224,6 +224,10 @@ struct ggml_metal_context {
bool support_simdgroup_mm;
bool should_capture_next_compute;
// abort ggml_metal_graph_compute if callback returns true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// MSL code
@ -289,7 +293,7 @@ static void * ggml_metal_host_malloc(size_t n) {
return data;
}
static struct ggml_metal_context * ggml_metal_init(int n_cb) {
static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
@ -306,7 +310,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
ctx->device = device;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
ctx->queue = [ctx->device newCommandQueue];
@ -668,7 +672,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
return ctx;
}
static void ggml_metal_free(struct ggml_metal_context * ctx) {
static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@ -734,7 +738,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
return nil;
}
static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
for (size_t i = 0, n = 3; i < n; ++i) {
if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
return false;
@ -845,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}
static enum ggml_status ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_backend_metal_context * ctx,
struct ggml_cgraph * gf) {
@autoreleasepool {
@ -878,8 +882,11 @@ static enum ggml_status ggml_metal_graph_compute(
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
command_buffer_builder[cb_idx] = command_buffer;
// enqueue the command buffers in order to specify their execution order
[command_buffer enqueue];
// always enqueue the first two command buffers
// enqueue all of the command buffers if we don't need to abort
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[command_buffer enqueue];
}
}
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@ -2229,10 +2236,8 @@ static enum ggml_status ggml_metal_graph_compute(
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous(src0));
//float eps;
//memcpy(&eps, dst->op_params, sizeof(float));
const float eps = 1e-6f; // TODO: temporarily hardcoded
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
const int32_t n_groups = ((int32_t *) dst->op_params)[0];
@ -2308,7 +2313,7 @@ static enum ggml_status ggml_metal_graph_compute(
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
const bool is_neox = mode & 2;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
id<MTLComputePipelineState> pipeline = nil;
@ -2829,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder endEncoding];
[command_buffer commit];
if (cb_idx < 2 || ctx->abort_callback == NULL) {
[command_buffer commit];
}
});
// Wait for completion and check status of each command buffer
@ -2849,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute(
return GGML_STATUS_FAILED;
}
id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
if (!next_buffer) {
continue;
}
bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
if (next_queued) {
continue;
}
if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
return GGML_STATUS_ABORTED;
}
[next_buffer commit];
}
if (should_capture) {
@ -3152,7 +3176,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
}
GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ggml_metal_free(ctx);
free(backend);
}
@ -3164,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
}
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
return ggml_metal_graph_compute(metal_ctx, cgraph);
}
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
return ggml_metal_supports_op(metal_ctx, op);
}
@ -3215,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
}
ggml_backend_t ggml_backend_metal_init(void) {
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
if (ctx == NULL) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
@ -3239,15 +3263,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) {
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
}
void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = user_data;
}
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}
@ -3255,7 +3288,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
GGML_ASSERT(ggml_backend_is_metal(backend));
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
ctx->should_capture_next_compute = true;
}
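The abort callback lets an application cancel a long graph compute between command buffers; a minimal usage sketch, assuming application-owned state (the flag, the callback, and the helper are illustrative):

    // sketch: request cancellation of ggml_metal_graph_compute via a shared flag
    #include <stdatomic.h>
    #include "ggml-metal.h"

    static atomic_bool g_abort_requested;          // illustrative application state

    static bool my_abort_cb(void * user_data) {
        (void) user_data;
        return atomic_load(&g_abort_requested);    // returning true aborts the compute
    }

    static void install_abort_handler(ggml_backend_t backend) {
        // backend must come from ggml_backend_metal_init()
        ggml_backend_metal_set_abort_callback(backend, my_abort_cb, NULL);
    }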

View file

@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
return nullptr;
}
if (inet_addr(host) == INADDR_NONE) {
fprintf(stderr, "Invalid host address: %s\n", host);
return nullptr;
}
struct sockaddr_in serv_addr;
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = inet_addr(host);
@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
return nullptr;
}
// require that the tensor data does not go beyond the buffer end
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
result->op = (ggml_op) tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
result->op_params[i] = tensor->op_params[i];
@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
uint64_t offset;
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
ggml_backend_tensor_set(tensor, data, offset, size);
ggml_free(ctx);
@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
// output serialization format: | data (size bytes) |
output.resize(size, 0);
ggml_backend_tensor_get(tensor, output.data(), offset, size);

View file

@ -874,7 +874,7 @@ namespace dpct
inline std::string get_preferred_gpu_platform_name() {
std::string result;
std::string filter = "level-zero";
std::string filter = "";
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
if (env) {
if (std::strstr(env, "level_zero")) {
@ -892,11 +892,24 @@ namespace dpct
else {
throw std::runtime_error("invalid device filter: " + std::string(env));
}
} else {
auto default_device = sycl::device(sycl::default_selector_v);
auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
filter = "level-zero";
}
else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
filter = "cuda";
}
else if (std::strstr(default_platform_name.c_str(), "HIP")) {
filter = "hip";
}
}
auto plaform_list = sycl::platform::get_platforms();
auto platform_list = sycl::platform::get_platforms();
for (const auto& platform : plaform_list) {
for (const auto& platform : platform_list) {
auto devices = platform.get_devices();
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
return d.is_gpu();

View file

@ -225,9 +225,8 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
}
static void group_norm_f32_sycl(const float* x, float* dst,
const int num_groups, const int group_size,
const int num_groups, const float eps, const int group_size,
const int ne_elements, queue_ptr stream, int device) {
static const float eps = 1e-6f;
if (group_size < 1024) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
stream->submit([&](sycl::handler& cgh) {
@ -343,8 +342,12 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
GGML_ASSERT(dst->type == GGML_TYPE_F32);
int num_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device);
(void)src1;
(void)dst;

View file

@ -226,7 +226,7 @@ void ggml_sycl_op_rope(
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
const bool is_neox = mode & 2;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const int32_t * pos = (const int32_t *) src1_dd;

File diff suppressed because it is too large

View file

@ -56,6 +56,9 @@ int ggml_sve_cnt_b = 0;
// disable POSIX deprecation warnings
// these functions are never going away, anyway
#pragma warning(disable: 4996)
// unreachable code because of multiple instances of code after GGML_ABORT
#pragma warning(disable: 4702)
#endif
#if defined(_WIN32)
@ -2312,7 +2315,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@ -3721,7 +3724,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_tensor * view_src,
size_t view_offs) {
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
// find the base tensor and absolute offset
if (view_src != NULL && view_src->view_src != NULL) {
@ -5374,6 +5378,7 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups,
float eps,
bool inplace) {
bool is_node = false;
@ -5384,7 +5389,8 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
result->op_params[0] = n_groups;
ggml_set_op_params_i32(result, 0, n_groups);
ggml_set_op_params_f32(result, 1, eps);
result->op = GGML_OP_GROUP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -5396,15 +5402,17 @@ static struct ggml_tensor * ggml_group_norm_impl(
struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups) {
return ggml_group_norm_impl(ctx, a, n_groups, false);
int n_groups,
float eps) {
return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
}
struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups) {
return ggml_group_norm_impl(ctx, a, n_groups, true);
int n_groups,
float eps) {
return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
}
// ggml_mul_mat
@ -12083,10 +12091,11 @@ static void ggml_compute_forward_group_norm_f32(
GGML_TENSOR_UNARY_OP_LOCALS
const float eps = 1e-6f; // TODO: make this a parameter
// TODO: optimize
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
int n_channels = src0->ne[2];
int n_groups = dst->op_params[0];
int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@ -14073,7 +14082,7 @@ static void ggml_compute_forward_rope_f32(
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & 2;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const float * freq_factors = NULL;
if (src2 != NULL) {
@ -14198,7 +14207,7 @@ static void ggml_compute_forward_rope_f16(
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & 2;
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const float * freq_factors = NULL;
if (src2 != NULL) {
@ -21027,7 +21036,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
(int64_t) info->ne[2] *
(int64_t) info->ne[3];
if (ne % ggml_blck_size(info->type) != 0) {
if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
fclose(file);

View file

@ -11,7 +11,7 @@ void main() {
const uint i2 = gl_WorkGroupID.y;
const uint i1 = gl_WorkGroupID.x;
const bool is_neox = (pcs.mode & 2) != 0;
const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
float corr_dims[2];
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

View file

@ -11,7 +11,7 @@ void main() {
const uint i2 = gl_WorkGroupID.y;
const uint i1 = gl_WorkGroupID.x;
const bool is_neox = (pcs.mode & 2) != 0;
const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
float corr_dims[2];
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

View file

@ -1,5 +1,7 @@
#include "common.comp"
#define GGML_ROPE_TYPE_NEOX 2
// TODO: use a local size of 32 or more (Metal uses 1024)
layout(local_size_x = 1) in;

View file

@ -1,5 +1,7 @@
find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)

View file

@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
}

View file

@ -4,10 +4,12 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}

View file

@ -0,0 +1,39 @@
#version 450
#include "types.comp"
#include "generic_binary_head.comp"
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
const int dim = p.param3;
if (idx >= p.ne) {
return;
}
const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
const uint i2_offset = i2*p.ne21*p.ne20;
const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
uint o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
#else
if (is_src0) {
data_d[p.d_offset + dst_idx] = data_a[src0_idx];
} else {
data_d[p.d_offset + dst_idx] = data_b[src1_idx];
}
#endif
}

View file

@ -4,13 +4,15 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
#else
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
#endif
}

View file

@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
}

View file

@@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;

View file

@@ -0,0 +1,23 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const float GELU_QUICK_COEF = -1.702f;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
const float x = float(data_a[i]);
data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
}

View file

@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
uint d_offset;
float param1; float param2;
float param1; float param2; int param3;
} p;
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
@@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}
uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;

View file

@@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
uint get_idx() {
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}
uint src0_idx(uint idx) {
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
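The get_idx() helper added to both generic headers above flattens a 3D dispatch back into a linear element index: with local_size_x = 512, one step in gl_GlobalInvocationID.y covers 512 elements and one step in z covers 512 * 512 = 262144. A sketch of how a host could size such a dispatch so the formula covers ne elements without gaps; this is an illustration under those assumptions, not the commit's host code:
#include <algorithm>
#include <cstdint>

struct dispatch_3d { uint32_t x, y, z; };

// One workgroup of 512 invocations covers 512 consecutive elements, so only the
// y and z dimensions need to grow; the shader's `if (idx >= p.ne) return;`
// discards the overshoot in the last slice.
static dispatch_3d split_elements(uint64_t ne) {
    dispatch_3d d;
    d.x = 1;                                                     // local_size_x = 512 handles x
    d.y = (uint32_t) std::min<uint64_t>((ne + 511) / 512, 512);  // up to 512 rows of 512 elements
    d.z = (uint32_t) ((ne + 262143) / 262144);                   // 262144 = 512 * 512 elements per z step
    return d;
}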

View file

@@ -0,0 +1,66 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared float tmp[BLOCK_SIZE];
void main() {
const uint group_size = p.KX;
const float eps = p.param1;
const uint tid = gl_LocalInvocationID.x;
const uint start = gl_WorkGroupID.x * group_size + tid;
const uint end = start + group_size;
tmp[tid] = 0.0f;
// Calculate mean
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
tmp[tid] += float(data_a[col]);
}
// sum up partial sums and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier();
}
const float mean = tmp[0] / group_size;
barrier();
tmp[tid] = 0.0f;
// Calculate variance
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
const float xi = float(data_a[col]) - mean;
data_d[col] = D_TYPE(xi);
tmp[tid] += xi * xi;
}
// sum up partial sums and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
barrier();
}
const float variance = tmp[0] / group_size;
const float scale = inversesqrt(variance + eps);
[[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
data_d[col] *= D_TYPE(scale);
}
}
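group_norm.comp above normalizes one group per workgroup with a two-pass shared-memory reduction: first the mean, then the variance of the mean-centred values (which are written to data_d as they are produced), and finally a scale by inversesqrt(variance + eps), with eps arriving in p.param1. A plain scalar reference of the same computation for a single group; a sketch for checking the shader, not part of the commit:
#include <cmath>
#include <vector>

// Normalizes one group in place: x <- (x - mean) / sqrt(variance + eps).
static void group_norm_ref(std::vector<float> & x, float eps) {
    double mean = 0.0;
    for (float v : x) mean += v;
    mean /= (double) x.size();

    double variance = 0.0;
    for (float & v : x) {
        v = (float) ((double) v - mean);   // center, like the shader's intermediate write
        variance += (double) v * v;
    }
    variance /= (double) x.size();

    const float scale = 1.0f / std::sqrt((float) variance + eps);
    for (float & v : x) v *= scale;
}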

View file

@@ -0,0 +1,57 @@
#version 450
#extension GL_EXT_shader_16bit_storage : require
layout (push_constant) uniform parameter
{
uint batch_offset; uint offset_delta;
uint IC;
uint IW; uint IH;
uint OW; uint OH;
uint KW; uint KH;
uint pelements;
uint CHW;
int s0; int s1;
int p0; int p1;
int d0; int d1;
} p;
#include "types.comp"
#define BLOCK_SIZE 256
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
if (i >= p.pelements) {
return;
}
const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
const uint kx = i / ksize;
const uint kd = kx * ksize;
const uint ky = (i - kd) / p.OW;
const uint ix = i % p.OW;
const uint oh = gl_GlobalInvocationID.y;
const uint batch = gl_GlobalInvocationID.z / p.IC;
const uint ic = gl_GlobalInvocationID.z % p.IC;
const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
const uint offset_dst =
((batch * p.OH + oh) * p.OW + ix) * p.CHW +
(ic * (p.KW * p.KH) + ky * p.KW + kx);
if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
data_d[offset_dst] = D_TYPE(0.0f);
} else {
const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
}
}
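im2col.comp above maps each output element back to a single input pixel, writing zero where the strided, dilated and padded position falls outside the image. Note that iih and iiw are uint, so the `< 0` halves of the guard can never fire; out-of-range negative positions wrap around and are caught by the `>=` comparisons instead. A hedged scalar restatement of the source-side addressing, with parameter names following the push constants above:
// Sketch of the per-element gather; offset_delta is the per-channel stride and
// batch_offset the per-batch stride, as in the push constants.
static float im2col_fetch(const float * src,
                          int IW, int IH, int offset_delta, int batch_offset,
                          int batch, int ic, int oh, int ow, int kh, int kw,
                          int s0, int s1, int p0, int p1, int d0, int d1) {
    const int iw = ow * s0 + kw * d0 - p0;   // input column after stride/dilation/padding
    const int ih = oh * s1 + kh * d1 - p1;   // input row
    if (iw < 0 || iw >= IW || ih < 0 || ih >= IH) {
        return 0.0f;                         // zero padding
    }
    return src[ic * offset_delta + batch * batch_offset + ih * IW + iw];
}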

View file

@@ -0,0 +1,22 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
const float val = float(data_a[i]);
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
}

View file

@@ -4,9 +4,11 @@
#include "generic_binary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
}

View file

@@ -16,6 +16,13 @@ void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
const uint tid = gl_LocalInvocationID.x;
// There are not enough cols to use all threads
if (tid >= p.ncols) {
return;
}
const uint block_size = min(p.ncols, BLOCK_SIZE);
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -23,8 +30,8 @@ void main() {
tmp[tid] = FLOAT_TYPE(0.0f);
[[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
const uint col = i*BLOCK_SIZE + 2*tid;
[[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
const uint col = i*block_size + 2*tid;
const uint ib = (row*p.ncols + col)/QUANT_K; // block index
const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
const uint iybs = col - col%QUANT_K; // y block start index
@@ -32,13 +39,12 @@ void main() {
vec2 v = dequantize(ib, iqs, a_offset / QUANT_K);
// matrix multiplication
tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[b_offset + iybs + iqs]) +
FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
tmp[tid] = fma(FLOAT_TYPE(v.x), FLOAT_TYPE(data_b[b_offset + iybs + iqs]), fma(FLOAT_TYPE(v.y), FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]), tmp[tid]));
}
// sum up partial sums and write back result
barrier();
[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
[[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
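This and the mul_mat_vec hunks that follow apply one transformation throughout: accumulations of the form acc += a * b become acc = fma(a, b, acc), and longer dot products are folded into nested fma chains so every partial product is fused. The same idea in scalar C++, as an illustration only:
#include <cmath>

// Before: acc += a[i] * b[i];  after: one fused multiply-add per term.
static float dot(const float * a, const float * b, int n) {
    float acc = 0.0f;
    for (int i = 0; i < n; ++i) {
        acc = std::fma(a[i], b[i], acc);
    }
    return acc;
}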

View file

@@ -53,7 +53,7 @@ void main() {
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
}
// sum up partial sums and write back result

View file

@@ -52,7 +52,7 @@ void main() {
// y is not transposed but permuted
const uint iy = channel*nrows_y + row_y;
tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
}
// dst is not transposed and not permuted

View file

@@ -39,24 +39,25 @@ void main() {
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum1 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3);
sum2 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF);
sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1))))))));
sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2))))))));
}
tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx]));
}
// sum up partial sums and write back result

View file

@@ -40,16 +40,17 @@ void main() {
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4))
+ FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4));
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
}
tmp[16 * ix + tid] += d * sum;
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]);
}
// sum up partial sums and write back result

View file

@@ -67,17 +67,17 @@ void main() {
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4);
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4);
const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1 + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3);
const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_5 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7);
const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx]) * q4_8 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_9 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * q4_10 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11);
const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_12 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_13 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * q4_14 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15);
const FLOAT_TYPE smin = FLOAT_TYPE(
FLOAT_TYPE(data_b[b_offset + y1_idx ]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx ]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * sc7
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7
);
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx]), q4_0, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), q4_1, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3)));
const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_4, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), q4_5, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), q4_6, FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7)));
const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx]), q4_8, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), q4_9, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), q4_10, FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11)));
const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_12, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), q4_13, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), q4_14, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15)));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), sc7,
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), sc7,
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 3]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 35]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 3]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7)))))))))))))));
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
#else
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
@@ -88,16 +88,19 @@ void main() {
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
const FLOAT_TYPE smin = FLOAT_TYPE(
FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
);
const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), q4_0, FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), q4_4, FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
+ fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7)))))));
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) +
sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(dall, (fma(sx, FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f), fma(sy, FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f),
fma(sz, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)), fma(sw, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))))))), fma(-dmin, smin, tmp[tmp_idx]));
#endif
}

View file

@@ -66,35 +66,33 @@ void main() {
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4);
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
const FLOAT_TYPE sx = FLOAT_TYPE(
FLOAT_TYPE(data_b[b_offset + y1_idx ]) * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))
);
const FLOAT_TYPE sy = FLOAT_TYPE(
FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))
);
const FLOAT_TYPE sz = FLOAT_TYPE(
FLOAT_TYPE(data_b[b_offset + y2_idx ]) * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))
);
const FLOAT_TYPE sw = FLOAT_TYPE(
FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0))
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))
);
const FLOAT_TYPE smin = FLOAT_TYPE(
(FLOAT_TYPE(data_b[b_offset + y1_idx]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17])) * sc2 + (FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49])) * sc3
+ (FLOAT_TYPE(data_b[b_offset + y2_idx]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17])) * sc6 + (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7
);
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
const FLOAT_TYPE sx =
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)),
FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)))));
const FLOAT_TYPE sy =
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)),
FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)))));
const FLOAT_TYPE sz =
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)),
FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)))));
const FLOAT_TYPE sw =
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)),
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)),
FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)))));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2,
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3,
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6,
(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7)));
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
}
// sum up partial sums and write back result

View file

@@ -44,22 +44,22 @@ void main() {
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
#if K_QUANTS_PER_ITERATION == 1
FLOAT_TYPE sum = FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum;
const uint tmp_idx = 16 * ix + tid;
tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx]))))))));
#else
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 4; ++l) {
sum += FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32)
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32),
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum))));
}
tmp[16 * ix + tid] += sum;
#endif

View file

@@ -326,10 +326,10 @@ void main() {
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
}
const float d = loadd.x * sc;
const float m = loadd.y * mbyte;
const float m = -loadd.y * mbyte;
buf_a[buf_idx ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) - m);
buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m));
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
#elif defined(DATA_A_Q5_K)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
@@ -357,10 +357,10 @@ void main() {
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
}
const float d = loadd.x * sc;
const float m = loadd.y * mbyte;
const float m = -loadd.y * mbyte;
buf_a[buf_idx ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0)) - m);
buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m));
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
#elif defined(DATA_A_Q6_K)
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
@@ -463,7 +463,8 @@ void main() {
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr] += float(cache_a[wsir * TM + cr]) * float(cache_b[wsic * TN + cc]);
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
sums[sums_idx] = fma(float(cache_a[wsir * TM + cr]), float(cache_b[wsic * TN + cc]), sums[sums_idx]);
}
}
}
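In the mul_mm hunks above the per-block minimum m is stored negated so that the dequantization d * q - m collapses into a single fma(d, q, m), and the sums[] accumulation in the inner matmul loop gets the same fused treatment. A scalar illustration of the sign trick, sketch only:
#include <cmath>

// Pre-negating the minimum turns d*q - m_pos into one fused multiply-add.
static float dequant(float d, float m_pos, unsigned q) {
    const float m = -m_pos;             // computed once per block in the shader
    return std::fma(d, (float) q, m);   // == d * q - m_pos
}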

View file

@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared vec2 sum[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
sum[tid] = vec2(0.0f, 0.0f);

View file

@@ -0,0 +1,26 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (idx >= p.ne) {
return;
}
const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
const uint i2_offset = i2*p.ne11*p.ne10;
const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
}

View file

@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;

View file

@@ -0,0 +1,24 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
uint src0_idx_mod(uint idx) {
const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
const uint i12_offset = i12*p.ne11*p.ne10;
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00;
}
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
}
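repeat.comp above implements broadcasting by wrapping each destination coordinate with the corresponding source extent, so a small tensor tiles across a larger one. The same index rule in scalar form, assuming a contiguous source layout for brevity (the shader uses the nb0x strides instead):
// Destination coordinate (i0..i3) -> source element, wrapping each axis by its extent.
static float repeat_fetch(const float * src,
                          int ne00, int ne01, int ne02, int ne03,
                          int i0, int i1, int i2, int i3) {
    const int s0 = i0 % ne00;
    const int s1 = i1 % ne01;
    const int s2 = i2 % ne02;
    const int s3 = i3 % ne03;
    return src[((s3 * ne02 + s2) * ne01 + s1) * ne00 + s0];
}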

View file

@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared FLOAT_TYPE sum[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp

View file

@@ -4,9 +4,11 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
}

View file

@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.x;
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;

View file

@@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.x;
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint rowy = rowx % p.KY;
float slope = 1.0f;

View file

@@ -4,10 +4,12 @@
#include "generic_unary_head.comp"
void main() {
if (gl_GlobalInvocationID.x >= p.ne) {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
}

View file

@@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
shared FLOAT_TYPE tmp[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.x;
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint col = gl_LocalInvocationID.x;
tmp[col] = FLOAT_TYPE(0.0f);

View file

@@ -0,0 +1,21 @@
#version 450
#include "generic_head.comp"
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
data_d[i] = D_TYPE(tanh(data_a[i]));
}

View file

@@ -0,0 +1,41 @@
#version 450
#extension GL_EXT_shader_16bit_storage : require
layout (push_constant) uniform parameter
{
uint nb1;
uint dim;
uint max_period;
} p;
#include "types.comp"
#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 256
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_WorkGroupID.y;
const uint j = gl_GlobalInvocationID.x;
const uint d_offset = i * p.nb1;
if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
data_d[d_offset + p.dim] = 0.f;
}
const uint half_dim = p.dim / 2;
if (j >= half_dim) {
return;
}
const float timestep = float(data_a[i]);
const float freq = float(exp(-log(p.max_period) * j / half_dim));
const float arg = timestep * freq;
data_d[d_offset + j] = D_TYPE(cos(arg));
data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
}
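timestep_embedding.comp above writes the sinusoidal embedding used by diffusion models: for each timestep t, the first half of the row holds cos(t * freq_j) and the second half sin(t * freq_j), with frequencies falling geometrically from 1 towards 1/max_period. A scalar reference as a sketch; it simply leaves the padding slot of an odd dim at zero:
#include <cmath>
#include <vector>

static std::vector<float> timestep_embedding_ref(float t, int dim, int max_period) {
    std::vector<float> out(dim, 0.0f);
    const int half = dim / 2;
    for (int j = 0; j < half; ++j) {
        const float freq = std::exp(-std::log((float) max_period) * (float) j / (float) half);
        out[j]        = std::cos(t * freq);  // first half: cosines
        out[j + half] = std::sin(t * freq);  // second half: sines
    }
    return out;
}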

View file

@@ -6,7 +6,7 @@
#define QUANT_K 1
#define QUANT_R 1
#ifndef LOAD_VEC_A
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float
#elif LOAD_VEC_A == 4
#define A_TYPE vec4
@@ -19,7 +19,7 @@
#define QUANT_K 1
#define QUANT_R 1
#ifndef LOAD_VEC_A
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
#define A_TYPE float16_t
#elif LOAD_VEC_A == 4
#define A_TYPE f16vec4

View file

@@ -0,0 +1,36 @@
#version 450
layout (push_constant) uniform parameter
{
uint ne; uint d_offset;
uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13;
float sf0; float sf1; float sf2; float sf3;
} p;
#include "types.comp"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (idx >= p.ne) {
return;
}
const uint i10 = idx % p.ne10;
const uint i11 = (idx / p.ne10) % p.ne11;
const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
const uint i00 = uint(i10 / p.sf0);
const uint i01 = uint(i11 / p.sf1);
const uint i02 = uint(i12 / p.sf2);
const uint i03 = uint(i13 / p.sf3);
data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
}
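upscale.comp above is nearest-neighbour scaling: each destination coordinate is divided by its per-axis scale factor and truncated to pick the source element. The same mapping in scalar form, with names taken from the push constants; an illustration, not the commit's code:
#include <cstdint>

// Flat destination index -> source offset, axis by axis (nb0x are source strides).
static uint32_t upscale_src_offset(uint32_t idx,
                                   uint32_t ne10, uint32_t ne11, uint32_t ne12, uint32_t ne13,
                                   uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
                                   float sf0, float sf1, float sf2, float sf3) {
    const uint32_t i10 =  idx % ne10;
    const uint32_t i11 = (idx / ne10) % ne11;
    const uint32_t i12 = (idx / (ne10 * ne11)) % ne12;
    const uint32_t i13 = (idx / (ne10 * ne11 * ne12)) % ne13;
    return (uint32_t) (i13 / sf3) * nb03 + (uint32_t) (i12 / sf2) * nb02
         + (uint32_t) (i11 / sf1) * nb01 + (uint32_t) (i10 / sf0) * nb00;
}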

View file

@@ -22,6 +22,7 @@
#ifdef _WIN32
#include <windows.h>
#include <direct.h> // For _mkdir on Windows
#include <algorithm> // For std::replace on w64devkit
#else
#include <unistd.h>
#include <sys/wait.h>
@@ -30,20 +31,6 @@
#define ASYNCIO_CONCURRENCY 64
// define prototypes
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
bool directory_exists(const std::string& path);
bool create_directory(const std::string& path);
std::string to_uppercase(const std::string& input);
bool string_ends_with(const std::string& str, const std::string& suffix);
std::string join_paths(const std::string& path1, const std::string& path2);
std::string basename(const std::string &path);
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
void process_shaders(std::vector<std::future<void>>& tasks);
void write_output_files();
std::mutex lock;
std::vector<std::pair<std::string, std::string>> shader_fnames;
@@ -52,7 +39,7 @@ std::string input_dir = "vulkan-shaders";
std::string output_dir = "/tmp";
std::string target_hpp = "ggml-vulkan-shaders.hpp";
std::string target_cpp = "ggml-vulkan-shaders.cpp";
bool clean = true;
bool no_clean = false;
const std::vector<std::string> type_names = {
"f32",
@@ -193,11 +180,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}
#ifdef _WIN32
static const char path_separator = '\\';
#else
static const char path_separator = '/';
#endif
static const char path_separator = '/';
std::string join_paths(const std::string& path1, const std::string& path2) {
return path1 + path_separator + path2;
@@ -212,7 +195,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
std::string out_fname = join_paths(output_dir, name + ".spv");
std::string in_path = join_paths(input_dir, in_fname);
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
#ifdef _WIN32
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
#else
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
#endif
for (const auto& define : defines) {
cmd.push_back("-D" + define.first + "=" + define.second);
}
@@ -283,9 +270,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
for (const auto& tname : type_names) {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
// For unaligned, load one at a time for f32/f16, or two at a time for quants
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
// For aligned matmul loads
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
@@ -354,6 +344,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
@@ -371,6 +364,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
@@ -384,6 +380,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
@@ -396,15 +396,42 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@@ -438,6 +465,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
}));
tasks.push_back(std::async(std::launch::async, [=] {
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
}));
}
void write_output_files() {
@@ -449,10 +487,16 @@ void write_output_files() {
for (const auto& pair : shader_fnames) {
const std::string& name = pair.first;
const std::string& path = pair.second;
#ifdef _WIN32
std::string path = pair.second;
std::replace(path.begin(), path.end(), '/', '\\' );
#else
const std::string& path = pair.second;
#endif
FILE* spv = fopen(path.c_str(), "rb");
if (!spv) {
std::cerr << "Error opening SPIR-V file: " << path << "\n";
std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
continue;
}
@@ -464,7 +508,7 @@ void write_output_files() {
size_t read_size = fread(data.data(), 1, size, spv);
fclose(spv);
if (read_size != size) {
std::cerr << "Error reading SPIR-V file: " << path << "\n";
std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
continue;
}
@@ -478,9 +522,8 @@ void write_output_files() {
}
fprintf(src, "\n};\n\n");
if (clean) {
if (!no_clean) {
std::remove(path.c_str());
// fprintf(stderr, "Removed: %s\n", path.c_str());
}
}
@@ -496,18 +539,6 @@ int main(int argc, char** argv) {
}
}
if (argc <= 1 || args.find("--help") != args.end()) {
std::cout << "Usage:\n"
"\tvulkan-shaders-gen [options]\n\n"
"Options:\n"
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
"\t--input-dir Directory containing shader sources (required)\n"
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
return EXIT_SUCCESS;
}
if (args.find("--glslc") != args.end()) {
GLSLC = args["--glslc"]; // Path to glslc
}
@@ -524,7 +555,7 @@ int main(int argc, char** argv) {
target_cpp = args["--target-cpp"]; // Path to generated cpp file
}
if (args.find("--no-clean") != args.end()) {
clean = false; // Keep temporary SPIR-V files in output-dir after build
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
}
if (!directory_exists(input_dir)) {