[cl][adreno] Add Adreno GPU support

Add new OpenCL backend to support Adreno GPUs

---------

Co-authored-by: Skyler Szot <quic_sszot@quicinc.com>
Co-authored-by: Shangqing Gu <quic_shawngu@quicinc.com>
Co-authored-by: Alexander Angus <quic_aangus@quicinc.com>
Co-authored-by: Hongqiang Wang <quic_wangh@quicinc.com>
Co-authored-by: Max Krasnyansky <quic_maxk@quicinc.com>
Li He 2024-11-25 11:27:15 -08:00 committed by Max Krasnyansky
parent c27ac678dd
commit f56fb699bc
21 changed files with 9331 additions and 1 deletions


@ -179,6 +179,12 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_SMALL_ALLOC "ggml: use small allocation for tensors" ON)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})


@ -69,6 +69,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
#ifdef __cplusplus


@ -0,0 +1,41 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
#ifndef GGML_OPENCL2_H
#define GGML_OPENCL2_H
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define CL_CHECK(err) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
#err, err_, __FILE__, __LINE__); \
GGML_ASSERT(0); \
} \
} while (0)
//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl2_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl2(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl2_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_t ggml_backend_reg_opencl2_init(const char * params, void * user_data);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl2_reg(void);
#ifdef __cplusplus
}
#endif
#endif // GGML_OPENCL2_H
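
For reference, a minimal host-side sketch of how these new entry points might be used. Only the functions declared above plus the existing ggml-backend API (ggml_backend_name, ggml_backend_free) are assumed; this is an illustration, not code from the commit.

#include "ggml-opencl2.h"
#include <stdio.h>

int main(void) {
    // Create the OpenCL backend added by this commit.
    ggml_backend_t backend = ggml_backend_opencl2_init();
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize the OpenCL2 backend\n");
        return 1;
    }
    // The predicate lets callers branch on the backend type
    // (test-backend-ops below uses it for the SMALL_ALLOC path).
    if (ggml_backend_is_opencl2(backend)) {
        printf("running on %s\n", ggml_backend_name(backend));
    }
    // Device buffers for this backend come from its buffer type.
    ggml_backend_buffer_type_t buft = ggml_backend_opencl2_buffer_type();
    (void) buft;
    ggml_backend_free(backend);
    return 0;
}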


@ -298,6 +298,19 @@ else ()
ggml_add_cpu_backend_variant_impl("")
endif()
# TODO: This is intrusive. We intend to remove the SMALL_ALLOC path once we fully
# migrate to the non-SMALL_ALLOC path. We also need to converge on the backend name
# so we don't need this name conversion.
if (GGML_OPENCL)
set(GGML_OPENCL2 ON)
add_compile_definitions(GGML_USE_OPENCL)
if (GGML_OPENCL_SMALL_ALLOC)
add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
endif ()
else ()
set(GGML_OPENCL2 OFF)
endif ()
ggml_add_backend(BLAS)
ggml_add_backend(CANN)
ggml_add_backend(CUDA)
@ -308,6 +321,8 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL2)
ggml_add_backend(MUSA)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)


@ -1033,6 +1033,92 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
return buffer;
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_for_weights(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
#ifndef GGML_OPENCL_SMALL_ALLOC
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
#else
// Small allocation allocates a separate buffer for each tensor. Instead of
// collecting multiple tensors and allocating one large buffer, each tensor is
// allocated a buffer immediately. This is only supposed to be used for
// weight tensors (note that weights can be f32).
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
size_t alignment = ggml_backend_buft_get_alignment(buft);
ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;
struct ggml_tensor * first_view = NULL;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size_t this_size = 0;
if (t->data == NULL && t->view_src == NULL) {
// Tensor size must be properly padded.
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
// The allocation logic here has grown beyond the original intention in order
// to make `test-backend-ops` work. The initial intention was to allocate
// memory for weights - each weight tensor gets its own buffer object - while
// the original function is used to allocate intermediate tensors.
// There are usually no view tensors among weights; this is not true for
// intermediate tensors. However, `test-backend-ops` makes no distinction
// between weight tensors and intermediate tensors: this function is used for
// general allocation when small allocation is enabled in the test. It must
// therefore also handle view tensors, which do not require actual allocation.
// In the original function, view tensors are allocated together with the other
// non-view tensors since their sizes are 0.
// Here, we try to identify view tensors and allocate them with the next
// non-view tensor. View tensors cannot be allocated alone but must be
// initialized together with non-view tensors.
// A tensor is a view tensor if its size is 0. Record its location if it is the
// first one after a non-view tensor. If the next tensor is still a view,
// simply move on. We want to allocate all consecutive view tensors together
// with the next non-view tensor.
if (this_size == 0 && first_view == NULL) {
first_view = t;
continue;
}
if (first_view) {
// This is a non-view tensor. If there are any view tensors before
// this non-view tensor, we want to allocate these view tensors and
// this non-view tensor together.
// The first tensor to allocate is the first view tensor.
first = first_view;
} else {
// Otherwise, allocate this non-view tensor immediately.
first = t;
}
if (!alloc_tensor_range(ctx, first, ggml_get_next_tensor(ctx, t), buft, this_size, &buffers, &n_buffers)) {
return NULL;
}
// Always reset first_view after a non-view tensor.
first_view = NULL;
}
if (n_buffers == 0) {
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_buffer_t buffer;
if (n_buffers == 1) {
buffer = buffers[0];
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
return buffer;
#endif
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
}
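
A minimal sketch of how the new helper is meant to be called for weight tensors. The context must be created with no_alloc (as the GGML_ASSERT above requires); the tensor shape and function name are illustrative only.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-opencl2.h"

// Sketch: allocate a weight tensor through the new helper. With
// GGML_OPENCL_SMALL_ALLOC defined it gets its own buffer object; otherwise the
// call falls through to ggml_backend_alloc_ctx_tensors_from_buft.
static ggml_backend_buffer_t alloc_example_weights(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // required by the GGML_ASSERT above
    };
    struct ggml_context * ctx = ggml_init(params);

    // Illustrative shape only - any weight tensor works, including f32 ones.
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096);
    (void) w;

    // ctx must stay alive for as long as the returned buffer is used.
    ggml_backend_buffer_type_t buft = ggml_backend_opencl2_buffer_type();
    return ggml_backend_alloc_ctx_tensors_from_buft_for_weights(ctx, buft);
}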


@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_OPENCL2
#include "ggml-opencl2.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_OPENCL2
register_backend(ggml_backend_opencl2_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif


@ -0,0 +1,159 @@
find_package(OpenCL)
if (OpenCL_FOUND)
find_package(Python3 REQUIRED)
set(TARGET_NAME ggml-opencl2)
add_library(${TARGET_NAME}
ggml-opencl2.cpp
../../include/ggml-opencl2.h)
target_link_libraries(${TARGET_NAME} PRIVATE ggml-base ${OpenCL_LIBRARIES})
target_include_directories(${TARGET_NAME} PRIVATE . .. ${OpenCL_INCLUDE_DIRS})
# TODO - this is kind of strange. We have been calling this backend OpenCL2,
# so everything (function names, folder name, etc.) except the macro switches
# has been OpenCL2. Now, the backend framework enforces the use of the folder
# name as the backend name and switch. So, GGML_USE_OPENCL2 is used in
# ggml-backend-reg.cpp, but the rest still uses GGML_USE_OPENCL.
add_compile_definitions(GGML_USE_OPENCL)
if (GGML_OPENCL_PROFILING)
message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
add_compile_definitions(GGML_OPENCL_PROFILING)
endif ()
add_compile_definitions(GGML_OPENCL_SOA_Q)
if (GGML_OPENCL_SMALL_ALLOC)
message(STATUS "OpenCL will allocate a separate buffer for each tensor. "
"The default behavior allocates a large buffer to hold multiple tensors.")
add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
endif ()
if (GGML_OPENCL_USE_ADRENO_KERNELS)
message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
endif ()
if (GGML_OPENCL_EMBED_KERNELS)
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
set(OPENCL2_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2.cl.h")
set(OPENCL2_MM_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_mm.cl.h")
set(OPENCL2_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_cvt.cl.h")
set(OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_gemv_noshuffle.cl.h")
set(OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_gemv_noshuffle_general.cl.h")
set(OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_mul_mat_Ab_Bi_8x4.cl.h")
set(OPENCL2_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_transpose_16.cl.h")
set(OPENCL2_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_transpose_32.cl.h")
set(OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl2_transpose_32_16.cl.h")
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
include_directories("${CMAKE_BINARY_DIR}/autogenerated")
# Python must be accessible from command line
add_custom_command(
OUTPUT ${OPENCL2_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2.cl
${OPENCL2_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_MM_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_mm.cl
${OPENCL2_MM_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_mm.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_mm.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_CVT_CL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_cvt.cl
${OPENCL2_CVT_CL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_cvt.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_cvt.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_gemv_noshuffle.cl
${OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_gemv_noshuffle.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_gemv_noshuffle_general.cl
${OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_gemv_noshuffle_general.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_mul_mat_Ab_Bi_8x4.cl
${OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_mul_mat_Ab_Bi_8x4.cl.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_TRANSPOSE_16_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_transpose_16.cl
${OPENCL2_TRANSPOSE_16_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_transpose_16.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_TRANSPOSE_32_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_transpose_32.cl
${OPENCL2_TRANSPOSE_32_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_transpose_32.cl.h"
)
add_custom_command(
OUTPUT ${OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED}
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl2_transpose_32_16.cl
${OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED}
DEPENDS kernels/ggml-opencl2_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl2_transpose_32_16.cl.h"
)
target_sources(${TARGET_NAME} PRIVATE
${OPENCL2_CL_SOURCE_EMBED}
${OPENCL2_MM_CL_SOURCE_EMBED}
${OPENCL2_CVT_CL_SOURCE_EMBED}
${OPENCL2_GEMV_NOSHUFFLE_SOURCE_EMBED}
${OPENCL2_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
${OPENCL2_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
${OPENCL2_TRANSPOSE_16_SOURCE_EMBED}
${OPENCL2_TRANSPOSE_32_SOURCE_EMBED}
${OPENCL2_TRANSPOSE_32_16_SOURCE_EMBED})
else ()
# copy the ggml-opencl2 kernel sources to the bin directory
configure_file(kernels/ggml-opencl2.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2.cl COPYONLY)
configure_file(kernels/ggml-opencl2_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_mm.cl COPYONLY)
configure_file(kernels/ggml-opencl2_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl2_cvt.cl COPYONLY)
endif ()
else ()
message(WARNING "OpenCL not found")
endif ()

File diff suppressed because it is too large.


@ -0,0 +1,19 @@
import sys

def main():
    if len(sys.argv) != 3:
        print("Usage: python embed_kernel.py <input_file> <output_file>")
        exit(1)

    ifile = open(sys.argv[1], "r")
    ofile = open(sys.argv[2], "w")

    ofile.write("R\"(\n\n")
    ofile.write(ifile.read())
    ofile.write("\n)\"")

    ifile.close()
    ofile.close()

if __name__ == "__main__":
    main()
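
The script wraps each kernel source in a C++ raw string literal. A sketch of how such a generated header could be consumed; the real include site is in ggml-opencl2.cpp (whose diff is suppressed on this page), so the variable name here is hypothetical.

#include <string>

// The generated file contains `R"( ... )"`, so including it inside an
// initializer yields the full OpenCL source as one string literal.
static const std::string kernel_src_base =
#include "ggml-opencl2.cl.h"
;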

File diff suppressed because it is too large.


@ -0,0 +1,109 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
//------------------------------------------------------------------------------
// This file contains additional kernels for data conversion.
// These kernels are used when loading the model, so their performance is less
// important.
//------------------------------------------------------------------------------
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supportedby OpenCL implementation on your device."
#endif
#ifdef cl_khr_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#elif defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#error "Subgroup not supported on your device."
#endif
#ifdef cl_intel_required_subgroup_size
// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
// Always use a subgroup size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
// TODO: do not know how to choose subgroup size on other GPUs.
#error "Selecting subgroup size is not supported on your device."
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
//------------------------------------------------------------------------------
// mul_vec_q_n_f32_flat_noshuffle
//
// This variation uses a flat-array (struct of arrays, SOA) representation for
// quant tensors. It also uses a non-shuffled bit order for the weights.
//
// The shuffled version is kept in the original file because moving it here
// seems to result in worse performance on Adreno.
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_0_noshuffle(
global struct block_q4_0 * src0,
global uchar * dst_q,
global half * dst_d
) {
global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);
*d = b->d;
for (int i = 0; i < QK4_0/4; ++i) {
uchar x0 = b->qs[2*i + 0];
uchar x1 = b->qs[2*i + 1];
q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
#ifdef ADRENO_GPU
// Workaround for Adreno - the following printf statement must be present for
// the kernel to work properly. Otherwise it produces incorrect results.
// convert_uchar above also seems necessary.
// Compare against a large number so that it does not print anything.
// get_sub_group_local_id() also works.
if (get_global_id(0) == 65536*4096) {
printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
}
#endif
}
}
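
A host-side C++ reference of the same transformation can be handy when checking the OpenCL output. It mirrors the kernel above; the struct and function names are illustrative, and the fp16 scale is simply copied bit-for-bit as a uint16_t.

#include <cstddef>
#include <cstdint>

struct block_q4_0_ref {
    uint16_t d;        // half, stored raw
    uint8_t  qs[16];   // QK4_0/2 packed 4-bit values
};

static void convert_block_q4_0_noshuffle_ref(const block_q4_0_ref * src, size_t n_blocks,
                                             uint8_t * dst_q, uint16_t * dst_d) {
    const int QK4_0 = 32;
    for (size_t b = 0; b < n_blocks; ++b) {
        dst_d[b] = src[b].d;
        uint8_t * q = dst_q + b * (QK4_0/2);
        for (int i = 0; i < QK4_0/4; ++i) {
            uint8_t x0 = src[b].qs[2*i + 0];
            uint8_t x1 = src[b].qs[2*i + 1];
            // low nibbles of the pair go to the first half of the block,
            // high nibbles to the second half (flat, non-shuffled order)
            q[i          ] = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4));
            q[i + QK4_0/4] = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
    }
}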


@ -0,0 +1,268 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
// assume
#define QK4_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
int offset1, // offset to B (0)
global float * dst, // C
int offsetd, // offset to C (0)
uint K, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
__private uint4 regA;
__private half2 regS;
__private float8 regB;
__private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
}
// reduction in local memory, assumes #wave=4
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}
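
The *_sgbroadcast_* macros above unroll the same scalar recurrence many times. A compact C++ reference of the per-nibble contribution each macro step accumulates, assuming the unshuffled 4-bit packing with the Q4_0 bias of 8; the function name is illustrative.

#include <cstdint>

// Contribution of one packed ushort (four 4-bit weights along K) to one of the
// two output rows a fiber accumulates (totalSum.s0 or .s1 in the kernel).
static inline float q4_0_dot4_ref(uint16_t bits4, float scale, const float y[4]) {
    float acc = 0.0f;
    for (int j = 0; j < 4; ++j) {
        // nibble j in unshuffled order, biased by 8 as in Q4_0
        int w = (int)((bits4 >> (4*j)) & 0xF) - 8;
        acc += (float)w * scale * y[j];
    }
    return acc;
}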


@ -0,0 +1,274 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
// assume
#define QK4_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
float shared_y; \
shared_y = sub_group_broadcast(y.s0, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 0); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 1); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y.s0, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 2); \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s0, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s4, 3); \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
float8 shared_y; \
shared_y = sub_group_broadcast(y, 0); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 1); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
shared_y = sub_group_broadcast(y, 2); \
total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
shared_y = sub_group_broadcast(y, 3); \
total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half2 * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
int offset1, // offset to B (0)
global float * dst, // C
int offsetd, // offset to C (0)
int ne00, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M / 2;
uint BLOCK_STRIDE_A = N_SIMDGROUP * M;
__private uint4 regA;
__private half2 regS;
__private float8 regB;
__private float2 totalSum = (float2)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load half weights for two blocks in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
}
// reduction in local memory, assumes #wave=4
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 2 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
vstore2(totalSum, 0, &(dst[gid * 2]));
}
}

File diff suppressed because it is too large.


@ -0,0 +1,133 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
// src0_q, src0_d, src1 are transposed as a preprocessing step
// 4-bit weights are transposed in groups of 4 (unsigned short int)
// consider weights originally "next to each other", now "on top of each other"
// each fiber computes an 8x4 tile of output elements
// using unshuffled weights
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
__attribute__((qcom_reqd_sub_group_size("full")))
kernel void kernel_mul_mat_Ab_Bi_8x4(
global const ushort * src0_q, // quantized A
global const half * src0_d, // A scales
__read_only image1d_buffer_t src1, // B (1d image)
global float * dst, // C
int m, // M
int n, // N with padding
int k, // K
int n_no_padding // N without padding
) {
int m_4 = m >> 2;
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
half8 B; // registers for activations
half4 dequantized_weights; // registers for dequantized weights
__global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights
__global const half* scale_ptr = src0_d + gx_2; // pointer for scales
for(int i=0; i<k; i+=4){ //loop through K dimension
B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);
// keep (i/4) and (i/32) in parenthesis, rounds down
// load 4 consecutive groups of 4 weights
ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s
// load 4 consecutive scales
half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements
// j=0
dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=1
B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=2
B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
// j=3
B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
c1 += B * dequantized_weights.s1;
c2 += B * dequantized_weights.s2;
c3 += B * dequantized_weights.s3;
}
int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements
// conditional check if store is to a valid location. Required when N is not a multiple of 8
// if statements allow registers to be reused for each store
// provides a performance boost due to reduced register footprint, which increases number of concurrent waves
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}
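
A scalar reference of the transposed weight layout this kernel consumes, derived from the addressing above: element (row, k) of the original M x K matrix sits in the ushort at src0_q[(k/4)*M + row], nibble k%4, with one scale per 32 K-values at src0_d[(k/32)*M + row]. Scales are assumed pre-converted from fp16 to float, and the function name is illustrative.

#include <cstdint>

static inline float dequant_Ab_transposed_ref(const uint16_t * q, const float * d,
                                              int M, int row, int k) {
    const uint16_t bits4 = q[(k/4)*M + row];            // 4 K-values per ushort
    const float    scale = d[(k/32)*M + row];            // one scale per 32 values
    const int      w     = (int)((bits4 >> (4*(k%4))) & 0xF) - 8;
    return (float)w * scale;
}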


@ -0,0 +1,35 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
// 16-bit transpose, loading/storing an 8x8 tile of elements
kernel void kernel_transpose_16(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_3 = i<<3;
const int j_3 = j<<3;
ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));
write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
}


@ -0,0 +1,28 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
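
Both transpose kernels above compute a plain matrix transpose, tiled 8x8 (fp16) and 4x4 (fp32) over image1d buffers. A scalar C++ reference of the result they produce, ignoring the image texel packing and tiling; useful as a correctness check.

#include <cstddef>

template <typename T>
static void transpose_ref(const T * in, T * out, size_t rows, size_t cols) {
    for (size_t r = 0; r < rows; ++r) {
        for (size_t c = 0; c < cols; ++c) {
            out[c*rows + r] = in[r*cols + c];
        }
    }
}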


@ -0,0 +1,38 @@
// SPDX-FileCopyrightText: Copyright (c) Qualcomm Innovation Center, Inc. All rights reserved
// SPDX-License-Identifier: MIT
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for prompt lengths that are not a multiple of 8
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
half4 temp1 = {0,0,0,0};
half4 temp2 = {0,0,0,0};
half4 temp3 = {0,0,0,0};
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
temp0 = read_imageh(input, (j_2+0)*cols+i);
}
if((j_2+1)*cols+i*4+3 < rows*cols*16){
temp1 = read_imageh(input, (j_2+1)*cols+i);
}
if((j_2+2)*cols+i*4+3 < rows*cols*16){
temp2 = read_imageh(input, (j_2+2)*cols+i);
}
if((j_2+3)*cols+i*4+3 < rows*cols*16){
temp3 = read_imageh(input, (j_2+3)*cols+i);
}
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}


@ -17,6 +17,15 @@ add_library(llama
unicode-data.cpp
)
# TODO: This is intrusive. We intend to remove the SMALL_ALLOC path once we fully
# migrate to the non-SMALL_ALLOC path.
if (GGML_OPENCL)
add_compile_definitions(GGML_USE_OPENCL)
if (GGML_OPENCL_SMALL_ALLOC)
add_compile_definitions(GGML_OPENCL_SMALL_ALLOC)
endif ()
endif ()
target_include_directories(llama PUBLIC . ../include)
target_compile_features (llama PUBLIC cxx_std_17) # don't bump


@ -9296,7 +9296,11 @@ static bool llm_load_tensors(
}
}
else {
#ifdef GGML_USE_OPENCL
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_for_weights(ctx, buft);
#else // GGML_USE_OPENCL
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
#endif
if (buf == nullptr) {
throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}


@ -18,6 +18,7 @@
#include <ggml.h>
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <ggml-opencl2.h>
#include <algorithm>
#include <array>
@ -466,7 +467,18 @@ struct test_case {
add_sentinel(ctx);
// allocate
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
// We need to use this function to properly support GGML_OPENCL_SMALL_ALLOC.
// In fact, `ggml_backend_alloc_ctx_tensors_from_buft_for_weights` is a bit of
// a misnomer. It was initially created for allocating weights, but it can be
// used for allocating any tensors that need small alloc.
// Something like `ggml_backend_alloc_ctx_tensors_from_buft_2` or
// `ggml_backend_alloc_ctx_tensors_from_buft_small_alloc` would be better.
//
// This is intrusive. We intend to remove the SMALL_ALLOC path once we fully
// migrate to the non-SMALL_ALLOC path.
ggml_backend_buffer_t buf = ggml_backend_is_opencl2(backend1) == false ? ggml_backend_alloc_ctx_tensors(ctx, backend1) :
ggml_backend_alloc_ctx_tensors_from_buft_for_weights(
ctx, ggml_backend_get_default_buffer_type(backend1));
if (buf == NULL) {
printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
ggml_free(ctx);
@ -618,7 +630,13 @@ struct test_case {
printf("%*s", last - len, "");
// allocate
#ifdef GGML_USE_OPENCL
ggml_backend_buffer_t buf =
ggml_backend_alloc_ctx_tensors_from_buft_for_weights(
ctx, ggml_backend_get_default_buffer_type(backend));
#else
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
#endif
if (buf == NULL) {
printf("failed to allocate tensors\n");
ggml_free(ctx);