Merge branch 'master' into compilade/batch-splits

This commit is contained in:
Francis Couture-Harpin 2024-07-28 01:20:13 -04:00
commit 0dea4263aa
85 changed files with 2886 additions and 6522 deletions

View file

@ -50,9 +50,15 @@ else()
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
if (CMAKE_CROSSCOMPILING)
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
endif()
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ON)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific
if (GGML_NATIVE)
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
@ -107,6 +113,7 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)

View file

@ -6,6 +6,9 @@
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
#define GGML_CUDA_NAME "MUSA"
#define GGML_CUBLAS_NAME "muBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"

View file

@ -254,18 +254,8 @@
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
ggml_print_backtrace(); \
abort(); \
} \
} while (0)
#ifndef NDEBUG
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
#elif defined(__GNUC__)
#define GGML_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
@ -274,6 +264,17 @@
#define GGML_UNREACHABLE() ((void) 0)
#endif
#ifdef __cplusplus
#define GGML_NORETURN [[noreturn]]
#elif defined(_MSC_VER)
#define GGML_NORETURN __declspec(noreturn)
#else
#define GGML_NORETURN _Noreturn
#endif
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
// used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability.
//
@ -322,6 +323,9 @@
extern "C" {
#endif
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
@ -636,8 +640,11 @@ extern "C" {
GGML_CGRAPH_EVAL_ORDER_COUNT
};
typedef uint32_t ggml_bitset_t;
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used;
struct ggml_tensor ** keys;
};
@ -651,7 +658,7 @@ extern "C" {
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_table;
struct ggml_hash_set visited_hash_set;
enum ggml_cgraph_eval_order order;
};
@ -698,8 +705,6 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
GGML_API void ggml_print_backtrace(void);
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
@ -2002,8 +2007,8 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
@ -2397,6 +2402,7 @@ extern "C" {
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
//
// Internal types and functions exposed for tests and benchmarks

View file

@ -139,6 +139,17 @@ if (GGML_METAL)
)
endif()
if (GGML_MUSA)
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_EXTENSIONS OFF)
set(GGML_CUDA ON)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
@ -147,6 +158,11 @@ if (GGML_OPENMP)
add_compile_definitions(GGML_USE_OPENMP)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
if (GGML_MUSA)
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
endif()
else()
message(WARNING "OpenMP not found")
endif()
@ -249,7 +265,13 @@ endif()
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (GGML_MUSA)
list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
find_package(MUSAToolkit)
set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
else()
find_package(CUDAToolkit)
endif()
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
@ -268,7 +290,11 @@ if (GGML_CUDA)
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
if (GGML_MUSA)
set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
else()
enable_language(CUDA)
endif()
file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@ -332,21 +358,40 @@ if (GGML_CUDA)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (GGML_MUSA)
set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
foreach(SOURCE ${GGML_SOURCES_CUDA})
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
endforeach()
endif()
if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
if (GGML_MUSA)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
else()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
endif()
else()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
if (GGML_MUSA)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
else()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
endif()
if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
if (GGML_MUSA)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
else()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
endif()
else()
message(WARNING "CUDA not found")
@ -510,10 +555,10 @@ if (GGML_SYCL)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} pthread m dl onemkl)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
endif()
endif()
@ -857,8 +902,10 @@ function(get_flags CCID CCVER)
set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds)
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
list(APPEND CXX_FLAGS -Wno-format-truncation)
if (NOT GGML_MUSA)
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
list(APPEND CXX_FLAGS -Wno-format-truncation)
endif()
endif()
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
@ -1264,6 +1311,7 @@ endif()
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
target_include_directories(ggml PUBLIC ../include)
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
target_compile_features (ggml PRIVATE c_std_11) # don't bump
target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})

View file

@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__)
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE)
#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
const void * b_ptr = vx;
const void * a_ptr = vy;
@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
#elif defined(__ARM_NEON) && defined(__aarch64__)
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
}
#endif
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
if (svcntw() == 8) {
const void * b_ptr = vx;
const void * a_ptr = vy;

View file

@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ASSERT(!"not enough space in the buffer");
return;
GGML_ABORT("not enough space in the buffer");
}
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
return;
}
}
GGML_ASSERT(!"out of allocated_tensors");
GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
return;
}
}
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
GGML_ASSERT(!"tensor not found");
GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
}
#endif
@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
GGML_UNREACHABLE();
GGML_ABORT("not enough space in the buffer");
}
}
@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
}
}
free(galloc->hash_set.keys);
ggml_hash_set_free(&galloc->hash_set);
free(galloc->hash_values);
free(galloc->bufts);
free(galloc->buffers);
@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
typedef struct ggml_gallocr * ggml_gallocr_t;
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
return &galloc->hash_values[i];
}
@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
ggml_hash_set_reset(&galloc->hash_set);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
// allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
}
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
size_t hash_size = graph->visited_hash_table.size;
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
// initialize hash table
if (galloc->hash_set.size < hash_size) {
free(galloc->hash_set.keys);
free(galloc->hash_values);
galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
if (galloc->hash_set.size < min_hash_size) {
ggml_hash_set_free(&galloc->hash_set);
galloc->hash_set = ggml_hash_set_new(min_hash_size);
GGML_ASSERT(galloc->hash_set.keys != NULL);
free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
GGML_ASSERT(galloc->hash_values != NULL);
} else {
// reset hash table
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
}
// reset allocators
@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}

View file

@ -1055,11 +1055,10 @@ struct ggml_backend_sched {
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
ggml_gallocr_t galloc;
// hash keys of the nodes in the graph
struct ggml_hash_set hash_set;
// hash values
int * tensor_backend_id;
struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
// hash map of the nodes in the graph
struct ggml_hash_set hash_set;
int * hv_tensor_backend_ids; // [hash_set.size]
struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
int * node_backend_ids; // [graph_size]
int * leaf_backend_ids; // [graph_size]
@ -1068,7 +1067,7 @@ struct ggml_backend_sched {
int * prev_leaf_backend_ids; // [graph_size]
// copy of the graph with modified inputs
struct ggml_cgraph * graph;
struct ggml_cgraph graph;
// graph splits
struct ggml_backend_sched_split * splits;
@ -1087,19 +1086,16 @@ struct ggml_backend_sched {
ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data;
bool debug;
char * context_buffer;
size_t context_buffer_size;
// align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
#else
__attribute__((aligned(GGML_MEM_ALIGN)))
#endif
char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
bool debug;
};
#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
// returns the priority of the backend, lower id is higher priority
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
return cur_backend_id;
}
// assign nodes that use weights to the backend of the weights
// operations with weights are preferably run on the same backend as the weights
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i];
@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->is_reset = false;
struct ggml_init_params params = {
/* .mem_size = */ sizeof(sched->context_buffer),
/* .mem_size = */ sched->context_buffer_size,
/* .mem_buffer = */ sched->context_buffer,
/* .no_alloc = */ true
};
@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->ctx = ggml_init(params);
if (sched->ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__);
GGML_ASSERT(false);
GGML_ABORT("%s: failed to initialize context\n", __func__);
}
// pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
int * leaf_backend_id = &tensor_backend_id(leaf);
if (*leaf_backend_id != -1) {
// do not overwrite user assignments
continue;
// do not overwrite user assignments
if (*leaf_backend_id == -1) {
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
// do not overwrite user assignments
continue;
}
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
// src
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
// do not overwrite user assignments
if (*node_backend_id == -1) {
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
#if 0
// src
if (node->op == GGML_OP_NONE) {
continue;
}
int * src_backend_id = &tensor_backend_id(src);
if (*src_backend_id == -1) {
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
int * src_backend_id = &tensor_backend_id(src);
if (*src_backend_id == -1) {
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
}
}
#endif
}
}
@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
// pass 4: split graph, find tensors that need to be copied
// pass 5: split graph, find tensors that need to be copied
{
int i_split = 0;
struct ggml_backend_sched_split * split = &sched->splits[0];
// find the backend of the first split, skipping view ops
for (int i = 0; i < graph->n_nodes; i++) {
int i = 0;
for (; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (!ggml_is_view_op(node->op)) {
split->backend_id = tensor_backend_id(node);
@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
split->i_start = 0;
split->n_inputs = 0;
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
int cur_backend_id = split->backend_id;
for (int i = 0; i < graph->n_nodes; i++) {
for (; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node);
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
assert(node_backend_id != -1); // all nodes should be assigned by now
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src);
if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
if (src_backend_id != cur_backend_id) {
need_new_split = true;
break;
}
@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// FIXME: count the number of inputs instead of only checking when full
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src);
int src_backend_id = sched->tensor_backend_id[id];
int src_backend_id = sched->hv_tensor_backend_ids[id];
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
break;
@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue;
}
const int src_backend_id = tensor_backend_id(src);
size_t src_id = hash_id(src);
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
assert(src_backend_id != -1); // all inputs should be assigned by now
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
size_t id = hash_id(src);
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy;
@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_graph_inputs = sched->n_graph_inputs++;
@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && !supported) {
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
// create a copy of the input in the split's backend
const size_t id = hash_id(src);
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
ggml_backend_t backend = sched->backends[cur_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_inputs = split->n_inputs++;
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
split->inputs[n_inputs] = src;
}
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
}
}
}
@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_backend_sched_print_assignments(sched, graph);
}
// swap node_backend_ids and leaf_backend_ids and prevs
// swap node_backend_ids and leaf _backend_ids with prevs
{
int * tmp = sched->node_backend_ids;
sched->node_backend_ids = sched->prev_node_backend_ids;
@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->prev_leaf_backend_ids = tmp;
}
// create copies of the graph for each split
// TODO: avoid this copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
if (sched->graph.size < graph_size) {
sched->graph.size = graph_size;
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
GGML_ASSERT(sched->graph.nodes != NULL);
GGML_ASSERT(sched->graph.leafs != NULL);
}
sched->graph.n_nodes = 0;
sched->graph.n_leafs = 0;
struct ggml_cgraph * graph_copy = &sched->graph;
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_tensor * input = split->inputs[j];
const size_t input_id = hash_id(input);
struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
// add a dependency to the input source so that it is not freed before the copy is done
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
input_dep->src[0] = input;
sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
// add a dependency to the input copy so that it is allocated at the start of the split
@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
size_t id = hash_id(input);
int backend_id = tensor_backend_id(input);
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
}
@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_tensor * input = split->inputs[j];
size_t id = hash_id(input);
for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
}
@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
}
sched->graph = graph_copy;
}
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
bool backend_ids_changed = false;
for (int i = 0; i < sched->graph->n_nodes; i++) {
for (int i = 0; i < sched->graph.n_nodes; i++) {
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
backend_ids_changed = true;
@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
}
}
if (!backend_ids_changed) {
for (int i = 0; i < sched->graph->n_leafs; i++) {
for (int i = 0; i < sched->graph.n_leafs; i++) {
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
backend_ids_changed = true;
@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
}
// allocate graph
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
return false;
}
@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
for (int j = 0; j < split->n_inputs; j++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
sched->hash_set = ggml_hash_set_new(graph_size);
sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer = malloc(sched->context_buffer_size);
const int initial_splits_capacity = 16;
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
ggml_hash_set_free(&sched->hash_set);
free(sched->splits);
free(sched->hash_set.keys);
free(sched->tensor_backend_id);
free(sched->tensor_copies);
free(sched->hv_tensor_backend_ids);
free(sched->hv_tensor_copies);
free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
free(sched->prev_node_backend_ids);
free(sched->prev_leaf_backend_ids);
free(sched->context_buffer);
free(sched->graph.nodes);
free(sched->graph.leafs);
free(sched);
}
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run
if (!sched->is_reset) {
size_t hash_size = sched->hash_set.size;
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
ggml_hash_set_reset(&sched->hash_set);
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
sched->is_reset = true;
}
sched->is_alloc = false;
}
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
ggml_backend_sched_split_graph(sched, measure_graph);
// TODO: extract this to a separate function
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}
@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) {
return false;
}
@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr");
sched->is_reset = false;
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
GGML_ASSERT(src != NULL);
GGML_ASSERT(src->data && "graph must be allocated");
size_t id = ggml_hash_insert(hash_set, src);
if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
return node_copies[ggml_hash_find(hash_set, src)];
size_t id = ggml_hash_insert(&hash_set, src);
if (id == GGML_HASHSET_ALREADY_EXISTS) {
return node_copies[ggml_hash_find(&hash_set, src)];
}
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
return dst;
}
static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
size_t id = ggml_hash_find(hash_set, src);
if (node_init[id]) {
return;
@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = {
/* .size = */ graph->visited_hash_table.size,
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
};
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
fprintf(stderr, "failed to allocate context for graph copy\n");
free(hash_set.keys);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
ggml_free(ctx_allocated);
@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
if (buffer == NULL) {
fprintf(stderr, "failed to allocate buffer for graph copy\n");
free(hash_set.keys);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
ggml_free(ctx_allocated);
@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
// copy data and init views
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
graph_copy_init_tensor(hash_set, node_copies, node_init, node);
graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
}
// build graph copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
graph_copy->nodes[i] = node_copy;
}
graph_copy->n_nodes = graph->n_nodes;
free(hash_set.keys);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);

View file

@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
break;
default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
}
}

View file

@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
file, line);
GGML_CANN_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CANN error");
GGML_ABORT("CANN error");
}
/**
@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
// memory should always buffered. these memory may still needed by
// tasks in stream.
// TODO, fix me.
GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
}
};
@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
return false;
}
// need open both directions for memcpyasync between devices.
ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_dst->stream()));
cann_ctx_src->stream()));
// record event on src stream
if (!cann_ctx_src->copy_event) {
ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
}
ACL_CHECK(
aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
// wait on dst stream for the copy to complete
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
cann_ctx_src->copy_event));
//TODO: workaround for Event didn`t work here.
aclrtSynchronizeStream(cann_ctx_src->stream());
} else {
// src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
* It returns true if the device associated with the buffer type matches the
* device associated with the backend.
* It returns true if the devices are same between the backend context and
* buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*/
GGML_CALL static bool ggml_backend_cann_supports_buft(
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
GGML_UNUSED(backend);
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_context * cann_ctx =
(ggml_backend_cann_context *)backend->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == cann_ctx->device;
}
return false;
}
/**
@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_max_pool2d(ctx, dst);
break;
case GGML_OP_POOL_COUNT:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) {
@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// TODO
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else if (src->type == GGML_TYPE_F32) {
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
// && nb0 == type_size)
@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
// TODO: dst not contiguous
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
if (dst->type == GGML_TYPE_F16) {
@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
// TODO
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst);
@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(acl_dst));
return;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -2219,7 +2219,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -2492,7 +2492,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_mul_mat_q8_0(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

View file

@ -19,7 +19,11 @@ typedef half2 ggml_half2;
#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
#if defined(GGML_COMMON_DECL_MUSA)
#include <musa_fp16.h>
#else
#include <cuda_fp16.h>
#endif
#include <cstdint>
typedef half ggml_half;
@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_TABLE_END() };
#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
#include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {

View file

@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CUDA error");
GGML_ABORT("CUDA error");
}
// this is faster on Windows
@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -179,7 +179,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
#endif // !defined(GGML_USE_HIPBLAS)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop;
@ -315,7 +315,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};
// pool with virtual memory
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
}
};
#endif // !defined(GGML_USE_HIPBLAS)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
#endif
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
}
@ -1341,7 +1341,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice;
@ -1355,7 +1355,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIPBLAS)
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}
static void ggml_cuda_op_mul_mat(
@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (quantize_src1 && !src1_is_contiguous) {
@ -1828,6 +1828,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}
}
#else
#ifdef GGML_USE_MUSA
GGML_ASSERT(false);
#else // !GGML_USE_MUSA
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
@ -1870,6 +1873,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
#endif // GGML_USE_MUSA
#endif
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@ -2945,7 +2949,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -3027,7 +3031,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
return false;
}
#if CUDART_VERSION >= 11100
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error

View file

@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -12,6 +12,10 @@
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif
#include "ggml-common.h"
@ -114,6 +118,150 @@
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#elif defined(GGML_USE_MUSA)
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_fp16.h>
// XXX: Keep the following order the same as hipBLAS
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
// #define cublasComputeType_t mublasComputeType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess
// XXX: Other CUDA => MUSA mapping
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr
// XXX: USE_CUDA_GRAPH
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: cuBLAS => muBLAS mapping
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#else
#include <cuda_runtime.h>
#include <cuda.h>
@ -168,9 +316,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
#if CUDART_VERSION >= 12000
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
#ifndef GGML_USE_MUSA
return cublasGetStatusString(err);
#else
return mublasStatus_to_string(err);
#endif // GGML_USE_MUSA
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
@ -200,7 +352,7 @@ static const char * cu_get_error_str(CUresult err) {
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif
#if CUDART_VERSION >= 11100
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
@ -214,6 +366,42 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16
#if defined(GGML_USE_MUSA)
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}
#endif // defined(GGML_USE_MUSA)
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
@ -348,7 +536,7 @@ static __device__ void no_device_code(
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
@ -455,7 +643,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
#endif // CUDART_VERSION < 12000
#endif // CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

View file

@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

View file

@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
fprintf(stderr, "By default only f16 KV cache is supported.\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else if (D == 128) {
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
fprintf(stderr, "Supported combinations:\n");
@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
fprintf(stderr, "Only f16 is supported.\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}

View file

@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}

View file

@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
} else {
@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
// ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
// break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
return;
@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
return;
@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

View file

@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
break;
default:
// TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ASSERT(false);
GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
break;
}
}

View file

@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

View file

@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
case GGML_TYPE_IQ4_NL:
return MMQ_Q8_1_DS_LAYOUT_D4;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
break;
default:
fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

View file

@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda(
rows_per_cuda_block = 2;
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda(
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q(
mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

View file

@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

View file

@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream
);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream
);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
}

View file

@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
// bitset
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
static size_t ggml_bitset_size(size_t n) {
return (n + BITSET_MASK) >> BITSET_SHR;
}
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
}
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
}
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
}
// hash set
#define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns the minimum size for a hash set that can hold min_sz elements
size_t ggml_hash_size(size_t min_sz);
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// remove all elements from the hash set
void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns true if key is in the hash set
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// hash function for ggml_tensor
static inline size_t ggml_hash(const struct ggml_tensor * p) {
// the last 4 bits are always zero due to alignment
return (size_t)(uintptr_t)p >> 4;
}
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
i = (i + 1) % hash_set->size;
if (i == h) {
// visited all hash table entries -> not found
return GGML_HASHSET_FULL;
}
}
return i;
}
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
}
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return GGML_HASHSET_ALREADY_EXISTS;
}
i = (i + 1) % hash_set->size;
} while (i != h);
// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return i;
}
i = (i + 1) % hash_set->size;
} while (i != h);
// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
#ifdef __cplusplus
}

View file

@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
}
if ((a % b) != 0) {
fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
GGML_ASSERT(!"safe_divide result would've had remainder");
GGML_ABORT("safe_divide result would've had remainder");
}
return a / b;
}
@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
if (!ggml_vk_supports_op(dst)) {
fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op");
GGML_ABORT("unsupported op");
}
const int32_t ne00 = src0 ? src0->ne[0] : 0;
@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
} break;
@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
continue;
not_implemented: {}
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
//GGML_ASSERT(false);
//GGML_ABORT("fatal error");
}
// Evaluate sequence

View file

@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute(
NSError * error = nil;
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
GGML_ASSERT(!"capture failed");
GGML_ABORT("capture failed");
}
}
@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (!ggml_metal_supports_op(ctx, dst)) {
GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op");
GGML_ABORT("unsupported op");
}
if (should_capture) {
@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
}
bcast_row = true;
@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
}
}
@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
}
[encoder setComputePipelineState:pipeline];
@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} break;
case GGML_OP_SQR:
@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
default: GGML_ABORT("MUL MAT-MAT not implemented");
}
[encoder setComputePipelineState:pipeline];
@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_ASSERT(false && "not implemented");
GGML_ABORT("not implemented");
}
};
@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
default: GGML_ABORT("MUL_MAT_ID not implemented");
}
[encoder setComputePipelineState:pipeline];
@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
GGML_ASSERT(false && "not implemented");
GGML_ABORT("not implemented");
}
};
@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
}
[encoder setComputePipelineState:pipeline];
@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute(
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
} else {
switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
}
@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dst->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
[encoder setComputePipelineState:pipeline];
@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (order) {
case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
[encoder setComputePipelineState:pipeline];
@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size");
GGML_ABORT("add template specialization for this size");
}
}
} else {
@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size");
GGML_ABORT("add template specialization for this size");
}
}
}
@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
};
} break;
case GGML_TYPE_F16:
@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dstt) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
};
} break;
default: GGML_ASSERT(false && "not implemented");
default: GGML_ABORT("not implemented");
}
[encoder setComputePipelineState:pipeline];
@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute(
default:
{
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#endif
for (; ib < nb; ++ib) {
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
const int v1 = (x[ib].qs[j] >> 4) - 8;
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
}
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
}
@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_8(acc) + summs;
#endif
for (; ib < nb; ++ib) {
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F);
const int v1 = (x[ib].qs[j] >> 4);
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
}
@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
sumi0 += (x0 * y[ib].qs[j]);
sumi1 += (x1 * y[ib].qs[j + qk/2]);
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}
@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
int sumi = 0;
int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
sumi0 += (x0 * y[ib].qs[j]);
sumi1 += (x1 * y[ib].qs[j + qk/2]);
}
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
}
@ -12692,7 +12704,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
q2[2*ib+1] |= (block_signs[k] << 7*k);
@ -12871,7 +12883,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
}
@ -13314,7 +13326,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
printf("\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (grid_size == 256) {
q3[8*ib+k] = grid_index;
@ -13527,7 +13539,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
printf("\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
qs[k] = grid_index & 255;
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
@ -14503,7 +14515,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n");
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int i8 = 2*ib + k;
y[ibl].qs[i8] = grid_index & 255;
@ -14623,7 +14635,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
}
if (nbytes % ggml_type_size(type) != 0) {
fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
return false;
}

View file

@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
});
});
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
// GGML_SYCL_DEBUG("current device index %d\n", id);
src_ptr = (char *) extra->data_device[id];
} else {
// GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
GGML_ASSERT(false);
// GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n");
GGML_ABORT("fatal error");
}
char * dst_ptr = (char *) dst;
@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
default:
// TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}
@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
case GGML_TYPE_Q6_K:
return 64;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
(void) dst;

View file

@ -100,7 +100,7 @@ static void crash() {
const char* msg) {
fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
GGML_ASSERT(!"SYCL error");
GGML_ABORT("SYCL error");
}
#define SYCL_CHECK(err) \
@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
queue_ptr stream(int device, int stream) {
if (qptrs[device][stream] == nullptr) {
qptrs[device][stream] = &(dpct::get_current_device().default_queue());
qptrs[device][stream] = &(dpct::get_device(device).default_queue());
}
return qptrs[device][stream];
}

View file

@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
break;
default:
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

View file

@ -588,7 +588,7 @@ namespace dpct
out = prop;
}
/// dpct device extension
/// dpct device extension
class device_ext : public sycl::device {
typedef std::mutex mutex_type;
@ -697,7 +697,7 @@ namespace dpct
std::unique_lock<mutex_type> lock(m_mutex);
lock.unlock();
for (auto &q : _queues) {
q.wait_and_throw();
q.wait_and_throw();
}
// Guard the destruct of current_queues to make sure the ref count is
// safe.
@ -734,7 +734,12 @@ namespace dpct
void destroy_queue(sycl::queue queue) {
std::lock_guard<mutex_type> lock(m_mutex);
_queues.clear();
_queues.erase(std::remove_if(_queues.begin(), _queues.end(),
[=](const sycl::queue &q) -> bool
{
return q == queue;
}),
_queues.end());
}
void set_saved_queue(sycl::queue q) {
std::lock_guard<mutex_type> lock(m_mutex);
@ -764,13 +769,13 @@ namespace dpct
if (enable_exception_handler) {
eh = exception_handler;
}
auto q = sycl::queue(*this, eh,
sycl::property_list(
_queues.push_back(sycl::queue(
*this, eh,
sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(),
sycl::property::queue::enable_profiling(),
#endif
properties...));
_queues.push_back(q);
properties...)));
return _queues.back();
}
@ -783,8 +788,8 @@ namespace dpct
if (enable_exception_handler) {
eh = exception_handler;
}
_queues.push_back(
sycl::queue(device, eh,
_queues.push_back(sycl::queue(
device, eh,
sycl::property_list(
#ifdef DPCT_PROFILING_ENABLED
sycl::property::queue::enable_profiling(),
@ -855,15 +860,75 @@ namespace dpct
unsigned int get_device_id(const sycl::device &dev)
{
unsigned int id = 0;
for (auto dev_item : _devs)
for (auto &dev_item : _devs)
{
if (*dev_item == dev)
{
break;
return id;
}
id++;
}
return id;
return -1;
}
inline std::string get_preferred_gpu_platform_name() {
std::string result;
std::string filter = "level-zero";
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
if (env) {
if (std::strstr(env, "level_zero")) {
filter = "level-zero";
}
else if (std::strstr(env, "opencl")) {
filter = "opencl";
}
else if (std::strstr(env, "cuda")) {
filter = "cuda";
}
else if (std::strstr(env, "hip")) {
filter = "hip";
}
else {
throw std::runtime_error("invalid device filter: " + std::string(env));
}
}
auto plaform_list = sycl::platform::get_platforms();
for (const auto& platform : plaform_list) {
auto devices = platform.get_devices();
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
return d.is_gpu();
});
if (gpu_dev == devices.end()) {
// cout << "platform [" << platform_name
// << "] does not contain GPU devices, skipping\n";
continue;
}
auto platform_name = platform.get_info<sycl::info::platform::name>();
std::string platform_name_low_case;
platform_name_low_case.resize(platform_name.size());
std::transform(
platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
if (platform_name_low_case.find(filter) == std::string::npos) {
// cout << "platform [" << platform_name
// << "] does not match with requested "
// << filter << ", skipping\n";
continue;
}
result = platform_name;
}
if (result.empty())
throw std::runtime_error("can not find preferred GPU platform");
return result;
}
template <class DeviceSelector>
@ -910,7 +975,7 @@ namespace dpct
if (backend == "opencl:cpu") return 4;
if (backend == "opencl:acc") return 5;
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2);
@ -930,10 +995,15 @@ namespace dpct
// Keep track of the number of devices per backend
std::map<sycl::backend, size_t> DeviceNums;
std::map<std::string, std::vector<sycl::device>> backend_devices;
auto preferred_platform_name = get_preferred_gpu_platform_name();
while (!Platforms.empty()) {
auto Platform = Platforms.back();
Platforms.pop_back();
auto platform_name = Platform.get_info<sycl::info::platform::name>();
if (platform_name.compare(preferred_platform_name) != 0) {
continue;
}
auto devices = Platform.get_devices();
std::string backend_type = get_device_backend_and_type(devices[0]);
for (const auto &device : devices) {
@ -1989,6 +2059,11 @@ namespace dpct
return dev_mgr::instance().current_device();
}
static inline device_ext &get_device(unsigned int id)
{
return dev_mgr::instance().get_device(id);
}
static inline sycl::queue &get_in_order_queue()
{
return dev_mgr::instance().current_device().in_order_queue();

View file

@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_0_PASCAL;
nwarps = NWARPS_Q4_0_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_1_PASCAL;
nwarps = NWARPS_Q4_1_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_0_PASCAL;
nwarps = NWARPS_Q5_0_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_1_PASCAL;
nwarps = NWARPS_Q5_1_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q8_0_PASCAL;
nwarps = NWARPS_Q8_0_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q2_K_PASCAL;
nwarps = NWARPS_Q2_K_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q3_K_PASCAL;
nwarps = NWARPS_Q3_K_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_K_PASCAL;
nwarps = NWARPS_Q4_K_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_K_PASCAL;
nwarps = NWARPS_Q5_K_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q6_K_PASCAL;
nwarps = NWARPS_Q6_K_PASCAL;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q(
ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}

View file

@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q(
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
break;
}
}

View file

@ -251,7 +251,7 @@ void ggml_sycl_op_rope(
attn_factor, corr_dims, freq_factors, main_stream
);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
@ -265,7 +265,7 @@ void ggml_sycl_op_rope(
attn_factor, corr_dims, freq_factors, main_stream
);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}

View file

@ -236,8 +236,8 @@ struct vk_device_struct {
};
struct vk_buffer_struct {
vk::Buffer buffer;
vk::DeviceMemory device_memory;
vk::Buffer buffer = VK_NULL_HANDLE;
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
vk::MemoryPropertyFlags memory_property_flags;
void * ptr;
size_t size = 0;
@ -1961,7 +1961,7 @@ void ggml_vk_instance_init() {
// Make sure at least one device exists
if (devices.empty()) {
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// Default to using all dedicated GPUs
@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf;
@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
staging = ctx->device->sync_staging;
staging_offset = 0;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf = nullptr;
@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
staging_buffer = dst->device->sync_staging;
staging_offset = 0;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
staging_buffer = src->device->sync_staging;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
if (mmp == nullptr) {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// Not implemented
@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
std::cerr << " and " << ggml_type_name(src1->type);
}
std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
op_func(ctx, subctx, src0, src1, dst);
@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
} else if (type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@ -4555,7 +4555,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_s;
shname = "F16_ALIGNED_S";
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else if (shader_size == 1) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_m;
shname = "F16_ALIGNED_M";
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else if (shader_size == 2) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_l;
shname = "F16_ALIGNED_L";
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else {
GGML_ASSERT(0);
@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
for (size_t i = 0; i < y_ne; i++) {
@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
// y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
}
@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
src0_type = GGML_TYPE_F16;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (std::is_same<float, Y_TYPE>()) {
src1_type = GGML_TYPE_F32;
} else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
src1_type = GGML_TYPE_F16;
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << std::endl;
}
GGML_ASSERT(false);
GGML_ABORT("fatal error");
#endif
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
return;
}
@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
} else if (tensor->type == GGML_TYPE_I32) {
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
fprintf(stderr, "% 7.2f ", val);
} else {
@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
}
} else {
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
break;
default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
if (src1 == nullptr) {
@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
} else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
}
} else {
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
first_error[0] = i0;
@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl;
std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done);
GGML_ASSERT(false);
GGML_ABORT("fatal error");
} else {
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
}

File diff suppressed because it is too large Load diff