use hipblas based on cublas
This commit is contained in:
parent 2005469ea1
commit 0fd8363adc

4 changed files with 69 additions and 2 deletions
CMakeLists.txt

@@ -67,6 +67,7 @@ endif()
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -168,6 +169,31 @@ if (LLAMA_CUBLAS)
     endif()
 endif()

+if (LLAMA_HIPBLAS)
+    cmake_minimum_required(VERSION 3.21)
+
+    find_package(hip)
+    find_package(hipblas)
+
+    if (hipblas_FOUND)
+        message(STATUS "hipBLAS found")
+
+        set(LLAMA_HIPBLAS_PLATFORM "AMD" CACHE STRING "hip device type" FORCE)
+        set_property(CACHE LLAMA_HIPBLAS_PLATFORM PROPERTY STRINGS "AMD" "NVIDIA")
+
+        add_compile_definitions(GGML_USE_HIPBLAS "__HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__")
+
+        add_library(ggml-hip OBJECT ggml-cuda.cu)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-hip hip::device)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::host roc::hipblas ggml-hip)
+
+    else()
+        message(WARNING "hipBLAS not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
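Note (not part of the commit): the CMake branch above only wires up the build; it defines GGML_USE_HIPBLAS and __HIP_PLATFORM_${LLAMA_HIPBLAS_PLATFORM}__ (AMD or NVIDIA) and links hip::host plus roc::hipblas, and the sources then branch on those macros. A minimal sketch of such a compile-time branch, with a hypothetical helper name:

    /* Sketch only: illustrates code guarded by the definitions added above.
     * ggml_blas_backend_name() is hypothetical and not part of ggml. */
    #if defined(GGML_USE_HIPBLAS)
    static const char * ggml_blas_backend_name(void) { return "hipBLAS"; }
    #elif defined(GGML_USE_CUBLAS)
    static const char * ggml_blas_backend_name(void) { return "cuBLAS"; }
    #else
    static const char * ggml_blas_backend_name(void) { return "CPU / other BLAS"; }
    #endif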
Makefile (4 changes)

@@ -107,6 +107,10 @@ ifdef LLAMA_CUBLAS
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	nvcc -arch=native -c -o $@ $<
 endif
+ifdef LLAMA_HIPBLAS
+	CFLAGS  += -DGGML_USE_HIPBLAS -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include
+	LDFLAGS += -lhipblas -lamdhip64 -L/opt/rocm/lib
+endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
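A quick, hypothetical way to check the ROCm paths used above: the stand-alone C file below (not in the repository) should compile and link with the same flags, e.g. cc check_hipblas.c -D__HIP_PLATFORM_AMD__ -I/opt/rocm/include -L/opt/rocm/lib -lhipblas -lamdhip64.

    /* check_hipblas.c - hypothetical sanity check, not part of the repo.
     * Creating and destroying a hipBLAS handle confirms that the header
     * and library paths added to CFLAGS/LDFLAGS are usable. */
    #include <stdio.h>
    #include "hipblas/hipblas.h"

    int main(void) {
        hipblasHandle_t handle;
        if (hipblasCreate(&handle) != HIPBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "hipblasCreate failed\n");
            return 1;
        }
        hipblasDestroy(handle);
        printf("hipBLAS link OK\n");
        return 0;
    }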
ggml-cuda.cu

@@ -1,5 +1,11 @@
 #include <stdint.h>
+#if defined(__HIP_PLATFORM_AMD__)
+#include "hip/hip_runtime.h"
+#define cudaStream_t hipStream_t
+#define __half _Float16
+#else
 #include <cuda_fp16.h>
+#endif
 #include "ggml-cuda.h"

 typedef uint16_t ggml_fp16_t;
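The two defines above are what let the existing CUDA code build for HIP: cudaStream_t becomes hipStream_t, and __half becomes the compiler's native _Float16. A hedged sketch (not from the commit) of what that enables on the AMD path, with a made-up helper:

    /* Sketch only: on the AMD HIP path __half is plain _Float16, so
     * ordinary C casts and arithmetic work on half-precision values. */
    #if defined(__HIP_PLATFORM_AMD__)
    #include "hip/hip_runtime.h"
    #define __half _Float16

    static inline float example_half_to_float(__half h) {
        return (float) h;
    }
    #endif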
ggml.c (35 changes)

@@ -147,9 +147,41 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
-#elif defined(GGML_USE_CUBLAS)
+#elif defined(GGML_USE_CUBLAS) || defined(GGML_USE_HIPBLAS)
+
+#if defined(GGML_USE_HIPBLAS)
+#include "hipblas/hipblas.h"
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define CUDA_R_16F HIPBLAS_R_16F
+#define CUDA_R_32F HIPBLAS_R_32F
+#define cudaError_t hipError_t
+#define cudaFree hipFree
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaStream_t hipStream_t
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaSuccess hipSuccess
+#define GGML_USE_CUBLAS
+#else
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
+#endif
 #include "ggml-cuda.h"

 #define CUDA_CHECK(err) \
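To illustrate the intent of the mapping above: code written against the cuBLAS names compiles into the hipBLAS equivalents when GGML_USE_HIPBLAS is set, and the mapping also defines GGML_USE_CUBLAS so the existing cuBLAS code paths stay enabled. A sketch only, with hypothetical variables, not code taken from ggml.c:

    /* Sketch: given the includes and #define mapping above, this
     * cuBLAS-named call becomes hipblasSgemm(handle, HIPBLAS_OP_T,
     * HIPBLAS_OP_N, ...) on a hipBLAS build. */
    #include <stdio.h>

    static int example_sgemm(cublasHandle_t handle,
                             int m, int n, int k,
                             const float * d_A,   /* k x m, device */
                             const float * d_B,   /* k x n, device */
                             float       * d_C) { /* m x n, device */
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        cublasStatus_t st = cublasSgemm(handle,
                CUBLAS_OP_T, CUBLAS_OP_N,
                m, n, k,
                &alpha, d_A, k,
                        d_B, k,
                &beta,  d_C, m);
        if (st != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "example_sgemm: GEMM failed\n");
            return 1;
        }
        return 0;
    }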
@@ -8073,7 +8105,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
         const float * x = wdata;
 #endif
-

 #if defined(GGML_USE_CUBLAS)
         // copy data to device
         CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));
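The runtime calls are covered by the same aliasing: on a hipBLAS build the cudaMemcpyAsync above resolves to hipMemcpyAsync on a hipStream_t, returning a hipError_t compared against hipSuccess. A hedged sketch of that pattern, with made-up buffer names (not code from ggml.c):

    /* Sketch only: the CUDA-named runtime calls below expand to their
     * HIP counterparts via the #define block added in ggml.c. */
    #include <stdio.h>
    #include <stdlib.h>

    static void example_upload(float * d_Y, const float * y, size_t y_ne,
                               cudaStream_t stream) {
        cudaError_t err = cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne,
                                          cudaMemcpyHostToDevice, stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "memcpy failed: %s\n", cudaGetErrorString(err));
            exit(1);
        }
    }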