Compare commits


21 commits

Author | SHA1 | Message | Date
slaren | d273bfd2c9 | allocator: cleanup, more comments | 2023-07-22 15:05:24 +02:00
slaren | 5141472e2b | llama.cpp: print input/output buffers size | 2023-07-22 13:31:06 +02:00
slaren | e2b9575951 | allocator cleanup | 2023-07-22 13:29:44 +02:00
slaren | 7de7882537 | allocator: fix partial offloading | 2023-07-22 02:34:21 +02:00
slaren | e87840f9fd | allocator: automatic inplace operations | 2023-07-21 16:51:50 +02:00
slaren | 3d679827e7 | improved memory management fixes | 2023-07-21 12:59:26 +02:00
slaren | 56e9ae062c | llama.cpp: partially restore state support, graph export | 2023-07-21 12:39:51 +02:00
slaren | 37d3f6a260 | remove unused code | 2023-07-21 02:33:06 +02:00
slaren | cd6f5dec92 | improved memory management | 2023-07-21 00:44:35 +02:00
slaren | de69f8f20d | initial implementation of delayed graph allocation | 2023-07-20 15:57:48 +02:00
slaren | cb205c0d13 | automatically calculate compute buffer sizes (without graph allocator) | 2023-07-20 02:42:36 +02:00
slaren | 77ac8deaf1 | llama.cpp: remove backend-specific code where possible | 2023-07-20 01:01:51 +02:00
slaren | 295f85654a | allocators wip | 2023-07-19 02:43:44 +02:00
       (renamed ggml_backend functions; changed ggml_buffer and ggml_backend to always be used as pointers; rename ggml_tensor::params -> op_params)
slaren | 1102ff56db | fix double-free with --no-mmap | 2023-07-17 12:00:17 +02:00
slaren | 4e94af3060 | improve layer backend printing with ranges | 2023-07-17 11:53:01 +02:00
slaren | c2beeb8e3a | only allocate as much memory as is required in each backend for the model | 2023-07-17 11:21:32 +02:00
slaren | 9c72e7e916 | rebase to master (except ggml-cuda) | 2023-07-16 15:10:46 +02:00
slaren | 33ab185dd1 | fix NVCC version on Makefile, __halves2half2 -> make_half2 | 2023-07-16 14:56:52 +02:00
slaren | 24cc6f008f | minor fixes | 2023-07-16 14:56:52 +02:00
slaren | 5765d7a587 | restore simple.cpp for now | 2023-07-16 14:56:52 +02:00
slaren | 0d2b66c638 | ggml backend interface wip | 2023-07-16 14:56:46 +02:00
       (refactor ggml-cuda)
15 changed files with 5018 additions and 4806 deletions


@@ -308,13 +308,13 @@ jobs:
path: |
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
windows-latest-cmake-cublas:
windows-latest-cmake-cuda:
runs-on: windows-latest
strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
build: ['cublas']
build: ['cuda']
steps:
- name: Clone
@@ -333,7 +333,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON
cmake --build . --config Release
- name: Get commit hash
@@ -395,7 +395,7 @@ jobs:
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
- windows-latest-cmake-cuda
steps:
- name: Download artifacts


@@ -67,7 +67,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -239,18 +239,18 @@ if (LLAMA_K_QUANTS)
endif()
endif()
if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")
enable_language(CUDA)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@@ -280,7 +280,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()


@@ -55,6 +55,12 @@ else
CXXFLAGS += -DNDEBUG
endif
ifdef LLAMA_SANITIZE
CFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
CXXFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
LDFLAGS += -g -fsanitize=$(LLAMA_SANITIZE)
endif
ifdef LLAMA_SERVER_VERBOSE
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
endif
@@ -163,13 +169,17 @@ ifdef LLAMA_BLIS
LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
ifdef LLAMA_CUDA
CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler
NVCCV := $(shell $(NVCC) --version | tail -n 1)
ifdef LLAMA_DEBUG
NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else
@@ -198,10 +208,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda-kern.h ggml-cuda-quant.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
@@ -275,6 +284,9 @@ $(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
ifdef LLAMA_CUDA
$(info I NVCC: $(NVCCV))
endif # LLAMA_CUDA
$(info )
#
@@ -284,6 +296,12 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
# temporary, probably will be added to ggml.c
ggml-backend.o: ggml-backend.c ggml-backend.h ggml.h
$(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-backend.o
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@


@@ -327,24 +327,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
std::string arg_next = argv[i];
// split string by , and /
@@ -361,14 +361,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUDA
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUDA
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {

ggml-backend.c: new file, 1014 lines (diff suppressed because it is too large)

ggml-backend.h: new file, 162 lines

@@ -0,0 +1,162 @@
#pragma once
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_backend;
// backend buffer
typedef void * ggml_buffer_context_t;
struct ggml_backend_buffer;
struct ggml_backend_buffer_interface {
// allocator functions
void (*free_buffer) (struct ggml_backend_buffer * alloc);
void (*alloc_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor);
void (*free_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor);
void (*reset) (struct ggml_backend_buffer * alloc);
// functions overridden by the backend
size_t (*get_alloc_size)(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // pre-allocation callback
void (*init_tensor) (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // post-allocation callback
void (*free_data) (struct ggml_backend_buffer * alloc); // free backend-specific data // TODO: better name
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_interface interface;
ggml_buffer_context_t context;
struct ggml_backend * backend;
void * backend_data;
bool measure;
size_t max_size;
};
// backend buffer helper functions
GGML_API void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc);
static inline void ggml_backend_buffer_tensor_alloc(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.alloc_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_tensor_free(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
// default buffer allocator
GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment);
// buffer
// buffers have space for the tensor structs in host memory, and tensor data in backend-specific memory
struct ggml_buffer {
// host memory
size_t mem_size;
void * mem_buffer;
// tensor data
struct ggml_backend_buffer * backend_buffer;
};
GGML_API struct ggml_buffer * ggml_buffer_alloc (struct ggml_backend * backend, size_t size, size_t max_tensors);
GGML_API struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors);
// measure buffers only calculate the maximum size of the buffer without allocating it - useful for pre-allocation
GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
// backend
typedef void * ggml_backend_context_t;
typedef void * ggml_graph_plan_t;
struct ggml_backend_interface {
const char * (*get_name)(struct ggml_backend * backend);
void (*free)(struct ggml_backend * backend);
// buffer allocation
struct ggml_backend_buffer * (*alloc_buffer)(struct ggml_backend * backend, size_t size);
// tensor data access
// these functions can be asynchronous. helper functions are provided for synchronous access that automatically call synchronize
void (*set_tensor_async)(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*synchronize) (struct ggml_backend * backend);
// (optional) copy tensor between different backends, allow for single-copy transfers
void (*cpy_tensor_from)(struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst);
// compute graph with a plan
ggml_graph_plan_t (*graph_plan_create) (struct ggml_backend * backend, struct ggml_cgraph * cgraph);
void (*graph_plan_free) (struct ggml_backend * backend, ggml_graph_plan_t plan);
void (*graph_plan_compute)(struct ggml_backend * backend, ggml_graph_plan_t plan);
// compute graph without a plan
void (*graph_compute) (struct ggml_backend * backend, struct ggml_cgraph * cgraph);
// check if a backend supports a given operation
// this could be used to fallback automatically to the CPU backend if a backend doesn't support an operation
// bool (*supports_op)(struct ggml_backend * backend, struct ggml_tensor * op);
};
struct ggml_backend {
struct ggml_backend_interface interface;
ggml_backend_context_t context;
};
// backend helper functions
static inline const char * ggml_backend_name(struct ggml_backend * backend) { return backend->interface.get_name(backend); }
static inline void ggml_backend_free(struct ggml_backend * backend) { backend->interface.free(backend); }
static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); }
static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); }
static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); }
static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); }
static inline void ggml_backend_synchronize(struct ggml_backend * backend) { backend->interface.synchronize(backend); }
static inline ggml_graph_plan_t ggml_backend_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); }
static inline void ggml_backend_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); }
static inline void ggml_backend_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); }
static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); }
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// CPU backend
GGML_API struct ggml_backend * ggml_backend_cpu_init(void);
GGML_API void ggml_backend_cpu_set_n_threads(struct ggml_backend * backend_cpu, int n_threads);
///////////////////////////
// graph splitting
#define GGML_MAX_SPLITS 200
#define GGML_MAX_SPLIT_INPUTS 4
struct ggml_graph_split {
char name[GGML_MAX_NAME];
struct ggml_context * ctx;
struct ggml_tensor * src_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_tensor * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1];
struct ggml_cgraph * graph;
};
// TODO: this shouldn't be fixed size, allocate from ggml_context
struct ggml_graph_splits {
int n_splits;
struct ggml_graph_split splits[GGML_MAX_SPLITS];
};
// TODO: allocate in ggml_context
struct ggml_graph_splits ggml_graph_split_init(void);
// this won't be needed once we can allocate graphs from a ggml_context
GGML_API void ggml_graph_splits_free(struct ggml_graph_splits * splits);
// add a split to the graph - single and multiple inputs versions
GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...);
GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...);
// build graphs for all splits
GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output);
// compute
GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
// graph tensor allocator
GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx);
GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);
#ifdef __cplusplus
}
#endif
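A usage sketch of the new interface (not part of the header above, and only based on the declarations it exposes; how a ggml_context is backed by a ggml_buffer is defined in the suppressed ggml.c/llama.cpp diffs, so `input`, `output` and `graph` are assumed to come from that step):

struct ggml_backend * backend = ggml_backend_cpu_init();
ggml_backend_cpu_set_n_threads(backend, 4);

// room for the tensor structs in host memory plus tensor data in backend memory
struct ggml_buffer * buf = ggml_buffer_alloc(backend, 16*1024*1024, /*max_tensors=*/128);

// ... build input, output and graph in a ggml_context backed by buf (see llama.cpp changes) ...

ggml_backend_tensor_set(input, input_data, 0, ggml_nbytes(input)); // async set + synchronize
ggml_backend_graph_compute(backend, graph);
ggml_backend_tensor_get(output, output_data, 0, ggml_nbytes(output));

ggml_buffer_free(buf);
ggml_backend_free(backend);

A measure buffer from ggml_buffer_measure_alloc can be used first to compute the required size before doing the real allocation.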

ggml-cuda-kern.h: new file, 468 lines

@@ -0,0 +1,468 @@
// kernels for ggml-cuda
#include <cuda.h>
#include <cuda_fp16.h>
template<typename dst_t>
using to_t_cuda_t = void (*)(const void * x, dst_t * y, int k, cudaStream_t stream);
// support for vector types in generic code
template<typename T> struct vec2_t_impl;
template<> struct vec2_t_impl<half> { typedef half2 type; };
template<> struct vec2_t_impl<float> { typedef float2 type; };
template<typename T> using vec2_t = typename vec2_t_impl<T>::type;
template<typename T> inline __host__ __device__ vec2_t<T> make_vec2_t(const T & x, const T & y);
template<> inline __host__ __device__ vec2_t<half> make_vec2_t(const half & x, const half & y) { return make_half2 (x, y); }
template<> inline __host__ __device__ vec2_t<float> make_vec2_t(const float & x, const float & y) { return make_float2(x, y); }
// the cuda headers define operators for half2, but not for float2
// they are defined here to simplify generic code
inline __host__ __device__ float2 operator+(const float2 & a, const float2 & b) { return make_float2(a.x + b.x, a.y + b.y); }
inline __host__ __device__ float2 operator-(const float2 & a, const float2 & b) { return make_float2(a.x - b.x, a.y - b.y); }
inline __host__ __device__ float2 operator*(const float2 & a, const float2 & b) { return make_float2(a.x * b.x, a.y * b.y); }
inline __host__ __device__ float2 operator/(const float2 & a, const float2 & b) { return make_float2(a.x / b.x, a.y / b.y); }
inline __host__ __device__ float2 & operator+=( float2 & a, const float2 & b) { a.x += b.x; a.y += b.y; return a; }
inline __host__ __device__ float2 & operator-=( float2 & a, const float2 & b) { a.x -= b.x; a.y -= b.y; return a; }
inline __host__ __device__ float2 & operator*=( float2 & a, const float2 & b) { a.x *= b.x; a.y *= b.y; return a; }
inline __host__ __device__ float2 & operator/=( float2 & a, const float2 & b) { a.x /= b.x; a.y /= b.y; return a; }
template<typename dst_t>
using dequantize_kernel_t = void (*)(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v);
__device__ half sqrt(const half x) { return hsqrt(x); }
__device__ half exp(const half x) { return hexp(x); }
__device__ half2 exp(const half2 x) { return h2exp(x); }
__device__ half cos(const half x) { return hcos(x); }
__device__ half sin(const half x) { return hsin(x); }
__device__ half max(const half x, const half y) { return __hmax(x, y); }
__device__ half2 max(const half2 x, const half2 y) { return __hmax2(x, y); }
template<typename T> struct op_max { __device__ T operator()(T a, T b) const { return max(a, b); } };
template<typename T> struct op_sum { __device__ T operator()(T a, T b) const { return a + b; } };
template<template<typename> class op_t, typename T>
static inline __device__ T warp_reduce_all(T val) {
op_t<T> op;
#pragma unroll
for (int mask = warpSize/2; mask > 0; mask /= 2) {
val = op(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
}
return val;
}
template<typename T>
static __device__ T zero_init() { return T(0); }
template<>
__device__ half2 zero_init() { return half2(0.0f, 0.0f); }
template<template<typename> class op_t, typename T>
static __device__ T block_reduce_all(const T val, const T init = zero_init<T>()) {
const int warp_id = threadIdx.x / warpSize; // warp id within the block
const int lane_id = threadIdx.x % warpSize; // lane id within the warp
const int num_warps = blockDim.x / warpSize; // number of warps in the block
__shared__ T lane_result[32]; // max 32 warps per block
// reduce warps
T warp_reduction = warp_reduce_all<op_t>(val);
__syncthreads();
// first thread within a warp writes reduction to shared memory
if (lane_id == 0) {
lane_result[warp_id] = warp_reduction;
}
// wait for all warps to finish writing their reductions
__syncthreads();
// reduce the results of all warps
T block_reduction = init;
if (lane_id < num_warps) {
block_reduction = lane_result[lane_id];
}
block_reduction = warp_reduce_all<op_t>(block_reduction);
return block_reduction;
}
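// Illustrative sketch (not part of the diff): how warp_reduce_all/block_reduce_all compose
// into a full kernel, e.g. a per-row sum with one block per row. Assumes blockDim.x is a
// multiple of warpSize, as block_reduce_all requires.
template<typename T>
static __global__ void k_row_sum_example(const T * x, T * out, const int ncols) {
    const int row = blockIdx.x;
    T acc = zero_init<T>();
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        acc += x[row*ncols + col];
    }
    // every thread in the block ends up holding the row total
    acc = block_reduce_all<op_sum>(acc);
    if (threadIdx.x == 0) {
        out[row] = acc;
    }
}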
template<typename dst_t>
static __device__ void convert_fp16(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v) {
const half * x = (const half *) vx;
v.x = (dst_t)(x[ib + iqs + 0]);
v.y = (dst_t)(x[ib + iqs + 1]);
}
template<typename dst_t>
static __device__ void convert_fp32(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v) {
const float * x = (const float *) vx;
v.x = (dst_t)(x[ib + iqs + 0]);
v.y = (dst_t)(x[ib + iqs + 1]);
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_mul_mat_p021(const src0_t * vx, const src1_t * y, dst_t * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
const src0_t * x = vx;
// const int col_x = blockDim.x*blockIdx.x + threadIdx.x;
// const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
dst_t tmp = 0;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
// x is transposed and permuted
const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
const dst_t xi = (dst_t)(x[ix]);
const int row_y = col_x;
// y is not transposed but permuted
const int iy = channel*nrows_y + row_y;
tmp += xi * y[iy];
}
// dst is not transposed and not permuted
const int idst = channel*nrows_dst + row_dst;
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_mul_mat_vec_nc(
const src0_t * vx, const src1_t * y, dst_t * dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
const src0_t * x = vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
const int idst = channel*nrows_dst + row_dst;
dst_t tmp = 0;
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
const int col_x = col_x0 + threadIdx.x;
if (col_x >= ncols_x) {
break;
}
const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
const dst_t xi = (dst_t)(x[ix]);
const int row_y = col_x;
const int iy = channel*nrows_y + row_y;
tmp += xi * y[iy];
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (threadIdx.x == 0) {
dst[idst] = tmp;
}
}
template <typename src_t, typename dst_t>
static __global__ void k_cpy(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= ne) {
return;
}
const int i02 = i / (ne00*ne01);
const int i01 = (i - i02*ne01*ne00) / ne00;
const int i00 = i - i02*ne01*ne00 - i01*ne00;
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
const int i12 = i / (ne10*ne11);
const int i11 = (i - i12*ne10*ne11) / ne10;
const int i10 = i - i12*ne10*ne11 - i11*ne10;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
*(dst_t *)(cdst + dst_offset) = *(const src_t *)(cx + x_offset);
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_add(const src0_t * x, const src1_t * y, dst_t * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = (dst_t)x[i] + (dst_t)y[i];
}
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_mul(const src0_t * x, const src1_t * y, dst_t * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= kx) {
return;
}
dst[i] = (dst_t)x[i] * (dst_t)y[i%ky];
}
template<typename src0_t, typename dst_t>
static __global__ void k_silu(const src0_t * x, dst_t * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = x[i] / (src0_t(1) + exp(-x[i]));
}
// TODO: unstable with f16 compute, using f32 compute for now
template<typename src0_t, typename dst_t>
static __global__ void k_rms_norm(const src0_t * x, dst_t * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const float eps = 1e-6;
float tmp = 0; // partial sum for thread in warp
for (int col = tid; col < ncols; col += WARP_SIZE) {
const float xi = x[row*ncols + col];
tmp += xi * xi;
}
// sum up partial sums
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
const float mean = tmp / (float)ncols;
const float scale = 1.0f / sqrtf(mean + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[row*ncols + col] = scale * (float)x[row*ncols + col];
}
}
template<typename src0_t, typename dst_t>
static __global__ void k_rope(const src0_t * x, dst_t * dst, const int ncols, const float p, const float theta_scale) {
const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
if (col >= ncols) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const dst_t theta = p * powf(theta_scale, col/2);
const dst_t sin_theta = sin(theta);
const dst_t cos_theta = cos(theta);
const dst_t x0 = x[i + 0];
const dst_t x1 = x[i + 1];
dst[i + 0] = (dst_t)x0*cos_theta - (dst_t)x1*sin_theta;
dst[i + 1] = (dst_t)x0*sin_theta + (dst_t)x1*cos_theta;
}
template<typename src0_t, typename dst_t>
static __global__ void k_diag_mask_inf(const src0_t * x, dst_t * dst, const int ncols, const int rows_per_channel, const int n_past) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int row = blockDim.y*blockIdx.y + threadIdx.y;
if (col >= ncols) {
return;
}
const int i = row*ncols + col;
//dst[i] = col > (n_past + row % rows_per_channel) ? (dst_t)-INFINITY : (dst_t)x[i];
dst[i] = (dst_t)x[i] - (dst_t)((col > n_past + row % rows_per_channel) * INT_MAX); // equivalent within rounding error but slightly faster on GPU
}
// TODO: numerically stable version - low prio since the softmax is computed in the fused attention kernel
// check: https://arxiv.org/pdf/2001.04438.pdf
template<typename src0_t, typename dst_t>
static __global__ void k_soft_max_orig(const src0_t * x, dst_t * dst, const int ncols) {
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int block_size = blockDim.x;
const int tid = threadIdx.x;
float tmp = 0;
for (int block_start = 0; block_start < ncols; block_start += block_size) {
const int col = block_start + tid;
if (col >= ncols) {
break;
}
const int i = row*ncols + col;
const float val = expf(x[i]);
tmp += val;
dst[i] = val;
}
// sum up partial sums
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
for (int block_start = 0; block_start < ncols; block_start += block_size) {
const int col = block_start + tid;
if (col >= ncols) {
break;
}
const int i = row*ncols + col;
dst[i] /= tmp;
}
}
template<typename src_t, typename dst_t, int pack_size, int block_size>
static __global__ void k_soft_max(const src_t * x, dst_t * dst, const int64_t nrows, const int64_t ncols) {
//assert(ncols % pack_size == 0);
const int tid = threadIdx.x;
const int num_packs = ncols / pack_size;
for (int row = blockIdx.x; row < nrows; row += gridDim.x) {
src_t th_max = -INFINITY;
// row max thread
#pragma unroll
for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
// load pack
src_t pack[pack_size];
#pragma unroll
for (int i = 0; i < pack_size; i++) {
pack[i] = x[row * ncols + pack_id * pack_size + i];
}
// reduce max pack
#pragma unroll
for (int i = 0; i < pack_size; ++i) {
th_max = max(th_max, pack[i]);
}
}
// reduce max row warp threads
src_t row_max = block_reduce_all<op_max>(th_max, (src_t)-INFINITY);
// row exp sum thread
src_t th_sum = 0;
#pragma unroll
for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
// load pack
src_t pack[pack_size];
#pragma unroll
for (int i = 0; i < pack_size; i++) {
pack[i] = x[row * ncols + pack_id * pack_size + i];
}
// reduce pack
#pragma unroll
for (int i = 0; i < pack_size; ++i) {
th_sum += exp(pack[i] - row_max);
}
}
// reduce row exp sum all threads
src_t row_sum = block_reduce_all<op_sum>(th_sum);
// store (row - row_max) / row exp sum
#pragma unroll
for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
// load pack
src_t pack[pack_size];
#pragma unroll
for (int i = 0; i < pack_size; i++) {
pack[i] = x[row * ncols + pack_id * pack_size + i];
}
// reduce pack
#pragma unroll
for (int i = 0; i < pack_size; ++i) {
pack[i] = exp(pack[i] - row_max) / row_sum;
}
// store pack
#pragma unroll
for (int i = 0; i < pack_size; i++) {
dst[row * ncols + pack_id * pack_size + i] = pack[i];
}
}
}
}
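// For reference (not part of the diff): k_soft_max above is the numerically stable per-row
// softmax, softmax(x)_i = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j), computed in
// three passes (row max, exp-sum, normalize), unlike k_soft_max_orig. A scalar sketch of one
// row (needs <math.h> for fmaxf/expf/INFINITY):
static void soft_max_row_ref(const float * x, float * dst, const int ncols) {
    float row_max = -INFINITY;
    for (int i = 0; i < ncols; i++) row_max = fmaxf(row_max, x[i]);              // pass 1: row max
    float row_sum = 0.0f;
    for (int i = 0; i < ncols; i++) row_sum += expf(x[i] - row_max);             // pass 2: exp sum
    for (int i = 0; i < ncols; i++) dst[i] = expf(x[i] - row_max) / row_sum;     // pass 3: normalize
}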
template<typename src0_t, typename src1_t, typename dst_t>
static __global__ void k_scale(const src0_t * x, dst_t * dst, const src1_t * scale, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = (dst_t)(*scale) * (dst_t)x[i];
}
template<typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
const int row = blockDim.y*blockIdx.y + threadIdx.y;
if (col >= ncols) {
return;
}
const int r = y[row];
// copy x[r*ncols + col] to dst[row*ncols + col]
const int xi = r*ncols + col;
const int di = row*ncols + col;
const int ib = xi/qk; // block index
const int iqs = (xi%qk)/qr; // quant index
const int iybs = di - di%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
vec2_t<dst_t> v;
dequantize_kernel(x, ib, iqs, v);
dst[iybs + iqs + 0] = v.x;
dst[iybs + iqs + y_offset] = v.y;
}

ggml-cuda-quant.h: new file, 920 lines

@@ -0,0 +1,920 @@
// quants kernels for ggml-cuda
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
// QI = number of 32 bit integers before dequantization
#define QK4_0 32
#define QR4_0 2
#define QI4_0 4
typedef struct {
half d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
#define QR4_1 2
#define QI4_1 4
typedef struct {
half d; // delta
half m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK5_0 32
#define QR5_0 2
#define QI5_0 4
typedef struct {
half d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
#define QR5_1 2
#define QI5_1 4
typedef struct {
half d; // delta
half m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
#define QR8_0 1
#define QI8_0 8
typedef struct {
half d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define QK8_1 32
#define QR8_1 1
#define QI8_1 8
typedef struct {
half d; // delta
half s; // unquantized sum
int8_t qs[QK8_0]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
//================================= k-quants
#define QK_K 256
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
typedef struct {
uint8_t hmask[QK_K/8];
uint8_t qs[QK_K/4]; // nibbles / quants
uint8_t scales[3*QK_K/64];
half d;
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
typedef struct {
half d; // super-block scale for quantized scales
half dmin; // super-block scale for quantized mins
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales
half d; // delta
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
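// Sanity check on the layouts above (not part of the diff): the effective bits per weight
// follow directly from the block sizes, e.g. block_q4_0 is 2 bytes (half d) + 16 bytes of
// nibbles for 32 weights = 18 bytes -> 4.5 bpw, and block_q4_K is 4 + 12 + 128 = 144 bytes
// for 256 weights -> 4.5 bpw.
static inline float bits_per_weight(size_t block_size_bytes, int weights_per_block) {
    return 8.0f * block_size_bytes / weights_per_block;
}
// bits_per_weight(sizeof(block_q4_0), QK4_0) == 4.5f
// bits_per_weight(sizeof(block_q8_0), QK8_0) == 8.5f    (2 + 32 bytes per 32 weights)
// bits_per_weight(sizeof(block_q6_K), QK_K)  == 6.5625f (2 + 13*256/16 bytes per 256 weights)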
template<typename src1_t, typename dst_t>
using dot_kernel_k_t = void (*)(const void * vx, const int ib, const int iqs, const src1_t * y, dst_t & v);
template<typename dst_t>
using vec_dot_q_cuda_t = dst_t (*)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
// TODO: f16
template<typename src_t>
static __global__ void quantize_q8_1(const src_t * x, void * vy, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
block_q8_1 * y = (block_q8_1 *) vy;
const int ib = i / QK8_0; // block index
const int iqs = i % QK8_0; // quant index
const float xi = x[i];
float amax = fabsf(xi);
float sum = xi;
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
}
const float d = amax / 127;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
if (iqs > 0) {
return;
}
y[ib].d = d;
y[ib].s = sum;
}
template<typename dst_t>
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q4_0 * x = (const block_q4_0 *) vx;
const dst_t d = x[ib].d;
const uint8_t vui = x[ib].qs[iqs];
v.x = vui & 0xF;
v.y = vui >> 4;
const vec2_t<dst_t> off2 = make_vec2_t<dst_t>(8, 8);
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
v = (v - off2) * d2;
}
template<typename dst_t>
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q4_1 * x = (const block_q4_1 *) vx;
const dst_t d = x[ib].d;
const dst_t m = x[ib].m;
const uint8_t vui = x[ib].qs[iqs];
v.x = vui & 0xF;
v.y = vui >> 4;
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
const vec2_t<dst_t> m2 = make_vec2_t<dst_t>(m, m);
v = v * d2 + m2;
}
template<typename dst_t>
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q5_0 * x = (const block_q5_0 *) vx;
const dst_t d = x[ib].d;
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
const vec2_t<dst_t> off2 = make_vec2_t<dst_t>(16, 16);
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
v = (v - off2) * d2;
}
template<typename dst_t>
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q5_1 * x = (const block_q5_1 *) vx;
const dst_t d = x[ib].d;
const dst_t m = x[ib].m;
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
const vec2_t<dst_t> m2 = make_vec2_t<dst_t>(m, m);
v = v * d2 + m2;
}
template<typename dst_t>
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
const block_q8_0 * x = (const block_q8_0 *) vx;
const dst_t d = x[ib].d;
v.x = x[ib].qs[iqs + 0];
v.y = x[ib].qs[iqs + 1];
const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
v = v * d2;
}
//================================== k-quants
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
const int i = blockIdx.x;
const int tid = threadIdx.x;
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
const block_q2_K * x = (const block_q2_K *) vx;
const uint8_t q = x[i].qs[32*n + l];
float * y = yy + i*QK_K + 128*n;
float dall = x[i].d;
float dmin = x[i].dmin;
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
}
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q2_K * x = (const block_q2_K *) vx;
// if n is 0, we want to do the lower 128, else the upper 128,
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
// y[l+16], y[l+48], y[l+80], y[l+112]
int n = iqs/128; // 0 or 1
int r = iqs - 128*n; // 0...120 in steps of 8
int l = r/8; // 0...15 in steps of 1
const float * y = yy + 128*n + l;
const uint8_t * q = x[ib].qs + 32*n + l;
const uint8_t * s = x[ib].scales + 8*n;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
result = sum;
}
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
int r = threadIdx.x/4;
int i = blockIdx.x;
int tid = r/2;
int is0 = r%2;
int l0 = 16*is0 + 4*(threadIdx.x%4);
int n = tid / 4;
int j = tid - 4*n;
const block_q3_K * x = (const block_q3_K *) vx;
uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0;
int shift = 2*j;
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
(x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
float d_all = x[i].d;
float dl = d_all * (us - 32);
float * y = yy + i*QK_K + 128*n + 32*j;
const uint8_t * q = x[i].qs + 32*n;
const uint8_t * hm = x[i].hmask;
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
}
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q3_K * x = (const block_q3_K *) vx;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
uint32_t aux[3];
uint32_t utmp[4];
// if n is 0, we want to do the lower 128, else the upper 128,
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
// y[l+16], y[l+48], y[l+80], y[l+112]
int n = iqs/128; // 0 or 1
int r = iqs - 128*n; // 0...120 in steps of 8
int l = r/8; // 0...15 in steps of 1
const float * y = yy + 128*n + l;
const uint8_t * q = x[ib].qs + 32*n + l;
const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;
memcpy(aux, x[ib].scales, 12);
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
const float dall = x[ib].d;
const uint8_t m = 1 << (4*n);
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
result = sum * dall;
}
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {
d = q[j] & 63; m = q[j + 4] & 63;
} else {
d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
}
}
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
const block_q4_K * x = (const block_q4_K *) vx;
const int i = blockIdx.x;
//// assume 64 threads - this is very slightly better than the one below
//const int tid = threadIdx.x;
//const int il = tid/16;
//const int ir = tid%16;
//const int is = 2*il;
//const int n = 2;
// assume 32 threads
const int tid = threadIdx.x;
const int il = tid/8;
const int ir = tid%8;
const int is = 2*il;
const int n = 4;
float * y = yy + i*QK_K + 64*il + n*ir;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint8_t * q = x[i].qs + 32*il + n*ir;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[i].scales, sc, m);
const float d1 = dall * sc; const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[i].scales, sc, m);
const float d2 = dall * sc; const float m2 = dmin * m;
for (int l = 0; l < n; ++l) {
y[l + 0] = d1 * (q[l] & 0xF) - m1;
y[l +32] = d2 * (q[l] >> 4) - m2;
}
}
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q4_K * x = (const block_q4_K *) vx;
// iqs is in 0...248 in steps of 8 =>
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
const float * y = yy + 64*j + ir;
const uint8_t * q = x[ib].qs + 32*j + ir;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
}
result = sum;
}
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
const block_q5_K * x = (const block_q5_K *) vx;
const int i = blockIdx.x;
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const int il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6
float * y = yy + i*QK_K + 64*il + 2*ir;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
const uint8_t * qh = x[i].qh + 2*ir;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[i].scales, sc, m);
const float d1 = dall * sc; const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[i].scales, sc, m);
const float d2 = dall * sc; const float m2 = dmin * m;
uint8_t hm = 1 << (2*il);
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
hm <<= 1;
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
}
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q5_K * x = (const block_q5_K *) vx;
// iqs is in 0...248 in steps of 8 =>
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
const float * y = yy + 64*j + ir;
const uint8_t * ql = x[ib].qs + 32*j + ir;
const uint8_t * qh = x[ib].qh + ir;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
uint8_t hm = 1 << is;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
}
hm <<= 1;
for (int k = 0; k < 4; ++k) {
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
}
result = sum;
}
template<typename dst_t>
static __global__ void dequantize_block_q6_K(const void * vx, dst_t * yy) {
const block_q6_K * x = (const block_q6_K *) vx;
const int i = blockIdx.x;
// assume 64 threads - this is very slightly better than the one below
const int tid = threadIdx.x;
const int ip = tid/32; // ip is 0 or 1
const int il = tid - 32*ip; // 0...32
const int is = 8*ip + il/16;
// TODO: fp16 compute
dst_t * y = yy + i*QK_K + 128*ip + il;
const float d = x[i].d;
const uint8_t * ql = x[i].ql + 64*ip + il;
const uint8_t qh = x[i].qh[32*ip + il];
const int8_t * sc = x[i].scales + is;
y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
template<typename src1_t, typename dst_t>
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const src1_t * yy, dst_t * dst, const int ncols, int nrows) {
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const block_q6_K * x = (const block_q6_K *)vx + ib0;
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
#else
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
#endif
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
const int y_offset = 128*im + l0;
dst_t tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const src1_t * y = yy + i * QK_K + y_offset;
const uint8_t * ql = x[i].ql + ql_offset;
const uint8_t * qh = x[i].qh + qh_offset;
const int8_t * s = x[i].scales + s_offset;
const dst_t d = x[i].d;
#if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp += sum;
#else
dst_t sum = 0;
for (int l = 0; l < 4; ++l) {
sum += (dst_t)y[l+ 0] * (dst_t)s[0] * d * (dst_t)((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+ (dst_t)y[l+32] * (dst_t)s[2] * d * (dst_t)((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+ (dst_t)y[l+64] * (dst_t)s[4] * d * (dst_t)((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+ (dst_t)y[l+96] * (dst_t)s[6] * d * (dst_t)((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
tmp += sum;
#endif
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
template <typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
static __global__ void dequantize_block(const void * vx, dst_t * y, const int k) {
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
if (i >= k) {
return;
}
const int ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
vec2_t<dst_t> v;
dequantize_kernel(vx, ib, iqs, v);
y[iybs + iqs + 0] = v.x;
y[iybs + iqs + y_offset] = v.y;
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
int vi;
memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
// subtract 8 from each quantized value
const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
// SIMD dot product of quantized values
int sumi = __dp4a(vi0, ui0, 0);
sumi = __dp4a(vi1, ui1, sumi);
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
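// Scalar sketch (not part of the diff) of what the __vsub4/__dp4a path above computes for one
// iqs step: 8 q4_0 nibbles (with the implicit -8 offset) times the matching 8 q8_1 int8
// values, scaled by the product of the two block deltas.
static __host__ __device__ float vec_dot_q4_0_q8_1_ref(const block_q4_0 * x, const block_q8_1 * y, const int iqs) {
    int sumi = 0;
    for (int k = 0; k < (int) sizeof(int); ++k) {
        const uint8_t vui = x->qs[sizeof(int)*iqs + k];
        sumi += ((vui & 0xF) - 8) * y->qs[sizeof(int)*iqs + k];            // lower nibbles vs ui0
        sumi += ((vui >>  4) - 8) * y->qs[sizeof(int)*(iqs + QI4_0) + k];  // upper nibbles vs ui1
    }
    return sumi * __half2float(x->d) * __half2float(y->d);
}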
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
const float m = bq4_1->m;
const float s = bq8_1->s;
const int vi0 = (vi >> 0) & 0x0F0F0F0F;
const int vi1 = (vi >> 4) & 0x0F0F0F0F;
// SIMD dot product of quantized values
int sumi = __dp4a(vi0, ui0, 0);
sumi = __dp4a(vi1, ui1, sumi);
return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
int qs;
memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
const float m = bq5_1->m;
const float s = bq8_1->s;
int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template<typename dst_t>
static __device__ __forceinline__ dst_t vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
int vi;
memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
// SIMD dot product of quantized values
int sumi = __dp4a(vi, ui, 0);
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 600
}
template <typename dst_t, int qk, int qi, typename block_q_t, vec_dot_q_cuda_t<dst_t> vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * vx, const void * vy, dst_t * dst, const int ncols, const int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = WARP_SIZE / qi;
// partial sum for each thread
float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
const int iby = i + threadIdx.x / qi; // y block index
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
tmp += (float)vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (threadIdx.x == 0) {
dst[row] = (dst_t)tmp;
}
}
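
The reduction loop above is a warp-level butterfly: each __shfl_xor_sync round adds the partial sum of the lane whose index differs in exactly one bit, so after five rounds every lane of the warp holds the full sum. A plain C++ emulation (illustrative only, not part of the diff):

#include <cstdio>

int main() {
    float lane[32];
    for (int i = 0; i < 32; ++i) {
        lane[i] = (float) i;                      // stand-ins for the per-lane partial sums (tmp)
    }
    for (int mask = 16; mask > 0; mask >>= 1) {   // same schedule as the kernel's unrolled loop
        float next[32];
        for (int i = 0; i < 32; ++i) {
            next[i] = lane[i] + lane[i ^ mask];   // lane i reads lane i^mask, like __shfl_xor_sync
        }
        for (int i = 0; i < 32; ++i) {
            lane[i] = next[i];
        }
    }
    printf("%.0f\n", lane[0]);                    // 0+1+...+31 = 496, and every lane agrees
    return 0;
}
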
template <typename src1_t, typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * vx, const src1_t * y, dst_t * dst, const int ncols, const int nrows) {
// qk = quantized weights per x block
// qr = number of quantized weights per data value in x block
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
}
const int tid = threadIdx.x;
const int iter_stride = 2*GGML_CUDA_DMMV_X;
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
const int y_offset = qr == 1 ? 1 : qk/2;
vec2_t<dst_t> tmp2 = make_vec2_t<dst_t>(0, 0); // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = (row*ncols + col)/qk; // x block index
const int iqs = (col%qk)/qr; // x quant index
const int iybs = col - col%qk; // y block start index
// processing >2 values per i iter is faster for fast GPUs
#pragma unroll
for (int j = 0; j < vals_per_iter; j += 2) {
// process 2 vals per j iter
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
// dequantize
vec2_t<dst_t> xc;
dequantize_kernel(vx, ib, iqs + j/qr, xc);
// matrix multiplication
vec2_t<dst_t> yc = make_vec2_t<dst_t>(
y[iybs + iqs + j/qr + 0],
y[iybs + iqs + j/qr + y_offset]);
tmp2 += xc * yc;
}
}
// sum up partial sums and write back result
// TODO: reducing as half2 may be faster, but requires special handling for float2
dst_t tmp = tmp2.x + tmp2.y;
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
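
A small worked example (plain C++, illustrative only) of the index arithmetic in dequantize_mul_mat_vec above, for q4_0-style parameters qk = 32, qr = 2:

#include <cstdio>

int main() {
    const int qk = 32, qr = 2, ncols = 4096;
    const int row = 3, col = 100;

    const int ib   = (row*ncols + col)/qk;   // x block index:          (3*4096 + 100)/32 = 387
    const int iqs  = (col % qk)/qr;          // x quant index in block: (100 % 32)/2      = 2
    const int iybs = col - col % qk;         // y block start index:    100 - 4           = 96
    printf("ib=%d iqs=%d iybs=%d\n", ib, iqs, iybs);
    return 0;
}
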
template <typename src1_t, typename dst_t, int n_thread, dot_kernel_k_t<src1_t, dst_t> dot_kernel>
static __global__ void dequantize_mul_mat_vec_k(const void * vx, const src1_t * y, dst_t * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const int iter_stride = QK_K;
const int vals_per_iter = iter_stride / n_thread;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
dst_t tmp = 0; // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = ib0 + col/QK_K; // x block index
const int iqs = col%QK_K; // x quant index
const int iybs = col - col%QK_K; // y block start index
dst_t v;
dot_kernel(vx, ib, iqs, y + iybs, v);
tmp += v;
}
// sum up partial sums and write back result
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}

File diff suppressed because it is too large

ggml-cuda.h

@ -6,30 +6,15 @@
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
GGML_API void ggml_cuda_host_register(void * ptr, size_t size);
GGML_API void ggml_cuda_host_unregister(void * ptr);
void ggml_init_cublas(void);
void ggml_cuda_set_tensor_split(const float * tensor_split);
// backend API
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
GGML_API struct ggml_backend * ggml_backend_cuda_init();
// TODO: export these with GGML_API
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
void ggml_cuda_free_data(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
void ggml_cuda_set_main_device(int main_device);
void ggml_cuda_set_scratch_size(size_t scratch_size);
void ggml_cuda_free_scratch(void);
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef __cplusplus
}
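
A minimal usage sketch for the pinned-memory helpers declared above (plain C++; only the prototypes shown in this header are assumed, presumably thin wrappers over cudaMallocHost/cudaHostRegister, and the sizes are illustrative):

#include <cstddef>
#include <cstdlib>
#include <cstring>

extern "C" void * ggml_cuda_host_malloc(size_t size);
extern "C" void   ggml_cuda_host_free(void * ptr);
extern "C" void   ggml_cuda_host_register(void * ptr, size_t size);
extern "C" void   ggml_cuda_host_unregister(void * ptr);

static void pinned_memory_example(void) {
    const size_t n = 16u * 1024 * 1024;

    // option 1: allocate pinned host memory directly (may return NULL on failure)
    void * staging = ggml_cuda_host_malloc(n);
    if (staging) {
        memset(staging, 0, n);
        ggml_cuda_host_free(staging);
    }

    // option 2: pin an existing allocation for the duration of host<->device transfers
    void * buf = malloc(n);
    if (buf) {
        ggml_cuda_host_register(buf, n);
        // ... use buf as a transfer buffer ...
        ggml_cuda_host_unregister(buf);
        free(buf);
    }
}
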

ggml.c (558 changed lines)

File diff suppressed because it is too large

ggml.h (80 changed lines)

@ -199,6 +199,7 @@
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
#define GGML_MAX_NAME 48
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4
@ -285,12 +286,6 @@ extern "C" {
GGML_TYPE_COUNT,
};
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
};
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
@ -405,8 +400,9 @@ extern "C" {
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
enum ggml_backend backend;
struct ggml_backend * backend;
enum ggml_type type;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -418,11 +414,18 @@ extern "C" {
// compute data
enum ggml_op op;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
bool is_param;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
bool visited; // used to build graphs
int n_children; // used by the allocator
int n_views;
// performance
int perf_runs;
int64_t perf_cycles;
@ -430,11 +433,11 @@ extern "C" {
void * data;
char name[GGML_MAX_NAME];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
char name[GGML_MAX_NAME];
char padding[12];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -459,6 +462,7 @@ extern "C" {
struct ggml_cgraph {
int n_nodes;
int n_leafs;
bool closed;
struct ggml_tensor * nodes[GGML_MAX_NODES];
struct ggml_tensor * grads[GGML_MAX_NODES];
@ -470,23 +474,21 @@ extern "C" {
int64_t perf_time_us;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
enum ggml_alloc_mode {
GGML_ALLOC_NONE, // do not allocate tensors
GGML_ALLOC_IMMEDIATE, // allocate tensors immediately
GGML_ALLOC_COMPUTE_SEQ, // delay allocation until graph build time, allocate tensors for sequential graph computation
//GGML_ALLOC_COMPUTE_PAR, // allocate tensors for parallel graph computation
};
// context parameters
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
struct ggml_buffer * buffer;
enum ggml_alloc_mode alloc_mode; // tensor allocation mode
enum ggml_type compute_type; // type of intermediate results
};
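
A hedged sketch of how a context might be set up with the reworked parameters above (the function that creates a ggml_buffer is outside this excerpt, so the buffer argument below is a placeholder handed in by the caller):

#include "ggml.h"

static struct ggml_context * make_compute_ctx(struct ggml_buffer * buffer) {
    struct ggml_init_params params = ggml_init_params_default(); // declared further down in this header
    params.buffer       = buffer;                  // backend buffer holding the tensor data
    params.alloc_mode   = GGML_ALLOC_COMPUTE_SEQ;  // defer allocation to graph build time
    params.compute_type = GGML_TYPE_F32;           // type of intermediate results
    return ggml_init(params);
}
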
// compute types
// task types
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
@ -547,19 +549,20 @@ extern "C" {
GGML_API size_t ggml_tensor_overhead(void);
// main
GGML_API struct ggml_init_params ggml_init_params_default(void);
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API void ggml_set_alloc_mode(struct ggml_context * ctx, enum ggml_alloc_mode mode);
// TODO: update for ggml_buffer
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
GGML_API struct ggml_buffer * ggml_get_buffer(const struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
@ -1121,6 +1124,17 @@ extern "C" {
int mode,
int n_ctx);
// custom RoPE
GGML_API struct ggml_tensor * ggml_rope_custom(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float freq_base,
float freq_scale,
int n_ctx);
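
For reference, a hedged call-site sketch for the new ggml_rope_custom declared above; the values are illustrative (freq_base 10000 is the conventional RoPE theta, and freq_scale 0.5 corresponds to squeezing a 2x longer context into the same positions via linear scaling):

#include "ggml.h"

static struct ggml_tensor * rope_linear_scaled(
        struct ggml_context * ctx, struct ggml_tensor * cur,
        int n_past, int n_rot, int n_ctx) {
    return ggml_rope_custom(ctx, cur, n_past, n_rot, /*mode=*/0,
                            /*freq_base=*/10000.0f, /*freq_scale=*/0.5f, n_ctx);
}
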
// custom RoPE, in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
struct ggml_context * ctx,
@ -1347,6 +1361,8 @@ extern "C" {
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
GGML_API void ggml_graph_close (struct ggml_cgraph * cgraph);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
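
The comment above implies the following calling pattern (a hedged sketch; the matching ggml_graph_compute() declaration is outside this excerpt, so its exact signature is assumed):

#include "ggml.h"
#include <cstdint>
#include <vector>

static void compute_graph(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    std::vector<uint8_t> work;
    if (plan.work_size > 0) {          // the caller owns the work buffer
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }

    ggml_graph_compute(gf, &plan);     // assumed signature: (cgraph, cplan)
}
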
@ -1561,9 +1577,8 @@ extern "C" {
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
@ -1594,3 +1609,6 @@ extern "C" {
#ifdef __cplusplus
}
#endif
#include "ggml-backend.h"

llama-util.h

@ -203,6 +203,17 @@ struct llama_mmap {
}
}
void discard(void * addr, size_t len) {
// align to the page size
int page_size = sysconf(_SC_PAGESIZE);
addr = (void *) (((uintptr_t) addr) & ~(page_size - 1));
len = (len + page_size - 1) & ~(page_size - 1);
if (madvise(addr, len, MADV_DONTNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_DONTNEED) failed: %s\n",
strerror(errno));
}
}
~llama_mmap() {
munmap(addr, size);
}
@ -247,6 +258,10 @@ struct llama_mmap {
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
void discard(void * addr, size_t len) {
VirtualAlloc(addr, len, MEM_RESET, PAGE_NOACCESS);
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
@ -262,6 +277,13 @@ struct llama_mmap {
throw std::runtime_error(std::string("mmap not supported"));
}
void discard(void * addr, size_t len) {
(void) addr;
(void) len;
throw std::runtime_error(std::string("mmap not supported"));
}
#endif
};
@ -419,28 +441,13 @@ struct llama_buffer {
llama_buffer() = default;
void resize(size_t len) {
#ifdef GGML_USE_METAL
free(addr);
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
}
else {
addr = NULL;
}
#else
delete[] addr;
addr = new uint8_t[len];
#endif
size = len;
}
~llama_buffer() {
#ifdef GGML_USE_METAL
free(addr);
#else
delete[] addr;
#endif
addr = NULL;
}
@ -451,54 +458,4 @@ struct llama_buffer {
llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
void free() {
if (addr) {
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif

llama.cpp (1634 changed lines)

File diff suppressed because it is too large

llama.h

@ -2,12 +2,7 @@
#define LLAMA_H
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
@ -48,7 +43,7 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
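
A hedged example of how the macro above is typically consumed by client code (n_gpu_layers is the pre-existing llama_context_params field for layer offloading; the value is illustrative):

#include "llama.h"

static struct llama_context_params make_params(void) {
    struct llama_context_params params = llama_context_default_params();
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    params.n_gpu_layers = 32;   // offload up to 32 layers when GPU support was compiled in
#endif
    return params;
}
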