diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 49b478d99..c98cbcbbe 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
push:
branches:
- master
- paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+ paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
pull_request:
types: [opened, synchronize, reopened]
- paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+ paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -151,21 +151,21 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
- CLBLAST_VERSION: 1.5.3
+ CLBLAST_VERSION: 1.6.0
strategy:
matrix:
include:
- build: 'avx2'
- defines: ''
+ defines: '-DLLAMA_BUILD_SERVER=ON'
- build: 'avx'
- defines: '-DLLAMA_AVX2=OFF'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
- build: 'avx512'
- defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast'
- defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
- defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
+ defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
steps:
- name: Clone
@@ -184,13 +184,13 @@ jobs:
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
- curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
+ curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
- mkdir $env:RUNNER_TEMP/clblast
- tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
+ 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+ rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
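+        # the prebuilt CLBlast CMake package hard-codes the OpenCL path of the machine it was built on (vcpkg); point it at the extracted OpenCL SDK instead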
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
- $txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+ $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
@@ -213,7 +213,6 @@ jobs:
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
- cp ../LICENSE ./bin/Release/llama.cpp.txt
- name: Add clblast.dll
id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
+ Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -292,7 +292,7 @@ jobs:
run: |
mkdir build
cd build
- cmake .. -DLLAMA_CUBLAS=ON
+ cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
cmake --build . --config Release
- name: Get commit hash
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3471e44f2..21f4ec9dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,42 +37,44 @@ endif()
#
# general
-option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
-option(LLAMA_LTO "llama: enable link time optimization" OFF)
+option(LLAMA_STATIC "llama: static link libraries" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug
-option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF "llama: enable gprof" OFF)
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+option(LLAMA_GPROF "llama: enable gprof" OFF)
# sanitizers
-option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
-option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# instruction set specific
-option(LLAMA_AVX "llama: enable AVX" ON)
-option(LLAMA_AVX2 "llama: enable AVX2" ON)
-option(LLAMA_AVX512 "llama: enable AVX512" OFF)
-option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
-option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_FMA "llama: enable FMA" ON)
+option(LLAMA_AVX "llama: enable AVX" ON)
+option(LLAMA_AVX2 "llama: enable AVX2" ON)
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
- option(LLAMA_F16C "llama: enable F16C" ON)
+ option(LLAMA_F16C "llama: enable F16C" ON)
endif()
# 3rd party libs
-option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
-option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
-option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
+option(LLAMA_BLAS "llama: use BLAS" OFF)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
-option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
+option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER "llama: build server example" OFF)
#
# Build info header
@@ -184,6 +186,8 @@ if (LLAMA_CUBLAS)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
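+        # expose the dmmv kernel block sizes as compile-time definitions so they can be tuned from the CMake command line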
+ add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+ add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -201,7 +205,7 @@ if (LLAMA_CLBLAST)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
- set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+ set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
add_compile_definitions(GGML_USE_CLBLAST)
diff --git a/Makefile b/Makefile
index 9e2f8aa3c..70bd5e90a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
# Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+ BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
ifndef UNAME_S
UNAME_S := $(shell uname -s)
@@ -133,11 +139,22 @@ ifdef LLAMA_CUBLAS
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ifdef LLAMA_CUDA_DMMV_X
+ NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+else
+ NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # LLAMA_CUDA_DMMV_X
+ifdef LLAMA_CUDA_DMMV_Y
+ NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+else
+ NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+endif # LLAMA_CUDA_DMMV_Y
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif
+endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
+ CXXFLAGS += -DGGML_USE_CLBLAST
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL
@@ -145,8 +162,8 @@ ifdef LLAMA_CLBLAST
LDFLAGS += -lclblast -lOpenCL
endif
OBJS += ggml-opencl.o
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
- $(CC) $(CFLAGS) -c $< -o $@
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
@@ -199,7 +216,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
- rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+ rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
#
# Examples
@@ -226,6 +243,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
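+# the server example bundles httplib.h and json.hpp under examples/server; build it directly with `make server` or add it to the default targets with `make LLAMA_BUILD_SERVER=1`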
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
diff --git a/README.md b/README.md
index 102cde43f..00571d8e1 100644
--- a/README.md
+++ b/README.md
@@ -240,11 +240,11 @@ In order to build llama.cpp you have three different options.
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently several different implementations of it:
-- Accelerate Framework:
+- **Accelerate Framework**:
This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
-- OpenBLAS:
+- **OpenBLAS**:
This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
@@ -278,11 +278,11 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . --config Release
```
-- BLIS
+- **BLIS**
Check [BLIS.md](BLIS.md) for more information.
-- Intel MKL
+- **Intel MKL**
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. You may also specify it explicitly:
@@ -293,7 +293,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake --build . --config Release
```
-- cuBLAS
+- **cuBLAS**
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
@@ -308,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
cmake .. -DLLAMA_CUBLAS=ON
cmake --build . --config Release
```
+ Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+- **CLBlast**
+
+ OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+ You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+ - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+  - Installing the OpenCL SDK from source:
+
+ ```sh
+ git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+ mkdir OpenCL-SDK/build
+ cd OpenCL-SDK/build
+ cmake .. -DBUILD_DOCS=OFF \
+ -DBUILD_EXAMPLES=OFF \
+ -DBUILD_TESTING=OFF \
+ -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+ -DOPENCL_SDK_TEST_SAMPLES=OFF
+ cmake --build . --config Release
+ cmake --install . --prefix /some/path
+ ```
+
+  Installing CLBlast: it may be found in your operating system's packages.
+
+  - If not, install it from source:
+
+ ```sh
+ git clone https://github.com/CNugteren/CLBlast.git
+ mkdir CLBlast/build
+    cd CLBlast/build
+ cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+ cmake --build . --config Release
+ cmake --install . --prefix /some/path
+ ```
+
+  Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+
+
+ Building:
+
+ - Build with make:
+ ```sh
+ make LLAMA_CLBLAST=1
+ ```
+  - Build with CMake:
+ ```sh
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+ cmake --build . --config Release
+ ```
+
+ Running:
+
+ The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+ To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+ The selection can be a number (starting from 0) or a text string to search:
+
+ ```sh
+ GGML_OPENCL_PLATFORM=1 ./main ...
+ GGML_OPENCL_DEVICE=2 ./main ...
+ GGML_OPENCL_PLATFORM=Intel ./main ...
+ GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+ ```
+
+  The default behavior is to use the first GPU device found, but when that is an integrated GPU on a laptop, for instance, these selectors are useful.
+  The variables can also be used to select a CPU-based driver, if desired.
+
+  You can get a list of available platforms and devices with the `clinfo -l` command.
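+
+  For example, an illustrative run that offloads layers to the GPU (the model path and layer count are placeholders; adjust them for your setup):
+
+  ```sh
+  GGML_OPENCL_PLATFORM=AMD ./main -m ./models/7B/ggml-model-q4_0.bin -ngl 32 -p "Building a website can be done in 10 simple steps:"
+  ```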
### Prepare Data & Run
@@ -391,6 +464,25 @@ Note the use of `--color` to distinguish between user input and generated text.
+### Persistent Interaction
+
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+
+```bash
+# Start a new chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Resume that chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Start a different chat with the same prompt/model
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
+
+# Different prompt cache for different prompt/model
+PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
+ CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
+```
+
### Instruction mode with Alpaca
1. First, download the `ggml` Alpaca model into the `./models` folder
diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
index b32284b49..e0c251e5b 100755
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -23,8 +23,8 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
-SESSION_SIZE_MSG_PATTERN='main: session file matches \d+ / \d+'
-SAMPLE_TIME_MSG_PATTERN='sample time =\s+\d+.\d+ ms /\s+\d+'
+SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
+SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
diff --git a/examples/main/README.md b/examples/main/README.md
index 7c03f92c8..e71ba6173 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### Prompt Caching
-- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
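+
+For example (illustrative paths; the cache file is created on the first run and reused afterwards):
+
+```sh
+./main -m models/7B/ggml-model-q4_0.bin --prompt-cache prompt.cache.bin -p "Once upon a time"
+```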
### Quantization
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 47b418d97..c7c591537 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
return 0;
}
- // Add a space in front of the first character to match OG llama tokenizer behavior
- params.prompt.insert(0, 1, ' ');
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
return 1;
}
session_tokens.resize(n_token_count_out);
+ llama_set_rng_seed(ctx, params.seed);
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
}
// tokenize the prompt
- auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
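+    // use the prompt only when one was provided (or there is no session to fall back on); otherwise reuse the tokens stored in the session file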
+ if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+ // Add a space in front of the first character to match OG llama tokenizer behavior
+ params.prompt.insert(0, 1, ' ');
+
+ embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+ } else {
+ embd_inp = session_tokens;
+ }
const int n_ctx = llama_n_ctx(ctx);
@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
}
n_matching_session_tokens++;
}
- if (n_matching_session_tokens >= embd_inp.size()) {
+ if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+ fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+ } else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7209a2b52..3904412cb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,7 @@ struct llama_server_context
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
// compare the evaluated prompt with the new prompt
int new_prompt_len = 0;
- for (int i = 0;i < prompt_tokens.size(); i++) {
+ for (size_t i = 0; i < prompt_tokens.size(); i++) {
if (i < processed_tokens.size() &&
processed_tokens[i] == prompt_tokens[i])
{
@@ -71,7 +71,7 @@ struct llama_server_context
{
embd_inp.push_back(prompt_tokens[i]);
if(new_prompt_len == 0) {
- if(i - 1 < n_past) {
+ if(int32_t(i) - 1 < n_past) {
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
}
// Evaluate the new fragment prompt from the last token processed.
@@ -136,7 +136,7 @@ struct llama_server_context
{
// out of user input, sample next token
const float temp = params.temp;
- const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+ // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
@@ -306,12 +306,12 @@ struct llama_server_context
// Avoid add the no show words to the response
for (std::vector<llama_token> word_tokens : no_show_words)
{
- int match_token = 1;
+ size_t match_token = 1;
if (tokens_predicted.front() == word_tokens.front())
{
bool execute_matching = true;
if (tokens_predicted.size() > 1) { // if previus tokens had been tested
- for (int i = 1; i < word_tokens.size(); i++)
+ for (size_t i = 1; i < word_tokens.size(); i++)
{
if (i >= tokens_predicted.size()) {
match_token = i;
@@ -601,7 +601,7 @@ int main(int argc, char **argv)
Server svr;
- svr.Get("/", [](const Request &req, Response &res)
+ svr.Get("/", [](const Request &, Response &res)
{ res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
svr.Post("/completion", [&llama](const Request &req, Response &res)
@@ -649,7 +649,7 @@ int main(int argc, char **argv)
{"tokens_predicted", llama.num_tokens_predicted}};
return res.set_content(data.dump(), "application/json");
}
- catch (json::exception e)
+ catch (const json::exception &e)
{
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
json data = {
@@ -701,7 +701,7 @@ int main(int argc, char **argv)
{"content", result },
{"stop", !llama.has_next_token }};
return res.set_content(data.dump(), "application/json");
- } catch (json::exception e) {
+ } catch (const json::exception &e) {
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
json data = {
{"content", "" },
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 35d2e457c..98170a3ae 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -83,9 +83,19 @@ typedef struct {
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+#define WARP_SIZE 32
+
#define CUDA_MUL_BLOCK_SIZE 256
+
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+#ifndef GGML_CUDA_DMMV_Y
+#define GGML_CUDA_DMMV_Y 1
+#endif
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -200,41 +210,51 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
dequantize_kernel(vx, ib, iqs, v0, v1);
}
-template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
- const int row = blockIdx.x;
+ // qk = quantized weights per x block
+ // qr = number of quantized weights per data value in x block
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
+ const int iter_stride = 2*GGML_CUDA_DMMV_X;
+ const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
const int y_offset = qr == 1 ? 1 : qk/2;
- __shared__ float tmp[block_size]; // separate sum for each thread
- tmp[tid] = 0;
+ float tmp = 0; // partial sum for thread in warp
- for (int i = 0; i < ncols/block_size; i += 2) {
- const int col = i*block_size + 2*tid;
- const int ib = (row*ncols + col)/qk; // block index
- const int iqs = (col%qk)/qr; // quant index
+ for (int i = 0; i < ncols; i += iter_stride) {
+ const int col = i + vals_per_iter*tid;
+ const int ib = (row*ncols + col)/qk; // x block index
+ const int iqs = (col%qk)/qr; // x quant index
const int iybs = col - col%qk; // y block start index
- // dequantize
- float v0, v1;
- dequantize_kernel(vx, ib, iqs, v0, v1);
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+ for (int j = 0; j < vals_per_iter; j += 2) {
+ // process 2 vals per j iter
- // matrix multiplication
- tmp[tid] += v0 * y[iybs + iqs + 0];
- tmp[tid] += v1 * y[iybs + iqs + y_offset];
+ // dequantize
+ float v0, v1;
+ dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
+ // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+
+ // matrix multiplication
+ tmp += v0 * y[iybs + iqs + j/qr + 0];
+ tmp += v1 * y[iybs + iqs + j/qr + y_offset];
+ // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+ }
}
// sum up partial sums and write back result
__syncthreads();
- for (int s=block_size/2; s>0; s>>=1) {
- if (tid < s) {
- tmp[tid] += tmp[tid + s];
- }
- __syncthreads();
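+    // butterfly reduction over the warp: after the shuffle steps every lane holds the full sum for this row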
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
+
if (tid == 0) {
- dst[row] = tmp[0];
+ dst[row] = tmp;
}
}
@@ -269,33 +289,43 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu
}
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
-    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
-        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
+        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
-    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
-        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
+        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
-    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
-        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
+        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
-    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
-        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
+        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
-    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
-        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
+        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
@@ -304,9 +334,11 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
- GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
-    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 1, 1, convert_f16>
-        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+ GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
+ const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+ dequantize_mul_mat_vec<1, 1, convert_f16>
+        <<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
diff --git a/ggml-opencl.c b/ggml-opencl.c
deleted file mode 100644
index e26631fcf..000000000
--- a/ggml-opencl.c
+++ /dev/null
@@ -1,474 +0,0 @@
-#include "ggml-opencl.h"
-
-#define CL_TARGET_OPENCL_VERSION 110
-#include <clblast_c.h>
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include "ggml.h"
-
-#define MULTILINE_QUOTE(...) #__VA_ARGS__
-static const char * program_source = MULTILINE_QUOTE(
-
-typedef char int8_t;
-typedef uchar uint8_t;
-typedef int int32_t;
-typedef uint uint32_t;
-
-struct __attribute__ ((packed)) block_q4_0
-{
- half d;
- uint8_t qs[16]; /* QK4_0 / 2 */
-};
-
-struct __attribute__ ((packed)) block_q4_1
-{
- half d;
- half m;
- uint8_t qs[16]; /* QK4_1 / 2 */
-};
-
-struct __attribute__ ((packed)) block_q5_0
-{
- half d;
- uint32_t qh;
- uint8_t qs[16]; /* QK5_0 / 2 */
-};
-
-struct __attribute__ ((packed)) block_q5_1
-{
- half d;
- half m;
- uint32_t qh;
- uint8_t qs[16]; /* QK5_1 / 2 */
-};
-
-struct __attribute__ ((packed)) block_q8_0
-{
- half d;
- int8_t qs[32]; /* QK8_0 */
-};
-
-
-__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
- const uint i = get_global_id(0) / 32; /* QK4_0 */
- const uint j = get_local_id(0);
-
- const float d = vload_half(0, (__global half*) &x[i].d);
-
- const int x0 = (x[i].qs[j] & 0xf) - 8;
- const int x1 = (x[i].qs[j] >> 4) - 8;
-
- y[i*32 + j + 0 ] = x0*d;
- y[i*32 + j + 16] = x1*d;
-}
-
-__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
- const uint i = get_global_id(0) / 32; /* QK4_1 */
- const uint j = get_local_id(0);
-
- const float d = vload_half(0, (__global half*) &x[i].d);
- const float m = vload_half(0, (__global half*) &x[i].m);
-
- const int x0 = (x[i].qs[j] & 0xf);
- const int x1 = (x[i].qs[j] >> 4);
-
- y[i*32 + j + 0 ] = x0*d + m;
- y[i*32 + j + 16] = x1*d + m;
-}
-
-__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
- const uint i = get_global_id(0) / 32; /* QK5_0 */
- const uint j = get_local_id(0);
-
- const float d = vload_half(0, (__global half*) &x[i].d);
-
- uint32_t qh = x[i].qh;
-
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
- const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
- const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
-
- y[i*32 + j + 0 ] = x0*d;
- y[i*32 + j + 16] = x1*d;
-}
-
-__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
- const uint i = get_global_id(0) / 32; /* QK5_1 */
- const uint j = get_local_id(0);
-
- const float d = vload_half(0, (__global half*) &x[i].d);
- const float m = vload_half(0, (__global half*) &x[i].m);
-
- uint32_t qh = x[i].qh;
-
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
-
- const int x0 = (x[i].qs[j] & 0xf) | xh_0;
- const int x1 = (x[i].qs[j] >> 4) | xh_1;
-
- y[i*32 + j + 0 ] = x0*d + m;
- y[i*32 + j + 16] = x1*d + m;
-}
-
-__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
- const uint i = get_global_id(0) / 32; /* QK8_0 */
- const uint j = get_local_id(0);
-
- const float d = vload_half(0, (__global half*) &x[i].d);
- y[i*32 + j] = x[i].qs[j]*d;
-}
-
-);
-
-#define CL_CHECK(err) \
- do { \
- cl_int err_ = (err); \
- if (err_ != CL_SUCCESS) { \
- fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
- #err, err_, __FILE__, __LINE__); \
- exit(1); \
- } \
- } while (0)
-
-#define CLBLAST_CHECK(err) \
- do { \
- CLBlastStatusCode err_ = (err); \
- if (err_ != CLBlastSuccess) { \
- fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
- #err, err_, __FILE__, __LINE__); \
- exit(1); \
- } \
- } while (0)
-
-static cl_platform_id platform;
-static cl_device_id device;
-static cl_context context;
-static cl_command_queue queue;
-static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
-static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
-static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
-
-static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
- cl_program p;
- char *program_log;
- size_t program_size, log_size;
- int err;
-
- program_size = strlen(program_buffer);
-
- p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
- if(err < 0) {
- fprintf(stderr, "OpenCL error creating program");
- exit(1);
- }
-
- err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
- if(err < 0) {
-
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- program_log = (char*) malloc(log_size + 1);
- program_log[log_size] = '\0';
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
- printf("%s\n", program_log);
- free(program_log);
- exit(1);
- }
-
- return p;
-}
-
-void ggml_cl_init(void) {
- cl_int err = 0;
-
- struct cl_device;
- struct cl_platform {
- cl_platform_id id;
- unsigned number;
- char name[128];
- char vendor[128];
- struct cl_device * devices;
- unsigned n_devices;
- struct cl_device * default_device;
- };
-
- struct cl_device {
- struct cl_platform * platform;
- cl_device_id id;
- unsigned number;
- cl_device_type type;
- char name[128];
- };
-
- enum { NPLAT = 16, NDEV = 16 };
-
- struct cl_platform platforms[NPLAT];
- unsigned n_platforms = 0;
- struct cl_device devices[NDEV];
- unsigned n_devices = 0;
- struct cl_device * default_device = NULL;
-
- platform = NULL;
- device = NULL;
-
- cl_platform_id platform_ids[NPLAT];
- CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
-
- for (unsigned i = 0; i < n_platforms; i++) {
- struct cl_platform * p = &platforms[i];
- p->number = i;
- p->id = platform_ids[i];
- CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
- CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
-
- cl_device_id device_ids[NDEV];
- cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
- if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
- p->n_devices = 0;
- } else {
- CL_CHECK(clGetDeviceIDsError);
- }
- p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
- p->default_device = NULL;
-
- for (unsigned j = 0; j < p->n_devices; j++) {
- struct cl_device * d = &devices[n_devices];
- d->number = n_devices++;
- d->id = device_ids[j];
- d->platform = p;
- CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
- CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
-
- if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
- p->default_device = d;
- }
- }
-
- if (default_device == NULL && p->default_device != NULL) {
- default_device = p->default_device;
- }
- }
-
- if (n_devices == 0) {
- fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
- exit(1);
- }
-
- char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
- char * user_device_string = getenv("GGML_OPENCL_DEVICE");
- int user_platform_number = -1;
- int user_device_number = -1;
-
- unsigned n;
- if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
- user_platform_number = (int)n;
- }
- if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
- user_device_number = (int)n;
- }
-
- struct cl_device * selected_devices = devices;
- unsigned n_selected_devices = n_devices;
-
- if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
- for (unsigned i = 0; i < n_platforms; i++) {
- struct cl_platform * p = &platforms[i];
- if (strstr(p->name, user_platform_string) != NULL ||
- strstr(p->vendor, user_platform_string) != NULL) {
- user_platform_number = (int)i;
- break;
- }
- }
- if (user_platform_number == -1) {
- fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
- exit(1);
- }
- }
- if (user_platform_number != -1) {
- struct cl_platform * p = &platforms[user_platform_number];
- selected_devices = p->devices;
- n_selected_devices = p->n_devices;
- default_device = p->default_device;
- if (n_selected_devices == 0) {
- fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
- exit(1);
- }
- }
-
- if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
- for (unsigned i = 0; i < n_selected_devices; i++) {
- struct cl_device * d = &selected_devices[i];
- if (strstr(d->name, user_device_string) != NULL) {
- user_device_number = d->number;
- break;
- }
- }
- if (user_device_number == -1) {
- fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
- exit(1);
- }
- }
- if (user_device_number != -1) {
- selected_devices = &devices[user_device_number];
- n_selected_devices = 1;
- default_device = &selected_devices[0];
- }
-
- GGML_ASSERT(n_selected_devices > 0);
-
- if (default_device == NULL) {
- default_device = &selected_devices[0];
- }
-
- fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
- fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
- if (default_device->type != CL_DEVICE_TYPE_GPU) {
- fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
- }
-
- platform = default_device->platform->id;
- device = default_device->id;
-
- cl_context_properties properties[] = {
- (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
- };
-
- CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
-
- CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
- (err != CL_INVALID_PROPERTY && err != CL_INVALID_VALUE ? err :
- (queue = clCreateCommandQueue(context, device, 0, &err), err)
- )));
-
- program = build_program_from_source(context, device, program_source);
-
- // Prepare dequantize kernels
- CL_CHECK((kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
- CL_CHECK((kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
- CL_CHECK((kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
- CL_CHECK((kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
- CL_CHECK((kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
-}
-
-static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
- if (req_size <= *cur_size) {
- return;
- }
-
- // Reallocate buffer with enough space
- if (*cur_size > 0) {
- clReleaseMemObject(*buf);
- }
- cl_int err;
- CL_CHECK((*buf = clCreateBuffer(context, flags, req_size, NULL, &err), err));
- *cur_size = req_size;
-}
-
-void ggml_cl_sgemm_wrapper(
- const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
- const int m, const int n, const int k,
- const float alpha, const void *host_a, const int lda,
- const float *host_b, const int ldb, const float beta,
- float *host_c, const int ldc, const int btype) {
-
- cl_kernel kernel;
- size_t global = n * k, local, size_qb;
- bool dequant;
-
- switch (btype) {
- case GGML_TYPE_F32:
- dequant = false;
- break;
- case GGML_TYPE_Q4_0:
- dequant = true;
- kernel = kernel_q4_0;
- local = 16;
- size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
- break;
- case GGML_TYPE_Q4_1:
- dequant = true;
- kernel = kernel_q4_1;
- local = 16;
- size_qb = global * (sizeof(ggml_fp16_t) * 2 + local) / 32;
- break;
- case GGML_TYPE_Q5_0:
- dequant = true;
- kernel = kernel_q5_0;
- local = 16;
- size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
- break;
- case GGML_TYPE_Q5_1:
- dequant = true;
- kernel = kernel_q5_1;
- local = 16;
- size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
- break;
- case GGML_TYPE_Q8_0:
- dequant = true;
- kernel = kernel_q8_0;
- local = 32;
- size_qb = global * (sizeof(ggml_fp16_t) + local) / 32;
- break;
- default:
- fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
- abort();
- }
-
- const size_t size_a = m * k * sizeof(float);
- const size_t size_b = n * k * sizeof(float);
- const size_t size_c = m * n * sizeof(float);
-
- // Prepare buffers
- ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
- if (dequant) {
- ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
- }
- ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
- ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
-
- cl_event ev_a, ev_qb, ev_b;
-
- if (dequant) {
- CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb));
- CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b));
- CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb));
- } else {
- CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b));
- }
-
- CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a));
- if (dequant) {
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b));
- CL_CHECK(clReleaseEvent(ev_qb));
- }
- CL_CHECK(clWaitForEvents(1, &ev_a));
- CL_CHECK(clWaitForEvents(1, &ev_b));
- CL_CHECK(clReleaseEvent(ev_a));
- CL_CHECK(clReleaseEvent(ev_b));
-
- cl_event ev_sgemm;
- CLBLAST_CHECK(CLBlastSgemm(
- (CLBlastLayout)order,
- (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
- m, n, k,
- alpha,
- cl_buffer_a, 0, lda,
- cl_buffer_b, 0, ldb,
- beta,
- cl_buffer_c, 0, ldc,
- &queue, &ev_sgemm));
-
- cl_event ev_c;
- CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c));
-
- // Wait for completion
- CL_CHECK(clWaitForEvents(1, &ev_c));
- CL_CHECK(clReleaseEvent(ev_sgemm));
- CL_CHECK(clReleaseEvent(ev_c));
-}
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
new file mode 100644
index 000000000..667f55e79
--- /dev/null
+++ b/ggml-opencl.cpp
@@ -0,0 +1,1034 @@
+#include "ggml-opencl.h"
+
+#include <array>
+#include <atomic>
+#include <sstream>
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast_c.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "ggml.h"
+
+#define CL_DMMV_BLOCK_SIZE 32
+
+#define MULTILINE_QUOTE(...) #__VA_ARGS__
+static std::string program_source = MULTILINE_QUOTE(
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+struct __attribute__ ((packed)) block_q4_0
+{
+ half d;
+ uint8_t qs[QK4_0 / 2];
+};
+
+struct __attribute__ ((packed)) block_q4_1
+{
+ half d;
+ half m;
+ uint8_t qs[QK4_1 / 2];
+};
+
+struct __attribute__ ((packed)) block_q5_0
+{
+ half d;
+ uint32_t qh;
+ uint8_t qs[QK5_0 / 2];
+};
+
+struct __attribute__ ((packed)) block_q5_1
+{
+ half d;
+ half m;
+ uint32_t qh;
+ uint8_t qs[QK5_1 / 2];
+};
+
+struct __attribute__ ((packed)) block_q8_0
+{
+ half d;
+ int8_t qs[QK8_0];
+};
+
+
+__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
+ const uint i = get_global_id(0);
+
+ y[i] = vload_half(0, &x[i]);
+}
+
+void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) {
+ const float d = vload_half(0, &x[ib].d);
+
+ const uint8_t vui = x[ib].qs[iqs];
+
+ const int8_t vi0 = vui & 0xF;
+ const int8_t vi1 = vui >> 4;
+
+ *v0 = (vi0 - 8)*d;
+ *v1 = (vi1 - 8)*d;
+}
+void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
+ const float d = vload_half(0, &x[ib].d);
+ const float m = vload_half(0, &x[ib].m);
+
+ const uint8_t vui = x[ib].qs[iqs];
+
+ const int8_t vi0 = vui & 0xF;
+ const int8_t vi1 = vui >> 4;
+
+ *v0 = vi0*d + m;
+ *v1 = vi1*d + m;
+}
+void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
+ const float d = vload_half(0, &x[ib].d);
+
+ uint32_t qh = x[ib].qh;
+
+ const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+ const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
+ const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
+
+ *v0 = x0*d;
+ *v1 = x1*d;
+}
+void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
+ const float d = vload_half(0, &x[ib].d);
+ const float m = vload_half(0, &x[ib].m);
+
+ uint32_t qh = x[ib].qh;
+
+ const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+ const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+ *v0 = x0*d + m;
+ *v1 = x1*d + m;
+}
+void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
+ const float d = vload_half(0, &x[ib].d);
+
+ const int8_t vi0 = x[ib].qs[iqs + 0];
+ const int8_t vi1 = x[ib].qs[iqs + 1];
+
+ *v0 = vi0*d;
+ *v1 = vi1*d;
+}
+void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
+ *v0 = vload_half(0, &x[ib + 0]);
+ *v1 = vload_half(0, &x[ib + 1]);
+}
+);
+
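+// dequantization kernel template; KERNEL_NAME, X_TYPE, QUANT_K, QUANT_R and DEQUANT_FUNC are substituted per quantization type in generate_kernels()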
+std::string dequant_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
+
+ if (i >= get_global_size(0)) {
+ return;
+ }
+
+ const uint qk = QUANT_K;
+ const uint qr = QUANT_R;
+
+ const int ib = i/qk; // block index
+ const int iqs = (i%qk)/qr; // quant index
+ const int iybs = i - i%qk; // y block start index
+ const int y_offset = qr == 1 ? 1 : qk/2;
+
+ // dequantize
+ float v0, v1;
+ DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
+ y[iybs + iqs + 0] = v0;
+ y[iybs + iqs + y_offset] = v1;
+}
+);
+
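+// dequantize + mat-vec template: each work-group handles one output row, accumulating partial sums in local memory and tree-reducing them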
+std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
+__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
+ const int block_size = get_local_size(0);
+ const int row = get_global_id(0) / block_size;
+ const int tid = get_local_id(0);
+
+ const uint qk = QUANT_K;
+ const uint qr = QUANT_R;
+
+ const int y_offset = qr == 1 ? 1 : qk/2;
+
+ tmp[tid] = 0;
+
+ for (int i = 0; i < ncols/block_size; i += 2) {
+ const int col = i*block_size + 2*tid;
+ const int ib = (row*ncols + col)/qk; // block index
+ const int iqs = (col%qk)/qr; // quant index
+ const int iybs = col - col%qk; // y block start index
+
+ // dequantize
+ float v0, v1;
+ DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
+
+ // matrix multiplication
+ tmp[tid] += v0 * y[iybs + iqs + 0];
+ tmp[tid] += v1 * y[iybs + iqs + y_offset];
+ }
+
+ // sum up partial sums and write back result
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int s=block_size/2; s>0; s>>=1) {
+ if (tid < s) {
+ tmp[tid] += tmp[tid + s];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ if (tid == 0) {
+ dst[row] = tmp[0];
+ }
+}
+);
+
+#define CL_CHECK(err) \
+ do { \
+ cl_int err_ = (err); \
+ if (err_ != CL_SUCCESS) { \
+ fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
+ #err, err_, __FILE__, __LINE__); \
+ exit(1); \
+ } \
+ } while (0)
+
+#define CLBLAST_CHECK(err) \
+ do { \
+ CLBlastStatusCode err_ = (err); \
+ if (err_ != CLBlastSuccess) { \
+ fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
+ #err, err_, __FILE__, __LINE__); \
+ exit(1); \
+ } \
+ } while (0)
+
+std::array<std::string, 5> dequant_str_keys = {
+ "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
+};
+
+std::array<std::string, 30> dequant_str_values = {
+ "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
+ "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
+ "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
+ "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
+ "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
+ "convert_row_f16", "half", "1", "1", "convert_f16"
+};
+
+std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
+ "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
+ "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
+ "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
+ "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
+ "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
+ "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
+};
+
+std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+ size_t pos = 0;
+ while ((pos = s.find(from, pos)) != std::string::npos) {
+ s.replace(pos, from.length(), to);
+ pos += to.length();
+ }
+ return s;
+}
+
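+// build the full OpenCL program source: the common code plus one dequantize and one dmmv kernel per quantization type, generated from the templates above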
+std::string generate_kernels() {
+ std::stringstream src;
+ src << program_source << '\n';
+ for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
+ std::string dequant_kernel = dequant_template;
+ std::string dmmv_kernel = dequant_mul_mat_vec_template;
+ for (size_t j = 0; j < dequant_str_keys.size(); j++) {
+ replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
+ replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
+ }
+ src << dequant_kernel << '\n';
+ src << dmmv_kernel << '\n';
+ }
+ return src.str();
+}
+
+static cl_platform_id platform;
+static cl_device_id device;
+static cl_context context;
+static cl_command_queue queue;
+static cl_program program;
+static cl_kernel convert_row_f16_cl;
+static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
+static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
+static bool fp16_support;
+
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
+ cl_program p;
+ char *program_log;
+ size_t program_size;
+ size_t log_size;
+ int err;
+
+ program_size = strlen(program_buffer);
+
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+ if(err < 0) {
+ fprintf(stderr, "OpenCL error creating program");
+ exit(1);
+ }
+
+ const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
+ "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
+
+ err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
+ if(err < 0) {
+
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+ program_log = (char*) malloc(log_size + 1);
+ program_log[log_size] = '\0';
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+ fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log);
+ free(program_log);
+ exit(1);
+ }
+
+ return p;
+}
+
+void ggml_cl_init(void) {
+ cl_int err;
+
+ struct cl_device;
+ struct cl_platform {
+ cl_platform_id id;
+ unsigned number;
+ char name[128];
+ char vendor[128];
+ struct cl_device * devices;
+ unsigned n_devices;
+ struct cl_device * default_device;
+ };
+
+ struct cl_device {
+ struct cl_platform * platform;
+ cl_device_id id;
+ unsigned number;
+ cl_device_type type;
+ char name[128];
+ };
+
+ enum { NPLAT = 16, NDEV = 16 };
+
+ struct cl_platform platforms[NPLAT];
+ unsigned n_platforms = 0;
+ struct cl_device devices[NDEV];
+ unsigned n_devices = 0;
+ struct cl_device * default_device = NULL;
+
+ platform = NULL;
+ device = NULL;
+
+ cl_platform_id platform_ids[NPLAT];
+ CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
+
+ for (unsigned i = 0; i < n_platforms; i++) {
+ struct cl_platform * p = &platforms[i];
+ p->number = i;
+ p->id = platform_ids[i];
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
+
+ cl_device_id device_ids[NDEV];
+ cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
+ if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
+ p->n_devices = 0;
+ } else {
+ CL_CHECK(clGetDeviceIDsError);
+ }
+ p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
+ p->default_device = NULL;
+
+ for (unsigned j = 0; j < p->n_devices; j++) {
+ struct cl_device * d = &devices[n_devices];
+ d->number = n_devices++;
+ d->id = device_ids[j];
+ d->platform = p;
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
+
+ if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
+ p->default_device = d;
+ }
+ }
+
+ if (default_device == NULL && p->default_device != NULL) {
+ default_device = p->default_device;
+ }
+ }
+
+ if (n_devices == 0) {
+        fprintf(stderr, "ggml_opencl: could not find any OpenCL devices.\n");
+ exit(1);
+ }
+
+ char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+ char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+ int user_platform_number = -1;
+ int user_device_number = -1;
+
+ unsigned n;
+ if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
+ user_platform_number = (int)n;
+ }
+ if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
+ user_device_number = (int)n;
+ }
+ if (user_platform_number != -1 && user_device_number != -1) {
+ cl_platform* platform = &platforms[user_platform_number];
+ if ((unsigned)user_device_number >= platform->n_devices) {
+ fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number);
+ exit(1);
+ }
+ default_device = &platform->devices[user_device_number];
+ } else {
+
+ struct cl_device * selected_devices = devices;
+ unsigned n_selected_devices = n_devices;
+
+ if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
+ for (unsigned i = 0; i < n_platforms; i++) {
+ struct cl_platform * p = &platforms[i];
+ if (strstr(p->name, user_platform_string) != NULL ||
+ strstr(p->vendor, user_platform_string) != NULL) {
+ user_platform_number = (int)i;
+ break;
+ }
+ }
+ if (user_platform_number == -1) {
+ fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
+ exit(1);
+ }
+ }
+ if (user_platform_number != -1) {
+ struct cl_platform * p = &platforms[user_platform_number];
+ selected_devices = p->devices;
+ n_selected_devices = p->n_devices;
+ default_device = p->default_device;
+ if (n_selected_devices == 0) {
+ fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+ exit(1);
+ }
+ }
+
+ if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
+ for (unsigned i = 0; i < n_selected_devices; i++) {
+ struct cl_device * d = &selected_devices[i];
+ if (strstr(d->name, user_device_string) != NULL) {
+ user_device_number = d->number;
+ break;
+ }
+ }
+ if (user_device_number == -1) {
+ fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
+ exit(1);
+ }
+ }
+ if (user_device_number != -1) {
+ selected_devices = &devices[user_device_number];
+ n_selected_devices = 1;
+ default_device = &selected_devices[0];
+ }
+
+ GGML_ASSERT(n_selected_devices > 0);
+
+ if (default_device == NULL) {
+ default_device = &selected_devices[0];
+ }
+ }
+
+ fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
+ fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
+ if (default_device->type != CL_DEVICE_TYPE_GPU) {
+ fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+ }
+
+ platform = default_device->platform->id;
+ device = default_device->id;
+
+ size_t ext_str_size;
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
+ char* ext_buffer = (char*) malloc(sizeof(char) * ext_str_size);
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
+ // Check if ext_buffer contains cl_khr_fp16
+ for (size_t i = 0; i < ext_str_size - 12; i++) {
+ if (memcmp(ext_buffer + i, "cl_khr_fp16", 11) == 0) {
+ fp16_support = true;
+ break;
+ }
+ }
+ free(ext_buffer);
+ fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+
+ cl_context_properties properties[] = {
+ (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
+ };
+
+ CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+
+ CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
+ (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
+ (queue = clCreateCommandQueue(context, device, 0, &err), err)
+ )));
+
+ const std::string kernel_src = generate_kernels();
+
+ program = build_program_from_source(context, device, kernel_src.c_str());
+
+ // FP16 to FP32 kernel
+ CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
+
+ // Dequantize kernels
+ CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
+ CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
+ CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
+ CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
+ CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
+
+ // dequant mul mat kernel
+ CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
+ CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
+ CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
+ CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
+ CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
+ CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
+}
+
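+// map a ggml type to the kernel that dequantizes/converts one row of it to FP32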
+static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return &dequantize_row_q4_0_cl;
+ case GGML_TYPE_Q4_1:
+ return &dequantize_row_q4_1_cl;
+ case GGML_TYPE_Q5_0:
+ return &dequantize_row_q5_0_cl;
+ case GGML_TYPE_Q5_1:
+ return &dequantize_row_q5_1_cl;
+ case GGML_TYPE_Q8_0:
+ return &dequantize_row_q8_0_cl;
+ case GGML_TYPE_F16:
+ return &convert_row_f16_cl;
+ default:
+ return nullptr;
+ }
+}
+
+static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
+ switch (type) {
+ case GGML_TYPE_Q4_0:
+ return &dequantize_mul_mat_vec_q4_0_cl;
+ case GGML_TYPE_Q4_1:
+ return &dequantize_mul_mat_vec_q4_1_cl;
+ case GGML_TYPE_Q5_0:
+ return &dequantize_mul_mat_vec_q5_0_cl;
+ case GGML_TYPE_Q5_1:
+ return &dequantize_mul_mat_vec_q5_1_cl;
+ case GGML_TYPE_Q8_0:
+ return &dequantize_mul_mat_vec_q8_0_cl;
+ case GGML_TYPE_F16:
+ return &convert_mul_mat_vec_f16_cl;
+ default:
+ return nullptr;
+ }
+}
+
+// buffer pool for cl
+#define MAX_CL_BUFFERS 256
+
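+// RAII spin lock guarding the buffer pool below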
+struct scoped_spin_lock {
+ std::atomic_flag& lock;
+ scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+ while (lock.test_and_set(std::memory_order_acquire)) {
+ ; // spin
+ }
+ }
+ ~scoped_spin_lock() {
+ lock.clear(std::memory_order_release);
+ }
+ scoped_spin_lock(const scoped_spin_lock&) = delete;
+ scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cl_buffer {
+ cl_mem mem;
+ size_t size = 0;
+};
+
+static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
+static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
+
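+// return a device buffer of at least `size` bytes, reusing the first free pool slot that is
+// large enough, or allocating a new buffer if none fits; *actual_size receives the capacity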
+static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+ scoped_spin_lock lock(g_cl_pool_lock);
+ cl_int err;
+
+ for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
+ cl_buffer& b = g_cl_buffer_pool[i];
+ if (b.size > 0 && b.size >= size) {
+ cl_mem mem = b.mem;
+ *actual_size = b.size;
+ b.size = 0;
+ return mem;
+ }
+ }
+ cl_mem mem;
+ CL_CHECK((mem = clCreateBuffer(context, flags, size, NULL, &err), err));
+ *actual_size = size;
+ return mem;
+}
+
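+// return a buffer to the first free pool slot; if the pool is full, release it immediately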
+static void ggml_cl_pool_free(cl_mem mem, size_t size) {
+ scoped_spin_lock lock(g_cl_pool_lock);
+
+ for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
+ cl_buffer& b = g_cl_buffer_pool[i];
+ if (b.size == 0) {
+ b.mem = mem;
+ b.size = size;
+ return;
+ }
+ }
+ fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n");
+ clReleaseMemObject(mem);
+}
+
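+// asynchronously copy the 2D slice (i2, i3) of a host tensor into a device buffer:
+// one write for fully contiguous data, a rect write for row-strided data,
+// and a rect write per row otherwise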
+static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
+ cl_int err;
+ const uint64_t ne0 = src->ne[0];
+ const uint64_t ne1 = src->ne[1];
+ const uint64_t nb0 = src->nb[0];
+ const uint64_t nb1 = src->nb[1];
+ const uint64_t nb2 = src->nb[2];
+ const uint64_t nb3 = src->nb[3];
+ const enum ggml_type type = src->type;
+ const size_t ts = ggml_type_size(type);
+ const size_t bs = ggml_blck_size(type);
+
+ const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+ if (nb0 == ts && nb1 == ts*ne0/bs) {
+ err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
+ return err;
+ }
+ if (nb0 == ts) {
+ const size_t buffer_origin[3] = { offset, 0, 0 };
+ const size_t host_origin[3] = { 0, 0, 0 };
+ const size_t region[3] = { ts*ne0/bs, ne1, 1 };
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
+ return err;
+ }
+ for (uint64_t i1 = 0; i1 < ne1; i1++) {
+ // pretend the row is a matrix with cols=1
+ const size_t buffer_origin[3] = { offset, i1, 0 };
+ const size_t host_origin[3] = { 0, 0, 0 };
+ const size_t region[3] = { ts/bs, ne0, 1 };
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+ if (err != CL_SUCCESS) {
+ break;
+ }
+ }
+ return err;
+}
+
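+// mul_mat for FP32 src0: upload both operands slice by slice, run a CLBlast SGEMM per
+// (i02, i03) slice, and read the result back into dst on the host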
+static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+ const int x_ne = ne01 * ne00;
+ const int y_ne = ne11 * ne10;
+ const int d_ne = ne11 * ne01;
+
+ size_t x_size;
+ size_t y_size;
+ size_t d_size;
+ cl_mem d_X;
+ if (src0->backend == GGML_BACKEND_CL) {
+ d_X = *(cl_mem*) src0->data;
+ } else {
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ }
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ // copy data to device
+ if (src0->backend != GGML_BACKEND_CL) {
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ }
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+
+ CL_CHECK(clFinish(queue));
+
+ // compute
+ cl_event ev_sgemm;
+ clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
+ ne01, ne11, ne10,
+ alpha,
+ d_X, 0, ne00,
+ d_Y, 0, ne10,
+ beta,
+ d_D, 0, ne01,
+ &queue, &ev_sgemm);
+
+ if (status != clblast::StatusCode::kSuccess) {
+ GGML_ASSERT(false);
+ }
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+ }
+ }
+
+ if (src0->backend != GGML_BACKEND_CL) {
+ ggml_cl_pool_free(d_X, x_size);
+ }
+ ggml_cl_pool_free(d_Y, y_size);
+ ggml_cl_pool_free(d_D, d_size);
+}
+
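+// mul_mat for FP16 src0: src1 is converted to FP16 into wdata on the CPU (halving the
+// upload size), the GEMM runs in FP16 on the device, and the result is converted back to FP32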
+static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+ GGML_ASSERT(fp16_support);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+
+ const int nb10 = src1->nb[0];
+ const int nb11 = src1->nb[1];
+ const int nb12 = src1->nb[2];
+ const int nb13 = src1->nb[3];
+
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+
+ const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
+ const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
+ const int x_ne = ne01 * ne00;
+ const int y_ne = ne11 * ne10;
+ const int d_ne = ne11 * ne01;
+
+ size_t x_size;
+ size_t y_size;
+ size_t d_size;
+ cl_mem d_X;
+ if (src0->backend == GGML_BACKEND_CL) {
+ d_X = *(cl_mem*) src0->data;
+ } else {
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
+ }
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+
+ bool src1_cont_rows = nb10 == sizeof(float);
+ bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ // copy src0 to device
+ if (src0->backend != GGML_BACKEND_CL) {
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+ }
+
+ // convert src1 to fp16
+ // TODO: use multiple threads
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+ char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+ if (src1_cont_rows) {
+ if (src1_cont_cols) {
+ ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+ }
+ else {
+ for (int64_t i01 = 0; i01 < ne11; i01++) {
+ ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+ }
+ }
+ }
+ else {
+ for (int64_t i01 = 0; i01 < ne11; i01++) {
+ for (int64_t i00 = 0; i00 < ne10; i00++) {
+ // very slow due to no inlining
+ tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+ }
+ }
+ }
+
+ // copy src1 to device
+ CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+ CL_CHECK(clFinish(queue));
+
+ // compute
+ cl_event ev_sgemm;
+ clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
+ ne01, ne11, ne10,
+ alpha,
+ d_X, 0, ne00,
+ d_Y, 0, ne10,
+ beta,
+ d_D, 0, ne01,
+ &queue, &ev_sgemm);
+
+ if (status != clblast::StatusCode::kSuccess) {
+ GGML_ASSERT(false);
+ }
+
+ // copy dst to host, then convert to float
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+
+ ggml_fp16_to_fp32_row(tmp, d, d_ne);
+ }
+ }
+
+ if (src0->backend != GGML_BACKEND_CL) {
+ ggml_cl_pool_free(d_X, x_size);
+ }
+ ggml_cl_pool_free(d_Y, y_size);
+ ggml_cl_pool_free(d_D, d_size);
+}
+
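+// mul_mat for quantized src0: for matrix-vector products (ne11 == 1) a fused
+// dequantize_mul_mat_vec kernel is used; otherwise src0 is dequantized to FP32 on the
+// device and multiplied with CLBlast SGEMM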
+static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
+
+ const int nb2 = dst->nb[2];
+ const int nb3 = dst->nb[3];
+ const ggml_type type = src0->type;
+ const bool mul_mat_vec = ne11 == 1;
+
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+ const int x_ne = ne01 * ne00;
+ const int y_ne = ne11 * ne10;
+ const int d_ne = ne11 * ne01;
+ const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+
+ size_t x_size;
+ size_t y_size;
+ size_t d_size;
+ size_t q_size;
+ cl_mem d_X;
+ if (!mul_mat_vec) {
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
+ }
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
+ cl_mem d_Q;
+ if (src0->backend == GGML_BACKEND_CPU) {
+ d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+ }
+
+ cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
+ cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
+ GGML_ASSERT(to_fp32_cl != nullptr);
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ cl_event ev_sgemm;
+
+ // copy src0 to device if necessary
+ if (src0->backend == GGML_BACKEND_CPU) {
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL));
+ } else if (src0->backend == GGML_BACKEND_CL) {
+ d_Q = *(cl_mem*) src0->data;
+ } else {
+ GGML_ASSERT(false);
+ }
+ if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+ // copy src1 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+
+ // compute
+ const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
+ const size_t local = CL_DMMV_BLOCK_SIZE;
+ const cl_int ncols = ne00;
+ CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+ CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+ CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+ CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+ CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+ CL_CHECK(clFinish(queue));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, 0, NULL, &ev_sgemm));
+ } else { // general dequantization kernel + CLBlast matrix matrix multiplication
+ // convert src0 to fp32 on device
+ const size_t global = x_ne;
+ CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+ CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+ CL_CHECK(clFinish(queue));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL));
+
+ // copy src1 to device
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+
+ // wait for conversion
+ CL_CHECK(clFinish(queue));
+
+ // compute
+ clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
+ ne01, ne11, ne10,
+ alpha,
+ d_X, 0, ne00,
+ d_Y, 0, ne10,
+ beta,
+ d_D, 0, ne01,
+ &queue, &ev_sgemm);
+
+ if (status != clblast::StatusCode::kSuccess) {
+ GGML_ASSERT(false);
+ }
+ }
+
+ // copy dst to host
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+ clReleaseEvent(ev_sgemm);
+ }
+ }
+
+ if (!mul_mat_vec) {
+ ggml_cl_pool_free(d_X, x_size);
+ }
+ ggml_cl_pool_free(d_Y, y_size);
+ ggml_cl_pool_free(d_D, d_size);
+ if (src0->backend == GGML_BACKEND_CPU) {
+ ggml_cl_pool_free(d_Q, q_size);
+ }
+}
+
+
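+// heuristic: offload only sufficiently large F32/F16/quantized x F32 multiplications,
+// or any multiplication whose src0 already lives in device memory (GGML_BACKEND_CL)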
+bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ const int64_t ne10 = src1->ne[0];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+
+ // TODO: find the optimal values for these
+ if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+ src1->type == GGML_TYPE_F32 &&
+ dst->type == GGML_TYPE_F32 &&
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+ return true;
+ }
+
+ return false;
+}
+
+bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+ // If device doesn't support FP16
+ if (!fp16_support) {
+ return false;
+ }
+
+ size_t src0_sz = ggml_nbytes(src0);
+ size_t src1_sz = ggml_nbytes(src1);
+
+ // mul_mat_q: src0 is converted to fp32 on device
+ size_t mul_mat_q_transfer = src0_sz + src1_sz;
+
+ // mul_mat_f16: src1 is converted to fp16 on cpu
+ size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
+
+ // choose the smaller one to transfer to the device
+ // TODO: this is not always the best choice due to the overhead of converting to fp16
+ return mul_mat_f16_transfer < mul_mat_q_transfer;
+}
+
+void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
+ GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
+
+ if (src0->type == GGML_TYPE_F32) {
+ ggml_cl_mul_mat_f32(src0, src1, dst);
+ }
+ else if (src0->type == GGML_TYPE_F16) {
+ if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+ ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
+ }
+ else {
+ ggml_cl_mul_mat_q_f32(src0, src1, dst);
+ }
+ }
+ else if (ggml_is_quantized(src0->type)) {
+ ggml_cl_mul_mat_q_f32(src0, src1, dst);
+ }
+ else {
+ GGML_ASSERT(false);
+ }
+}
+
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
+ return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+ }
+ return 0;
+}
+
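+// upload a tensor's data to a device buffer and mark it as GGML_BACKEND_CL so that
+// later mul_mat calls can skip the host-to-device copy of src0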
+void ggml_cl_transform_tensor(ggml_tensor * tensor) {
+ const int64_t ne0 = tensor->ne[0];
+ const int64_t ne1 = tensor->ne[1];
+ const int64_t ne2 = tensor->ne[2];
+ const int64_t ne3 = tensor->ne[3];
+
+ const ggml_type type = tensor->type;
+ const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+
+ size_t q_size;
+ cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
+ *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
+
+ // copy tensor to device
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
+ int i = i3*ne2 + i2;
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL));
+ }
+ }
+
+ CL_CHECK(clFinish(queue));
+
+ tensor->data = dst;
+ tensor->backend = GGML_BACKEND_CL;
+}
diff --git a/ggml-opencl.h b/ggml-opencl.h
index 7bcc603ef..5a1a50093 100644
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@@ -1,23 +1,21 @@
#pragma once
+#include "ggml.h"
+
#ifdef __cplusplus
extern "C" {
#endif
void ggml_cl_init(void);
-enum ggml_blas_order {
- GGML_BLAS_ORDER_ROW_MAJOR = 101,
- GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
-};
+bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-enum ggml_blas_op {
- GGML_BLAS_OP_N = 111,
- GGML_BLAS_OP_T = 112,
- GGML_BLAS_OP_C = 113,
-};
+void * ggml_cl_host_malloc(size_t size);
+void ggml_cl_host_free(void * ptr);
-void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
#ifdef __cplusplus
}
diff --git a/ggml.c b/ggml.c
index d36bb2281..07ea92a58 100644
--- a/ggml.c
+++ b/ggml.c
@@ -186,10 +186,12 @@ typedef double ggml_float;
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
+#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
+#endif
#ifdef __F16C__
@@ -3494,7 +3496,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
+static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",
"DUP",
@@ -3749,6 +3751,9 @@ const char * ggml_type_name(enum ggml_type type) {
return GGML_TYPE_NAME[type];
}
+const char * ggml_op_name(enum ggml_op op) {
+ return GGML_OP_NAME[op];
+}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type];
@@ -3805,6 +3810,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
return wtype;
}
+size_t ggml_tensor_overhead(void) {
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+}
+
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
@@ -4017,6 +4026,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}
+void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
+ ctx->no_alloc = no_alloc;
+}
+
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
@@ -4061,7 +4074,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
if (ctx->scratch.data == NULL || data != NULL) {
- size_needed += sizeof(struct ggml_tensor);
+ size_needed += GGML_TENSOR_SIZE;
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4090,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}
- if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
+ if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
+ __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
@@ -4093,7 +4107,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
- .size = sizeof(struct ggml_tensor),
+ .size = GGML_TENSOR_SIZE,
.next = NULL,
};
@@ -9431,7 +9445,7 @@ static void ggml_compute_forward_rms_norm_back(
// ggml_compute_forward_mul_mat
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9472,7 +9486,7 @@ static void ggml_compute_forward_mul_mat_f32(
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
const int64_t ne10 = src1->ne[0];
#endif
const int64_t ne11 = src1->ne[1];
@@ -9536,9 +9550,16 @@ static void ggml_compute_forward_mul_mat_f32(
}
return;
}
+#elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -9558,21 +9579,11 @@ static void ggml_compute_forward_mul_mat_f32(
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-#if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
-#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
-#endif
}
}
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9711,9 +9722,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
}
return;
}
+#elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));
@@ -9743,20 +9761,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
assert(id*sizeof(float) <= params->wsize);
}
-#if defined(GGML_USE_CLBLAST)
- const float * x = wdata;
- const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- GGML_TYPE_F32);
-#else
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@@ -9768,7 +9772,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
-#endif
}
}
@@ -9931,9 +9934,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
}
return;
}
+#elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+ ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+ }
+ return;
+ }
#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -9956,9 +9966,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-#if defined(GGML_USE_CLBLAST)
- const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
-#else
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9970,23 +9977,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
}
const float * x = wdata;
-#endif
-#if defined(GGML_USE_CLBLAST)
- // zT = y * xT
- ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
- ne11, ne01, ne10,
- 1.0f, y, ne10,
- x, ne10,
- 0.0f, d, ne01,
- type);
-#else
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne11, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
-#endif
}
}
@@ -13810,11 +13806,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
// reached a leaf node, not part of the gradient graph (e.g. a constant)
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
+ if (strlen(node->name) == 0) {
+ snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
+ }
+
cgraph->leafs[cgraph->n_leafs] = node;
cgraph->n_leafs++;
} else {
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
+ if (strlen(node->name) == 0) {
+ snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
+ }
+
cgraph->nodes[cgraph->n_nodes] = node;
cgraph->grads[cgraph->n_nodes] = node->grad;
cgraph->n_nodes++;
@@ -14165,9 +14169,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
}
else
+#elif defined(GGML_USE_CLBLAST)
+ if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+ node->n_tasks = 1; // TODO: this actually is doing nothing
+ // the threads are still spinning
+ cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+ }
+ else
#endif
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
@@ -14181,13 +14192,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
#endif
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
}
#endif
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -14521,6 +14532,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
}
}
+struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+ for (int i = 0; i < cgraph->n_leafs; i++) {
+ struct ggml_tensor * leaf = cgraph->leafs[i];
+
+ if (strcmp(leaf->name, name) == 0) {
+ return leaf;
+ }
+ }
+
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ struct ggml_tensor * node = cgraph->nodes[i];
+
+ if (strcmp(node->name, name) == 0) {
+ return node;
+ }
+ }
+
+ return NULL;
+}
+
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
@@ -14538,7 +14569,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
- GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
@@ -14552,7 +14583,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
i,
node->ne[0], node->ne[1],
- GGML_OP_LABEL[node->op]);
+ GGML_OP_NAME[node->op]);
}
for (int i = 0; i < GGML_OP_COUNT; i++) {
@@ -14560,7 +14591,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
continue;
}
- GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
+ GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
}
GGML_PRINT("========================================\n");
diff --git a/ggml.h b/ggml.h
index 51a616c50..558138280 100644
--- a/ggml.h
+++ b/ggml.h
@@ -198,6 +198,7 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
+#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4
#define GGML_ASSERT(x) \
@@ -249,6 +250,7 @@ extern "C" {
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_CUDA = 1,
+ GGML_BACKEND_CL = 2,
};
// model file types
@@ -371,11 +373,13 @@ extern "C" {
void * data;
- char name[32];
+ char name[GGML_MAX_NAME];
char padding[16];
};
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
// computation graph
struct ggml_cgraph {
int n_nodes;
@@ -428,6 +432,7 @@ extern "C" {
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
GGML_API const char * ggml_type_name(enum ggml_type type);
+ GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@@ -436,6 +441,9 @@ extern "C" {
// TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+ // use this to compute the memory overhead of a tensor
+ GGML_API size_t ggml_tensor_overhead(void);
+
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -444,6 +452,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@@ -969,6 +978,8 @@ extern "C" {
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
+ GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);
+
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
diff --git a/llama.cpp b/llama.cpp
index 06f28b9f2..e1b5317d9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12,6 +12,8 @@
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
#endif
#include
@@ -1092,7 +1094,7 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
}
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#else
+#elif !defined(GGML_USE_CLBLAST)
(void) n_gpu_layers;
#endif
}
@@ -1125,7 +1127,33 @@ static void llama_model_load_internal(
done_size += lt.size;
}
}
-#endif // GGML_USE_CUBLAS
+#elif defined(GGML_USE_CLBLAST)
+ {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
+
+ size_t vram_total = 0;
+
+ for (int i = 0; i < n_gpu; ++i) {
+ const auto & layer = model.layers[i];
+
+ ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+ ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+ ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+ ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+ ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+ ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+ ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+ }
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
+ ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+ }
+
+ fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
+ }
+#endif
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);