commit a5317498c2

    Merge branch 'master' into text-from-scratch

    # Conflicts:
    #	ggml.c : number of operations and GGML_ASSERT vs assert

37 changed files with 38038 additions and 1285 deletions
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
+    apt-get install -y build-essential python3 python3-pip git
 
 COPY requirements.txt requirements.txt
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build
 
 RUN apt-get update && \
-    apt-get install -y build-essential
+    apt-get install -y build-essential git
 
 WORKDIR /app
.github/workflows/build.yml (vendored): 28 changes

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp']
 
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

@@ -151,21 +151,21 @@ jobs:
     env:
       OPENBLAS_VERSION: 0.3.23
       OPENCL_VERSION: 2023.04.17
-      CLBLAST_VERSION: 1.5.3
+      CLBLAST_VERSION: 1.6.0
 
     strategy:
       matrix:
         include:
           - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
           - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
           - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
           - build: 'openblas'
-            defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
 
     steps:
       - name: Clone

@@ -184,13 +184,13 @@ jobs:
         id: get_clblast
         if: ${{ matrix.build == 'clblast' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
           curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/clblast
-          tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
            $txt = Get-Content -Path $f -Raw
-           $txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+           $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
          }
 
       - name: Download OpenBLAS

@@ -213,7 +213,6 @@ jobs:
           cd build
           cmake .. ${{ matrix.defines }}
           cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt
 
       - name: Add clblast.dll
         id: add_clblast_dll

@@ -258,6 +257,7 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
           7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
 
       - name: Upload artifacts

@@ -292,7 +292,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
           cmake --build . --config Release
 
       - name: Get commit hash
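As a hedged illustration (not taken verbatim from the workflow), the new `-DLLAMA_BUILD_SERVER=ON` defines in the CI matrix above correspond locally to something like the `avx512` entry:

```sh
# local sketch of the CI 'avx512' build; flags taken from the matrix above
mkdir build && cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release
```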
BLIS.md (new file): 67 additions

@@ -0,0 +1,67 @@
BLIS Installation Manual
------------------------

BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as an object-based API, a typed API, and BLAS and CBLAS compatibility layers.

Project URL: https://github.com/flame/blis

### Prepare:

Compile BLIS:

```bash
git clone https://github.com/flame/blis
cd blis
./configure --enable-cblas -t openmp,pthreads auto
# will install to /usr/local/ by default.
make -j
```

Install BLIS:

```bash
sudo make install
```

We recommend using OpenMP, since it makes it easier to control which cores are used.

### llama.cpp compilation

Makefile:

```bash
make LLAMA_BLIS=1 -j
# make LLAMA_BLIS=1 benchmark-matmult
```

CMake:

```bash
mkdir build
cd build
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
make -j
```

### llama.cpp execution

According to the BLIS documentation, we can set the following environment variables to control the OpenMP behavior:

```
export GOMP_GPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```

And then run the binaries as normal.

### Intel specific issue

Some users may get an error saying that `libimf.so` cannot be found. If so, please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).

### Reference:

1. https://github.com/flame/blis#getting-started
2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
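As a hedged usage sketch (the model path below is hypothetical), a BLIS-enabled build is then run like any other build, with the thread count matching `BLIS_NUM_THREADS`:

```sh
# sketch: run a BLIS-enabled build with 14 threads; adjust the model path
export BLIS_NUM_THREADS=14
./main -m ./models/7B/ggml-model-q4_0.bin -t 14 -p "Building a website can be done in 10 simple steps:"
```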
@@ -37,40 +37,44 @@ endif()
 #
 
 # general
 option(LLAMA_STATIC                  "llama: static link libraries"               OFF)
 option(LLAMA_NATIVE                  "llama: enable -march=native flag"           OFF)
 option(LLAMA_LTO                     "llama: enable link time optimization"       OFF)
 
 # debug
 option(LLAMA_ALL_WARNINGS            "llama: enable all compiler warnings"        ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY  "llama: enable all compiler warnings in 3rd party libs" OFF)
 option(LLAMA_GPROF                   "llama: enable gprof"                        OFF)
 
 # sanitizers
 option(LLAMA_SANITIZE_THREAD         "llama: enable thread sanitizer"             OFF)
 option(LLAMA_SANITIZE_ADDRESS        "llama: enable address sanitizer"            OFF)
 option(LLAMA_SANITIZE_UNDEFINED      "llama: enable undefined sanitizer"          OFF)
 
 # instruction set specific
 option(LLAMA_AVX                     "llama: enable AVX"                          ON)
 option(LLAMA_AVX2                    "llama: enable AVX2"                         ON)
 option(LLAMA_AVX512                  "llama: enable AVX512"                       OFF)
 option(LLAMA_AVX512_VBMI             "llama: enable AVX512-VBMI"                  OFF)
 option(LLAMA_AVX512_VNNI             "llama: enable AVX512-VNNI"                  OFF)
 option(LLAMA_FMA                     "llama: enable FMA"                          ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
     option(LLAMA_F16C                "llama: enable F16C"                         ON)
 endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE              "llama: enable Accelerate framework"         ON)
-option(LLAMA_OPENBLAS                "llama: use OpenBLAS"                        OFF)
+option(LLAMA_BLAS                    "llama: use BLAS"                            OFF)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS                  "llama: use cuBLAS"                          OFF)
+set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y  "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 option(LLAMA_CLBLAST                 "llama: use CLBlast"                         OFF)
 
 option(LLAMA_BUILD_TESTS             "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES          "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER            "llama: build server example"                OFF)
 
 #
 # Build info header
@@ -145,36 +149,28 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()
 
-if (LLAMA_OPENBLAS)
+if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-
-    set(BLA_VENDOR OpenBLAS)
+    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+        set(BLA_SIZEOF_INTEGER 8)
+    endif()
+    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
     find_package(BLAS)
     if (BLAS_FOUND)
-        message(STATUS "OpenBLAS found")
+        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
 
+        add_compile_options(${BLAS_LINKER_FLAGS})
         add_compile_definitions(GGML_USE_OPENBLAS)
-        add_link_options(${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
 
-        # find header file
-        set(OPENBLAS_INCLUDE_SEARCH_PATHS
-            /usr/include
-            /usr/include/openblas
-            /usr/include/openblas-base
-            /usr/local/include
-            /usr/local/include/openblas
-            /usr/local/include/openblas-base
-            /opt/OpenBLAS/include
-            $ENV{OpenBLAS_HOME}
-            $ENV{OpenBLAS_HOME}/include
-            )
-        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-        add_compile_options(-I${OPENBLAS_INC})
+        message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
+        include_directories(${BLAS_INCLUDE_DIRS})
     else()
-        message(WARNING "OpenBLAS not found")
+        message(WARNING "BLAS not found, please refer to "
+                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+                        " to set correct LLAMA_BLAS_VENDOR")
     endif()
 endif()
@@ -190,6 +186,8 @@ if (LLAMA_CUBLAS)
         set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
 
         add_compile_definitions(GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
 
         if (LLAMA_STATIC)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
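As a hedged illustration of the two new cache variables above (the values are only examples), a cuBLAS build can override the dmmv kernel parameters at configure time:

```sh
# sketch: configure a cuBLAS build with custom dmmv kernel parameters
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_DMMV_X=64 -DLLAMA_CUDA_DMMV_Y=2
```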
@@ -207,7 +205,7 @@ if (LLAMA_CLBLAST)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")
 
-        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
 
         add_compile_definitions(GGML_USE_CLBLAST)
Makefile: 61 changes

@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot
+
+ifdef LLAMA_BUILD_SERVER
+	BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)
 
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)

@@ -38,7 +44,11 @@ CFLAGS = -I. -O3 -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =
 
-ifndef LLAMA_DEBUG
+ifdef LLAMA_DEBUG
+	CFLAGS   += -O0 -g
+	CXXFLAGS += -O0 -g
+	LDFLAGS  += -g
+else
 	CFLAGS   += -DNDEBUG
 	CXXFLAGS += -DNDEBUG
 endif

@@ -74,6 +84,15 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif
 
+ifdef LLAMA_GPROF
+	CFLAGS   += -pg
+	CXXFLAGS += -pg
+endif
+ifdef LLAMA_PERF
+	CFLAGS   += -DGGML_PERF
+	CXXFLAGS += -DGGML_PERF
+endif
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue

@@ -106,13 +125,17 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif
 ifdef LLAMA_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
+	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
+		LDFLAGS += -lopenblas -lcblas
+	else
 		LDFLAGS += -lopenblas
+	endif
 endif
+ifdef LLAMA_BLIS
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+	LDFLAGS += -lblis -L/usr/local/lib
+endif
 ifdef LLAMA_CUBLAS
 	CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include

@@ -120,11 +143,22 @@ ifdef LLAMA_CUBLAS
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ifdef LLAMA_CUDA_DMMV_X
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+else
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # LLAMA_CUDA_DMMV_X
+ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+else
+	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+endif # LLAMA_CUDA_DMMV_Y
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif
+endif # LLAMA_CUBLAS
 ifdef LLAMA_CLBLAST
 	CFLAGS   += -DGGML_USE_CLBLAST
+	CXXFLAGS += -DGGML_USE_CLBLAST
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
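The same knobs are exposed through make; a hedged sketch with illustrative values:

```sh
# sketch: cuBLAS build with overridden dmmv kernel parameters
make LLAMA_CUBLAS=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_DMMV_Y=2
```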
@@ -132,16 +166,8 @@ ifdef LLAMA_CLBLAST
 		LDFLAGS += -lclblast -lOpenCL
 	endif
 	OBJS    += ggml-opencl.o
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif
-ifdef LLAMA_GPROF
-	CFLAGS   += -pg
-	CXXFLAGS += -pg
-endif
-ifdef LLAMA_PERF
-	CFLAGS   += -DGGML_PERF
-	CXXFLAGS += -DGGML_PERF
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.

@@ -194,7 +220,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
 
 #
 # Examples

@@ -221,6 +247,9 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \

@@ -240,6 +269,6 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-.PHONY: tests
+.PHONY: tests clean
 tests:
 	bash ./tests/run-tests.sh
README.md: 141 changes

@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- Quantization formats `Q4` and `Q8` have changed again (19 May) - [(info)](https://github.com/ggerganov/llama.cpp/pull/1508)
 - Quantization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)

@@ -55,7 +56,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - Mixed F16 / F32 precision
 - 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
-- OpenBLAS support
+- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
 - cuBLAS and CLBlast support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).

@@ -80,6 +81,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
+- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 
 **Bindings:**

@@ -238,11 +240,11 @@ In order to build llama.cpp you have three different options.
 
 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
 
-- Accelerate Framework:
+- **Accelerate Framework**:
 
   This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
 
-- OpenBLAS:
+- **OpenBLAS**:
 
   This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

@@ -272,11 +274,26 @@ Building the program with BLAS support may lead to some performance improvements
     ```bash
     mkdir build
     cd build
-    cmake .. -DLLAMA_OPENBLAS=ON
+    cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
     cmake --build . --config Release
     ```
 
-- cuBLAS
+- **BLIS**
+
+  Check [BLIS.md](BLIS.md) for more information.
+
+- **Intel MKL**
+
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. You may also specify it explicitly:
+
+  ```bash
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+  ```
+
+- **cuBLAS**
 
   This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
   - Using `make`:
@@ -291,8 +308,81 @@ Building the program with BLAS support may lead to some performance improvements
     cmake .. -DLLAMA_CUBLAS=ON
     cmake --build . --config Release
     ```
 
   Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
 
+- **CLBlast**
+
+  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+    - <details>
+        <summary>Installing the OpenCL SDK from source</summary>
+
+        ```sh
+        git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+        mkdir OpenCL-SDK/build
+        cd OpenCL-SDK/build
+        cmake .. -DBUILD_DOCS=OFF \
+          -DBUILD_EXAMPLES=OFF \
+          -DBUILD_TESTING=OFF \
+          -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+          -DOPENCL_SDK_TEST_SAMPLES=OFF
+        cmake --build . --config Release
+        cmake --install . --prefix /some/path
+        ```
+      </details>
+
+  Installing CLBlast: it may be found in your operating system's packages.
+
+    - <details>
+        <summary>If not, then installing from source:</summary>
+
+        ```sh
+        git clone https://github.com/CNugteren/CLBlast.git
+        mkdir CLBlast/build
+        cd CLBlast/build
+        cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+        cmake --build . --config Release
+        cmake --install . --prefix /some/path
+        ```
+
+        Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+      </details>
+
+  Building:
+
+  - Build with make:
+    ```sh
+    make LLAMA_CLBLAST=1
+    ```
+  - CMake:
+    ```sh
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake --build . --config Release
+    ```
+
+  Running:
+
+  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+  The selection can be a number (starting from 0) or a text string to search:
+
+  ```sh
+  GGML_OPENCL_PLATFORM=1 ./main ...
+  GGML_OPENCL_DEVICE=2 ./main ...
+  GGML_OPENCL_PLATFORM=Intel ./main ...
+  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+  ```
+
+  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+  You can get a list of platforms and devices from the `clinfo -l` command, etc.
 
 ### Prepare Data & Run

@@ -333,16 +423,16 @@ Several quantization methods are supported. They differ in the resulting model d
 
 | Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-| 7B    | perplexity   | 5.9066 | 6.1565 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
-| 7B    | file size    | 13.0G  | 4.0G   | 4.8G   | 4.4G   | 4.8G   | 7.1G   |
-| 7B    | ms/tok @ 4th | 128    | 50     | 54     | 75     | 83     | 75     |
-| 7B    | ms/tok @ 8th | 123    | 44     | 52     | 53     | 58     | 72     |
-| 7B    | bits/weight  | 16.0   | 5.0    | 6.0    | 5.5    | 6.0    | 9.0    |
-| 13B   | perplexity   | 5.2543 | 5.3860 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
-| 13B   | file size    | 25.0G  | 7.6G   | 9.1G   | 8.4G   | 9.1G   | 14G    |
-| 13B   | ms/tok @ 4th | 239    | 93     | 101    | 150    | 164    | 141    |
-| 13B   | ms/tok @ 8th | 240    | 81     | 96     | 96     | 104    | 136    |
-| 13B   | bits/weight  | 16.0   | 5.0    | 6.0    | 5.5    | 6.0    | 9.0    |
+| 7B    | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
+| 7B    | file size    | 13.0G  | 3.5G   | 3.9G   | 4.3G   | 4.7G   | 6.7G   |
+| 7B    | ms/tok @ 4th | 127    | 55     | 54     | 76     | 83     | 72     |
+| 7B    | ms/tok @ 8th | 122    | 43     | 45     | 52     | 56     | 67     |
+| 7B    | bits/weight  | 16.0   | 4.5    | 5.0    | 5.5    | 6.0    | 8.5    |
+| 13B   | perplexity   | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
+| 13B   | file size    | 25.0G  | 6.8G   | 7.6G   | 8.3G   | 9.1G   | 13G    |
+| 13B   | ms/tok @ 4th | -      | 103    | 105    | 148    | 160    | 131    |
+| 13B   | ms/tok @ 8th | -      | 73     | 82     | 98     | 105    | 128    |
+| 13B   | bits/weight  | 16.0   | 4.5    | 5.0    | 5.5    | 6.0    | 8.5    |
 
 ### Perplexity (measuring model quality)

@@ -374,6 +464,25 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 
 
+### Persistent Interaction
+
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+
+```bash
+# Start a new chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Resume that chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Start a different chat with the same prompt/model
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
+
+# Different prompt cache for different prompt/model
+PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
+    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
+```
+
 ### Instruction mode with Alpaca
 
 1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -121,7 +121,6 @@ def make_tensors_list() -> List[str]:
             f'layers.{i}.feed_forward.w1.weight',
             f'layers.{i}.feed_forward.w2.weight',
             f'layers.{i}.feed_forward.w3.weight',
-            f'layers.{i}.atttention_norm.weight',
             f'layers.{i}.ffn_norm.weight',
         ]
     return ret

@@ -1055,7 +1054,7 @@ def load_some_model(path: Path) -> ModelPlus:
         files = list(path.glob("model-00001-of-*.safetensors"))
         if not files:
             # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
             files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try GGML too, but with lower priority, since if both a non-GGML
@@ -37,4 +37,7 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
+    if(LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
 endif()
@@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
+#include "build-info.h"
 
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>

@@ -15,7 +16,7 @@
 #include <iterator>
 #include <algorithm>
 
-float tensor_sum_elements(struct ggml_tensor * tensor) {
+float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {

@@ -27,21 +28,15 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
     return sum;
 }
 
+void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+        tensor->type, ggml_type_name(tensor->type),
+        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor);
+    printf("Sum of tensor %s is %6.2f\n", name, sum);
+}
+
 /*
     These are mapping to unknown
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
     GGML_TYPE_COUNT,
 */
 
 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
 
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
-    TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
-    (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
-    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 
 struct benchmark_params_struct {
     int32_t n_threads = 1;

@@ -59,8 +54,6 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
 }
 
 int main(int argc, char ** argv) {
-
-
     struct benchmark_params_struct benchmark_params;
 
     bool invalid_param = false;

@@ -84,11 +77,11 @@ int main(int argc, char ** argv) {
             print_usage(argc, argv, benchmark_params);
             exit(0);
         }
-        if (invalid_param) {
-            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            print_usage(argc, argv, benchmark_params);
-            exit(1);
-        }
     }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
+    }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

@@ -216,10 +209,10 @@ int main(int argc, char ** argv) {
     // Let's use the F32 result from above as a reference for the q4_0 multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
-    printf("==============================================================================================\n");
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");
 
+    double gflops_sum = 0;
     for (int i=0;i<benchmark_params.n_iterations ;i++) {
 
         long long int start = ggml_time_us();

@@ -227,12 +220,13 @@ int main(int argc, char ** argv) {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
-        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        gflops_sum += gflops;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
             gf31.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
-            usec,flops_per_usec);
+            usec,gflops);
 
 #ifdef VERBOSE_DEBUGGING
         TENSOR_DUMP("res",gf31.nodes[0])

@@ -256,7 +250,8 @@ int main(int argc, char ** argv) {
 
         // Running a different graph computation to make sure we override the CPU cache lines
         ggml_graph_compute(ctx, &gf32);
-
     }
     printf("\n");
+    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
+    printf("=====================================================================================\n");
 }
examples/chat-persistent.sh (new executable file): 151 additions

@@ -0,0 +1,151 @@
#!/bin/bash

set -euo pipefail

cd "$(dirname "$0")/.." || exit

if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
    echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
    exit 1
fi

MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
DATE_TIME="$(date +%H:%M)"
DATE_YEAR="$(date +%Y)"

LOG="${CHAT_SAVE_DIR}/main.log"
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"

SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")

# An unbuffered `tail -c+N`
skip_bytes() {
    LANG=C IFS= read -r -n "$1" -d '' c
    while LANG=C IFS= read -r -n 1 -d '' c; do
        printf '%s' "$c"
    done
}

mkdir -p "$CHAT_SAVE_DIR"
echo >"$LOG"
trap "tail -n100 ${LOG}" EXIT

if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
    sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
        -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
        -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
        -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
        "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
fi

if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
    sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
fi

if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
    echo '...' >>"$NEXT_PROMPT_FILE"
fi

if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
    echo 'Prompt cache does not exist, building...'
    # Default batch_size to 8 here for better user feedback during initial prompt processing
    ./main 2>>"$LOG" \
        --batch_size 8 \
        "${OPTS[@]}" \
        --prompt-cache "$PROMPT_CACHE_FILE" \
        --file "$CUR_PROMPT_FILE" \
        --n_predict 1
    echo
    echo 'Done!'
fi

if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
    cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
fi
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
    cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
fi

printf '%s ' "$(< "$CUR_PROMPT_FILE")"
n_tokens=0

while read -e line; do
    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))

    # Swap prompts when we're about to run out of context
    if ((n_predict <= 0)); then
        wait # for background main (below) to finish with next prompt
        mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
        mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"

        sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
        echo '...' >>"$NEXT_PROMPT_FILE"
        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"

        n_tokens=0
        n_predict=$((CTX_SIZE / 2))
    fi

    echo " ${line}" >>"$CUR_PROMPT_FILE"
    if ((n_tokens > CTX_ROTATE_POINT)); then
        echo " ${line}" >>"$NEXT_PROMPT_FILE"
    fi

    n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))

    printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"

    ./main 2>>"$LOG" "${OPTS[@]}" \
        --prompt-cache "$CUR_PROMPT_CACHE" \
        --prompt-cache-all \
        --file "$CUR_PROMPT_FILE" \
        --reverse-prompt "${USER_NAME}:" \
        --n_predict "$n_predict" |
        skip_bytes 1 |                  # skip BOS token added by ./main
        tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
        skip_bytes "$n_prompt_len_pre"  # print generation

    mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"

    # if we hit n_predict instead of reverse-prompt, we need to add the prompt
    if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
        printf '\n%s:' "$USER_NAME"
        printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
    fi

    printf ' '

    # HACK get num tokens from debug message
    # TODO get both messages in one go
    if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
       ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
        echo >&2 "Couldn't get number of tokens from ./main output!"
        exit 1
    fi

    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))

    if ((n_tokens > CTX_ROTATE_POINT)); then
        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
    fi

    # Update cache for next prompt in background, ideally during user input
    ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
        --prompt-cache "$NEXT_PROMPT_CACHE" \
        --file "$NEXT_PROMPT_FILE" \
        --n_predict 1 &
done
@@ -8,6 +8,7 @@
 #include <iterator>
 #include <algorithm>
 #include <sstream>
+#include <unordered_set>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>

@@ -28,21 +29,21 @@
 
 int32_t get_num_physical_cores() {
 #ifdef __linux__
-    std::ifstream cpuinfo("/proc/cpuinfo");
-    std::string line;
-    while (std::getline(cpuinfo, line)) {
-        std::size_t pos = line.find("cpu cores");
-        if (pos != std::string::npos) {
-            pos = line.find(": ", pos);
-            if (pos != std::string::npos) {
-                try {
-                    // Extract the number and return it
-                    return static_cast<int32_t>(std::stoul(line.substr(pos + 2)));
-                } catch (const std::invalid_argument &) {
-                    // Ignore if we could not parse
-                }
-            }
+    // enumerate the set of thread siblings, num entries is num cores
+    std::unordered_set<std::string> siblings;
+    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+            + std::to_string(cpu) + "/topology/thread_siblings");
+        if (!thread_siblings.is_open()) {
+            break; // no more cpus
+        }
+        std::string line;
+        if (std::getline(thread_siblings, line)) {
+            siblings.insert(line);
+        }
+    }
+    if (siblings.size() > 0) {
+        return static_cast<int32_t>(siblings.size());
     }
 #elif defined(__APPLE__) && defined(__MACH__)
     int32_t num_physical_cores;
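As a hedged aside, on Linux the new code computes essentially the same count as this shell pipeline (one unique `thread_siblings` entry per physical core):

```sh
# rough equivalent of the new physical-core count
cat /sys/devices/system/cpu/cpu*/topology/thread_siblings | sort -u | wc -l
```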
@@ -250,6 +251,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_alias = argv[i];
         } else if (arg == "--lora") {
             if (++i >= argc) {
                 invalid_param = true;

@@ -282,7 +289,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
             params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {

@@ -320,12 +332,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-        } else if (arg == "--n-parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, default_params);
             exit(0);

@@ -356,7 +362,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     }
     if (params.prompt_cache_all &&
             (params.interactive || params.interactive_first ||
-             params.instruct || params.antiprompt.size())) {
+             params.instruct)) {
         fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
         gpt_print_usage(argc, argv, default_params);
         exit(1);

@@ -378,8 +384,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
-    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
+    fprintf(stderr, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stderr, "                        (can be specified more than once for multiple prompts).\n");
     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);

@@ -415,9 +421,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --n-parts N           number of model parts (default: -1 = determine from dimensions)\n");
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);

@@ -427,8 +433,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");

@@ -472,7 +480,6 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
-    lparams.n_parts      = params.n_parts;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;

@@ -585,6 +592,37 @@ void console_set_color(console_state & con_st, console_color_t color) {
 }
 
 char32_t getchar32() {
+#if defined(_WIN32)
+    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+    wchar_t high_surrogate = 0;
+
+    while (true) {
+        INPUT_RECORD record;
+        DWORD count;
+        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+            return WEOF;
+        }
+
+        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+            if (wc == 0) {
+                continue;
+            }
+
+            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                high_surrogate = wc;
+                continue;
+            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                if (high_surrogate != 0) { // Check if we have a high surrogate
+                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                }
+            }
+
+            high_surrogate = 0; // Reset the high surrogate
+            return static_cast<char32_t>(wc);
+        }
+    }
+#else
     wchar_t wc = getwchar();
     if (static_cast<wint_t>(wc) == WEOF) {
         return WEOF;

@@ -603,6 +641,7 @@ char32_t getchar32() {
 #endif
 
     return static_cast<char32_t>(wc);
+#endif
 }
 
 void pop_cursor(console_state & con_st) {

@@ -756,7 +795,7 @@ bool console_readline(console_state & con_st, std::string & line) {
             break;
         }
 
-        if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
+        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
            end_of_stream = true;
            break;
        }

@@ -771,7 +810,7 @@ bool console_readline(console_state & con_st, std::string & line) {
             char32_t code = getchar32();
             if (code == '[' || code == 0x1B) {
                 // Discard the rest of the escape sequence
-                while ((code = getchar32()) != WEOF) {
+                while ((code = getchar32()) != (char32_t) WEOF) {
                     if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
                         break;
                     }
@@ -24,7 +24,6 @@ struct gpt_params {
     int32_t seed      = -1;  // RNG seed
     int32_t n_threads = get_num_physical_cores();
     int32_t n_predict = -1;  // new tokens to predict
-    int32_t n_parts   = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx     = 512; // context size
     int32_t n_batch   = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep    = 0;   // number of tokens to keep from initial prompt

@@ -45,15 +44,16 @@ struct gpt_params {
     float   mirostat_tau = 5.00f; // target entropy
     float   mirostat_eta = 0.10f; // learning rate
 
-    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
-    std::string prompt = "";
+    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model_alias       = "unknown"; // model alias
+    std::string prompt            = "";
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
     std::string input_prefix      = "";  // string to prefix user inputs with
     std::string input_suffix      = "";  // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base    = ""; // base model path for the lora adapter
 
     bool memory_f16    = true;  // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
@@ -6,7 +6,6 @@
 
 int main(int argc, char ** argv) {
     gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";
 
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;

@@ -32,6 +31,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model
@@ -69,8 +69,8 @@ In this section, we cover the most commonly used options for running the `main`
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-- `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
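A hedged example combining the options above (the model path is illustrative):

```bash
# interactive run with a 2048-token context and a 256-token generation cap
./main -m models/7B/ggml-model.bin -i -c 2048 -n 256
```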
 
 ## Input Prompts
@@ -136,9 +136,9 @@ During text generation, LLaMA models have a limited context size, which means th
 
 ### Context Size
 
-The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
 
-- `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
 
 ### Keep Prompt

@@ -146,7 +146,7 @@ The `--keep` option allows users to retain the original prompt when the model ru
 
 - `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
 
-By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
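A hedged example of these context-management options together (paths are illustrative):

```bash
# keep the entire initial prompt when the context window fills up
./main -m models/7B/ggml-model.bin -c 2048 --keep -1 -f prompts/chat-with-bob.txt -i
```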
 
 ## Generation Flags
@@ -154,11 +154,11 @@ The following options allow you to control the text generation process and fine-
 
 ### Number of Tokens to Predict
 
-- `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
 
-The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
 
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
 
 ### Temperature

@@ -170,33 +170,33 @@ Example usage: `--temp 0.5`
 
 ### Repeat Penalty
 
-- `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
-- `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
+- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
 - `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
 
-The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
 
-The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
 
 Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
 
-Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`
 
 ### Top-K Sampling
 
-- `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).
 
-Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
|
||||
|
||||
Example usage: `--top_k 30`
|
||||
Example usage: `--top-k 30`
|
||||
|
||||
### Top-P Sampling
|
||||
|
||||
- `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
|
||||
Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
|
||||
Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
|
||||
|
||||
Example usage: `--top_p 0.95`
|
||||
Example usage: `--top-p 0.95`
|
||||
|
||||
### Tail Free Sampling (TFS)
|
||||
|
||||
|
@ -217,16 +217,16 @@ Example usage: `--typical 0.9`
|
|||
### Mirostat Sampling
|
||||
|
||||
- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
|
||||
- `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
|
||||
- `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
||||
- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
|
||||
- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
|
||||
|
||||
Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
|
||||
|
||||
The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
|
||||
The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
|
||||
|
||||
The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
|
||||
The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
|
||||
|
||||
Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
|
||||
Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
|
||||
|
||||
### Logit Bias
|
||||
|
||||
|
@ -264,15 +264,15 @@ These options help improve the performance and memory usage of the LLaMA models.
|
|||
|
||||
### Memory Float 32
|
||||
|
||||
- `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
|
||||
|
||||
### Batch Size
|
||||
|
||||
- `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
|
||||
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This larger batch size benefits users who have BLAS installed and enabled during the build. If you don't have BLAS enabled (`BLAS=0`), you can use a smaller value, such as 8, to see the prompt's progress as it is evaluated.
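A short sketch of a non-BLAS run with a reduced batch size (model and prompt paths are placeholders):

```bash
# evaluate the prompt 8 tokens at a time so progress is visible sooner
./main -m models/7B/ggml-model.bin -f prompt.txt -b 8
```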
|
||||
|
||||
### Prompt Caching
|
||||
|
||||
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
|
||||
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
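A hedged sketch of typical usage (the file names are placeholders):

```bash
# the first run creates prompt.cache; later runs with the same prompt start faster
./main -m models/7B/ggml-model.bin --prompt-cache prompt.cache -f prompt.txt -n 64
```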
|
||||
|
||||
### Quantization
|
||||
|
||||
|
@ -285,5 +285,6 @@ These options provide extra functionality and customization when running the LLa
|
|||
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
|
||||
- `--verbose-prompt`: Print the prompt before generating text.
|
||||
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
|
|
@ -50,7 +50,6 @@ void sigint_handler(int signo) {
|
|||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
|
@ -97,8 +96,7 @@ int main(int argc, char ** argv) {
|
|||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
// params.prompt = R"(// this function checks if the number n is prime
|
||||
//bool is_prime(int n) {)";
|
||||
llama_init_backend();
|
||||
|
||||
llama_context * ctx;
|
||||
g_ctx = &ctx;
|
||||
|
@ -136,8 +134,6 @@ int main(int argc, char ** argv) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
std::string path_session = params.path_prompt_cache;
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
@ -157,6 +153,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
} else {
|
||||
|
@ -165,7 +162,16 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
} else {
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
|
@ -183,7 +189,9 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
n_matching_session_tokens++;
|
||||
}
|
||||
if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
|
||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
|
||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
|
@ -209,8 +217,8 @@ int main(int argc, char ** argv) {
|
|||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
|
@ -242,7 +250,7 @@ int main(int argc, char ** argv) {
|
|||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
|
@ -306,7 +314,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
// infinite text generation via context swapping
|
||||
|
@ -352,6 +360,12 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
if (i > 0) {
|
||||
// check if we've used up all the prompt but not all cached tokens
|
||||
if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) {
|
||||
// force revaluation of the last token to recalculate logits
|
||||
i--;
|
||||
n_past--;
|
||||
}
|
||||
embd.erase(embd.begin(), embd.begin() + i);
|
||||
}
|
||||
}
|
||||
|
@ -504,9 +518,8 @@ int main(int argc, char ** argv) {
|
|||
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
}
|
||||
|
||||
// in interactive mode, and not currently processing queued inputs;
|
||||
// check if we should prompt the user for more
|
||||
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// check for reverse prompt
|
||||
if (params.antiprompt.size()) {
|
||||
|
@ -517,10 +530,21 @@ int main(int argc, char ** argv) {
|
|||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
// If we're not running interactively, the reverse prompt might be tokenized with some following characters
|
||||
// so we'll compensate for that by widening the search window a bit.
|
||||
for (std::string & antiprompt : params.antiprompt) {
|
||||
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
|
||||
is_interacting = true;
|
||||
size_t extra_padding = params.interactive ? 0 : 2;
|
||||
size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
: 0;
|
||||
|
||||
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||
}
|
||||
is_antiprompt = true;
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -116,7 +116,6 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
|||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
params.n_batch = 512;
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
|
@ -144,6 +143,8 @@ int main(int argc, char ** argv) {
|
|||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_init_backend();
|
||||
|
||||
llama_context * ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
|
|
|
@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
|
|||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = 256;
|
||||
lparams.n_parts = 1;
|
||||
lparams.seed = 1;
|
||||
lparams.f16_kv = false;
|
||||
lparams.use_mlock = false;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <string>
|
||||
|
@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
|
|||
// ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
|
||||
for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
|
||||
|
@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
llama_init_backend();
|
||||
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[1];
|
||||
|
@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
const int64_t t_main_start_us = llama_time_us();
|
||||
|
||||
int64_t t_quantize_us = 0;
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
const int64_t t_start_us = llama_time_us();
|
||||
|
||||
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
t_quantize_us = ggml_time_us() - t_start_us;
|
||||
t_quantize_us = llama_time_us() - t_start_us;
|
||||
}
|
||||
|
||||
// report timing
|
||||
{
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
const int64_t t_main_end_us = llama_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
params.seed = 42;
|
||||
params.n_threads = 4;
|
||||
params.repeat_last_n = 64;
|
||||
|
@ -27,7 +26,6 @@ int main(int argc, char ** argv) {
|
|||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
|
|
8
examples/server/CMakeLists.txt
Normal file
|
@ -0,0 +1,8 @@
|
|||
set(TARGET server)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp json.hpp httplib.h)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
312
examples/server/README.md
Normal file
|
@ -0,0 +1,312 @@
|
|||
# llama.cpp/examples/server
|
||||
|
||||
This example allows you to run a llama.cpp HTTP server that you can interact with from a web page or consume through its API.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Quick Start](#quick-start)
|
||||
2. [Node JS Test](#node-js-test)
|
||||
3. [API Endpoints](#api-endpoints)
|
||||
4. [More examples](#more-examples)
|
||||
5. [Common Options](#common-options)
|
||||
6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
|
||||
|
||||
## Quick Start
|
||||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
#### Unix-based systems (Linux, macOS, etc.):
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.bin --ctx_size 2048
|
||||
```
|
||||
|
||||
#### Windows:
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.bin --ctx_size 2048
|
||||
```
|
||||
|
||||
That will start a server that listens on `127.0.0.1:8080` by default. You can consume the endpoints with Postman, or from Node.js with the axios library.
|
||||
|
||||
## Node JS Test
|
||||
|
||||
You need to have [Node.js](https://nodejs.org/en) installed.
|
||||
|
||||
```bash
|
||||
mkdir llama-client
|
||||
cd llama-client
|
||||
npm init
|
||||
npm install axios
|
||||
```
|
||||
|
||||
Create an `index.js` file and put the following inside:
|
||||
|
||||
```javascript
|
||||
const axios = require("axios");
|
||||
|
||||
const prompt = `Building a website can be done in 10 simple steps:`;
|
||||
|
||||
async function Test() {
|
||||
let result = await axios.post("http://127.0.0.1:8080/completion", {
|
||||
prompt,
|
||||
batch_size: 128,
|
||||
n_predict: 512,
|
||||
});
|
||||
|
||||
// the response is received once the completion has finished
|
||||
console.log(result.data.content);
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
And run it:
|
||||
|
||||
```bash
|
||||
node .
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
You can interact with the following API endpoints. This implementation only supports chat-style interaction.
|
||||
|
||||
- **POST** `hostname:port/completion`: Set up the llama context and begin the completion task.
|
||||
|
||||
*Options:*
|
||||
|
||||
`batch_size`: Set the batch size for prompt processing (default: 512).
|
||||
|
||||
`temperature`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
|
||||
`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
|
||||
|
||||
`threads`: Set the number of threads to use during computation.
|
||||
|
||||
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
||||
|
||||
`as_loop`: Receive each predicted token in real time instead of waiting for the completion to finish. Set to `true` to enable.
|
||||
|
||||
`interactive`: Allow interacting with the completion; generation stops as soon as a `stop word` is encountered. Set to `true` to enable.
|
||||
|
||||
`prompt`: Provide a prompt. Internally, the prompt is compared against the previously evaluated one; any part that has already been evaluated is reused, and only the remaining part is evaluated.
|
||||
|
||||
`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
|
||||
|
||||
`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
|
||||
|
||||
- **POST** `hostname:port/embedding`: Generate the embedding of a given text.
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to generate the embedding for.
|
||||
|
||||
`threads`: Set the number of threads to use during computation.
|
||||
|
||||
To use this endpoint, you need to start the server with the `--embedding` option added.
|
||||
|
||||
- **POST** `hostname:port/tokenize`: Tokenize a given text
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to tokenize.
|
||||
|
||||
- **GET** `hostname:port/next-token`: Receive the next predicted token; execute this request in a loop. Make sure to set `as_loop` to `true` in the completion request.
|
||||
|
||||
*Options:*
|
||||
|
||||
`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation.
|
||||
|
||||
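If you prefer to try the `/completion` endpoint without Node.js, here is a minimal `curl` sketch. It assumes the server is running on the default host and port; the prompt and parameter values are only illustrative:

```bash
curl --request POST --url http://127.0.0.1:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128, "threads": 8}'
```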
## More examples
|
||||
|
||||
### Interactive mode
|
||||
|
||||
This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model.
|
||||
|
||||
The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well.
|
||||
|
||||
This example works well for `Vicuna - version 1`.
|
||||
|
||||
```javascript
|
||||
const axios = require("axios");
|
||||
|
||||
let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
|
||||
### Human: Hello, Assistant.
|
||||
### Assistant: Hello. How may I help you today?
|
||||
### Human: Please tell me the largest city in Europe.
|
||||
### Assistant: Sure. The largest city in Europe is Moscow, the capital of Russia.`;
|
||||
|
||||
async function ChatCompletion(answer) {
|
||||
// append the user's next question to the prompt
|
||||
prompt += `\n### Human: ${answer}\n`
|
||||
|
||||
result = await axios.post("http://127.0.0.1:8080/completion", {
|
||||
prompt,
|
||||
batch_size: 128,
|
||||
temperature: 0.2,
|
||||
top_k: 40,
|
||||
top_p: 0.9,
|
||||
n_keep: -1,
|
||||
n_predict: 2048,
|
||||
stop: ["\n### Human:"], // when detect this, stop completion
|
||||
exclude: ["### Assistant:"], // no show in the completion
|
||||
threads: 8,
|
||||
as_loop: true, // use this to request the completion token by token
|
||||
interactive: true, // enable the detection of a stop word
|
||||
});
|
||||
|
||||
// create a loop to receive every token predicted
|
||||
// note: this operation is blocking; avoid using it in a UI thread
|
||||
|
||||
let message = "";
|
||||
while (true) {
|
||||
// you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true
|
||||
result = await axios.get("http://127.0.0.1:8080/next-token");
|
||||
process.stdout.write(result.data.content);
|
||||
message += result.data.content;
|
||||
|
||||
// to avoid an infinite loop
|
||||
if (result.data.stop) {
|
||||
console.log("Completed");
|
||||
// make sure to add the completion to the prompt.
|
||||
prompt += `### Assistant: ${message}`;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function should be called every time a question to the model is needed.
|
||||
async function Test() {
|
||||
// the server can't run inference in parallel
|
||||
await ChatCompletion("Write a long story about a time magician in a fantasy world");
|
||||
await ChatCompletion("Summary the story");
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
### Alpaca example
|
||||
|
||||
**Temporary note:** not yet tested; if you have the model, please test it and report any issues.
|
||||
|
||||
```javascript
|
||||
const axios = require("axios");
|
||||
|
||||
let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
||||
`;
|
||||
|
||||
async function DoInstruction(instruction) {
|
||||
prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`;
|
||||
result = await axios.post("http://127.0.0.1:8080/completion", {
|
||||
prompt,
|
||||
batch_size: 128,
|
||||
temperature: 0.2,
|
||||
top_k: 40,
|
||||
top_p: 0.9,
|
||||
n_keep: -1,
|
||||
n_predict: 2048,
|
||||
stop: ["### Instruction:\n\n"], // when detect this, stop completion
|
||||
exclude: [], // no show in the completion
|
||||
threads: 8,
|
||||
as_loop: true, // use this to request the completion token by token
|
||||
interactive: true, // enable the detection of a stop word
|
||||
});
|
||||
|
||||
// create a loop to receive every token predicted
|
||||
// note: this operation is blocking; avoid using it in a UI thread
|
||||
|
||||
let message = "";
|
||||
while (true) {
|
||||
result = await axios.get("http://127.0.0.1:8080/next-token");
|
||||
process.stdout.write(result.data.content);
|
||||
message += result.data.content;
|
||||
|
||||
// to avoid an infinite loop
|
||||
if (result.data.stop) {
|
||||
console.log("Completed");
|
||||
// make sure to add the completion and the user's next question to the prompt.
|
||||
prompt += message;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function should be called every time an instruction to the model is needed.
|
||||
DoInstruction("Destroy the world"); // as joke
|
||||
```
|
||||
|
||||
### Embeddings
|
||||
|
||||
First, run the server with the `--embedding` option:
|
||||
|
||||
```bash
|
||||
server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding
|
||||
```
|
||||
|
||||
Run this code in NodeJS:
|
||||
|
||||
```javascript
|
||||
const axios = require('axios');
|
||||
|
||||
async function Test() {
|
||||
let result = await axios.post("http://127.0.0.1:8080/embedding", {
|
||||
content: `Hello`,
|
||||
threads: 5
|
||||
});
|
||||
// print the embedding array
|
||||
console.log(result.data.embedding);
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
### Tokenize
|
||||
|
||||
Run this code in NodeJS:
|
||||
|
||||
```javascript
|
||||
const axios = require('axios');
|
||||
|
||||
async function Test() {
|
||||
let result = await axios.post("http://127.0.0.1:8080/tokenize", {
|
||||
content: `Hello`
|
||||
});
|
||||
// print the token array
|
||||
console.log(result.data.tokens);
|
||||
}
|
||||
|
||||
Test();
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
||||
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
- `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
|
||||
- `--host`: Set the hostname or IP address to listen on. Default: `127.0.0.1`.
|
||||
- `--port`: Set the port to listen on. Default: `8080`.
|
||||
|
||||
### RNG Seed
|
||||
|
||||
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
|
||||
|
||||
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
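As a sketch, assuming a placeholder model path, starting the server with a fixed seed should make identical completion requests reproduce the same output:

```bash
./server -m models/7B/ggml-model.bin -s 1234
```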
|
||||
|
||||
## Performance Tuning and Memory Options
|
||||
|
||||
### No Memory Mapping
|
||||
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
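For example, a hedged sketch with a placeholder model path:

```bash
# load the whole model into memory up front instead of memory-mapping it
./server -m models/7B/ggml-model.bin --no-mmap
```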
|
||||
|
||||
### Memory Float 32
|
||||
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.
|
||||
|
||||
## Limitations
|
||||
|
||||
- The current implementation of llama.cpp needs a `llama-state` to handle multiple contexts and clients, but this could require more powerful hardware.
|
8794
examples/server/httplib.h
Normal file
File diff suppressed because it is too large
24596
examples/server/json.hpp
Normal file
File diff suppressed because it is too large
742
examples/server/server.cpp
Normal file
|
@ -0,0 +1,742 @@
|
|||
#include <httplib.h>
|
||||
#include <json.hpp>
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
struct server_params
|
||||
{
|
||||
std::string hostname = "127.0.0.1";
|
||||
int32_t port = 8080;
|
||||
};
|
||||
|
||||
struct llama_server_context
|
||||
{
|
||||
bool as_loop = false;
|
||||
bool has_next_token = false;
|
||||
std::string generated_text = "";
|
||||
|
||||
int32_t num_tokens_predicted = 0;
|
||||
int32_t n_past = 0;
|
||||
int32_t n_consumed = 0;
|
||||
int32_t n_session_consumed = 0;
|
||||
int32_t n_remain = 0;
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> last_n_tokens;
|
||||
std::vector<llama_token> processed_tokens;
|
||||
std::vector<llama_token> llama_token_newline;
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<std::vector<llama_token>> no_show_words;
|
||||
std::vector<llama_token> tokens_predicted;
|
||||
|
||||
llama_context *ctx;
|
||||
gpt_params params;
|
||||
|
||||
void rewind() {
|
||||
as_loop = false;
|
||||
params.antiprompt.clear();
|
||||
no_show_words.clear();
|
||||
num_tokens_predicted = 0;
|
||||
generated_text = "";
|
||||
}
|
||||
|
||||
bool loadModel(gpt_params params_)
|
||||
{
|
||||
params = params_;
|
||||
ctx = llama_init_from_gpt_params(params);
|
||||
if (ctx == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return false;
|
||||
}
|
||||
// determine newline token
|
||||
llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
last_n_tokens.resize(params.n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool loadPrompt() {
|
||||
params.prompt.insert(0, 1, ' '); // always add a first space
|
||||
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
// compare the evaluated prompt with the new prompt
|
||||
int new_prompt_len = 0;
|
||||
for (size_t i = 0; i < prompt_tokens.size(); i++) {
|
||||
if (i < processed_tokens.size() &&
|
||||
processed_tokens[i] == prompt_tokens[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
embd_inp.push_back(prompt_tokens[i]);
|
||||
if(new_prompt_len == 0) {
|
||||
if(int32_t(i) - 1 < n_past) {
|
||||
processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
|
||||
}
|
||||
// Evaluate the new fragment prompt from the last token processed.
|
||||
n_past = processed_tokens.size();
|
||||
}
|
||||
new_prompt_len ++;
|
||||
}
|
||||
}
|
||||
if(n_past > 0 && params.interactive) {
|
||||
n_remain -= new_prompt_len;
|
||||
}
|
||||
if ((int)embd_inp.size() > params.n_ctx - 4)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
has_next_token = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void beginCompletion()
|
||||
{
|
||||
if(n_remain == 0) {
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size())
|
||||
{
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
}
|
||||
n_remain = params.n_predict;
|
||||
}
|
||||
|
||||
llama_token nextToken() {
|
||||
llama_token result = -1;
|
||||
if (embd.size() > 0)
|
||||
{
|
||||
if (n_past + (int)embd.size() > params.n_ctx)
|
||||
{
|
||||
// Reset context
|
||||
const int n_left = n_past - params.n_keep;
|
||||
n_past = std::max(1, params.n_keep);
|
||||
processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end());
|
||||
embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size());
|
||||
}
|
||||
for (int i = 0; i < (int)embd.size(); i += params.n_batch)
|
||||
{
|
||||
int n_eval = (int)embd.size() - i;
|
||||
if (n_eval > params.n_batch)
|
||||
{
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
has_next_token = false;
|
||||
return result;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
}
|
||||
embd.clear();
|
||||
if ((int)embd_inp.size() <= n_consumed && has_next_token)
|
||||
{
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
// const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++)
|
||||
{
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++)
|
||||
{
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
|
||||
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl()];
|
||||
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
|
||||
llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl)
|
||||
{
|
||||
logits[llama_token_nl()] = nl_logit;
|
||||
}
|
||||
|
||||
if (temp <= 0)
|
||||
{
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (mirostat == 1)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
}
|
||||
else if (mirostat == 2)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Temperature sampling
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
processed_tokens.push_back(id);
|
||||
num_tokens_predicted++;
|
||||
}
|
||||
|
||||
// replace end of text token with newline token when in interactive mode
|
||||
if (id == llama_token_eos() && params.interactive)
|
||||
{
|
||||
id = llama_token_newline.front();
|
||||
if (params.antiprompt.size() != 0)
|
||||
{
|
||||
// tokenize and inject first reverse prompt
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
}
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
for (auto id : embd)
|
||||
{
|
||||
result = id;
|
||||
}
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
}
|
||||
else
|
||||
{
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
while ((int)embd_inp.size() > n_consumed)
|
||||
{
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[n_consumed]);
|
||||
processed_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int)embd.size() >= params.n_batch)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (params.interactive && (int)embd_inp.size() <= n_consumed)
|
||||
{
|
||||
// check for reverse prompt
|
||||
if (params.antiprompt.size())
|
||||
{
|
||||
std::string last_output;
|
||||
for (auto id : last_n_tokens)
|
||||
{
|
||||
last_output += llama_token_to_str(ctx, id);
|
||||
}
|
||||
has_next_token = true;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
for (std::string &antiprompt : params.antiprompt)
|
||||
{
|
||||
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos)
|
||||
{
|
||||
has_next_token = false;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (n_past > 0)
|
||||
{
|
||||
has_next_token = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||
has_next_token = false;
|
||||
}
|
||||
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict != -1)
|
||||
{
|
||||
n_remain = params.n_predict;
|
||||
}
|
||||
has_next_token = n_remain != 0;
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string doCompletion()
|
||||
{
|
||||
llama_token token = nextToken();
|
||||
if (token == -1) {
|
||||
return "";
|
||||
}
|
||||
tokens_predicted.clear();
|
||||
tokens_predicted.push_back(token);
|
||||
|
||||
// Avoid adding the no-show words to the response
|
||||
for (std::vector<llama_token> word_tokens : no_show_words)
|
||||
{
|
||||
size_t match_token = 1;
|
||||
if (tokens_predicted.front() == word_tokens.front())
|
||||
{
|
||||
bool execute_matching = true;
|
||||
if (tokens_predicted.size() > 1) { // if previous tokens have been tested
|
||||
for (size_t i = 1; i < word_tokens.size(); i++)
|
||||
{
|
||||
if (i >= tokens_predicted.size()) {
|
||||
match_token = i;
|
||||
break;
|
||||
}
|
||||
if (tokens_predicted[i] == word_tokens[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
execute_matching = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (execute_matching) {
|
||||
if (match_token == word_tokens.size()) {
|
||||
return "";
|
||||
}
|
||||
token = nextToken();
|
||||
tokens_predicted.push_back(token);
|
||||
if (token == word_tokens[match_token])
|
||||
{ // the token follows the sequence
|
||||
match_token++;
|
||||
}
|
||||
else if (match_token < word_tokens.size())
|
||||
{ // the whole word sequence was not completed
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(as_loop) {
|
||||
generated_text = "";
|
||||
}
|
||||
for (llama_token tkn : tokens_predicted)
|
||||
{
|
||||
generated_text += llama_token_to_str(ctx, tkn);
|
||||
}
|
||||
return generated_text;
|
||||
}
|
||||
|
||||
std::vector<float> embedding(std::string content, int threads) {
|
||||
content.insert(0, 1, ' ');
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, content, true);
|
||||
if (tokens.size() > 0)
|
||||
{
|
||||
if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
std::vector<float> embeddings_;
|
||||
return embeddings_;
|
||||
}
|
||||
}
|
||||
const int n_embd = llama_n_embd(ctx);
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
std::vector<float> embeddings_(embeddings, embeddings + n_embd);
|
||||
return embeddings_;
|
||||
}
|
||||
};
|
||||
|
||||
using namespace httplib;
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
void server_print_usage(int /*argc*/, char **argv, const gpt_params ¶ms)
|
||||
{
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
fprintf(stderr, " --embedding enable embedding mode\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
{
|
||||
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stderr, " number of layers to store in VRAM\n");
|
||||
#endif
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
|
||||
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
|
||||
fprintf(stderr, " --host ip address to listen (default 127.0.0.1)\n");
|
||||
fprintf(stderr, " --port PORT port to listen (default 8080)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params ¶ms)
|
||||
{
|
||||
gpt_params default_params;
|
||||
std::string arg;
|
||||
bool invalid_param = false;
|
||||
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
arg = argv[i];
|
||||
if (arg == "--port")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--host")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.hostname = argv[i];
|
||||
}
|
||||
else if (arg == "-s" || arg == "--seed")
|
||||
{
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
|
||||
#endif
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.seed = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-m" || arg == "--model")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
}
|
||||
else if (arg == "-a" || arg == "--alias")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_alias = argv[i];
|
||||
}
|
||||
else if (arg == "--embedding")
|
||||
{
|
||||
params.embedding = true;
|
||||
}
|
||||
else if (arg == "-h" || arg == "--help")
|
||||
{
|
||||
server_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
||||
{
|
||||
params.memory_f16 = false;
|
||||
}
|
||||
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
server_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (invalid_param)
|
||||
{
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
server_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool parse_options_completion(json body, llama_server_context& llama, Response &res) {
|
||||
if (!body["threads"].is_null())
|
||||
{
|
||||
llama.params.n_threads = body["threads"].get<int>();
|
||||
}
|
||||
if (!body["n_predict"].is_null())
|
||||
{
|
||||
llama.params.n_predict = body["n_predict"].get<int>();
|
||||
}
|
||||
if (!body["top_k"].is_null())
|
||||
{
|
||||
llama.params.top_k = body["top_k"].get<int>();
|
||||
}
|
||||
if (!body["top_p"].is_null())
|
||||
{
|
||||
llama.params.top_p = body["top_p"].get<float>();
|
||||
}
|
||||
if (!body["temperature"].is_null())
|
||||
{
|
||||
llama.params.temp = body["temperature"].get<float>();
|
||||
}
|
||||
if (!body["batch_size"].is_null())
|
||||
{
|
||||
llama.params.n_batch = body["batch_size"].get<int>();
|
||||
}
|
||||
if (!body["n_keep"].is_null())
|
||||
{
|
||||
llama.params.n_keep = body["n_keep"].get<int>();
|
||||
}
|
||||
if (!body["as_loop"].is_null())
|
||||
{
|
||||
llama.as_loop = body["as_loop"].get<bool>();
|
||||
}
|
||||
if (!body["interactive"].is_null())
|
||||
{
|
||||
llama.params.interactive = body["interactive"].get<bool>();
|
||||
}
|
||||
if (!body["prompt"].is_null())
|
||||
{
|
||||
llama.params.prompt = body["prompt"].get<std::string>();
|
||||
}
|
||||
else
|
||||
{
|
||||
json data = {
|
||||
{"status", "error"},
|
||||
{"reason", "You need to pass the prompt"}};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.status = 400;
|
||||
return false;
|
||||
}
|
||||
if (!body["stop"].is_null())
|
||||
{
|
||||
std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
|
||||
for (std::string stop_word : stop_words)
|
||||
{
|
||||
llama.params.antiprompt.push_back(stop_word);
|
||||
llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
|
||||
}
|
||||
}
|
||||
if (!body["exclude"].is_null())
|
||||
{
|
||||
std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
|
||||
for (std::string no_show : no_show_words)
|
||||
{
|
||||
llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// own arguments required by this example
|
||||
gpt_params params;
|
||||
server_params sparams;
|
||||
|
||||
// struct that contains llama context and inference
|
||||
llama_server_context llama;
|
||||
params.model = "ggml-model.bin";
|
||||
|
||||
if (server_params_parse(argc, argv, sparams, params) == false)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.seed <= 0)
|
||||
{
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||
|
||||
// load the model
|
||||
if (!llama.loadModel(params))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
Server svr;
|
||||
|
||||
svr.Get("/", [](const Request &, Response &res)
|
||||
{ res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
|
||||
|
||||
svr.Post("/completion", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
if(llama.params.embedding) {
|
||||
json data = {
|
||||
{"status", "error"},
|
||||
{"reason", "To use completion function disable embedding mode"}};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.status = 400;
|
||||
return;
|
||||
}
|
||||
|
||||
llama.rewind();
|
||||
|
||||
if(parse_options_completion(json::parse(req.body), llama, res) == false){
|
||||
return;
|
||||
}
|
||||
|
||||
if (!llama.loadPrompt())
|
||||
{
|
||||
json data = {
|
||||
{"status", "error"},
|
||||
{"reason", "Context too long, please be more specific"}};
|
||||
res.set_content(data.dump(), "application/json");
|
||||
res.status = 400;
|
||||
return;
|
||||
}
|
||||
|
||||
llama.beginCompletion();
|
||||
if(llama.as_loop) {
|
||||
json data = {
|
||||
{"status", "done" } };
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
} else {
|
||||
// loop inference until finish completion
|
||||
while (llama.has_next_token)
|
||||
{
|
||||
llama.doCompletion();
|
||||
}
|
||||
try
|
||||
{
|
||||
json data = {
|
||||
{"model", llama.params.model_alias },
|
||||
{"content", llama.generated_text },
|
||||
{"tokens_predicted", llama.num_tokens_predicted}};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
catch (const json::exception &e)
|
||||
{
|
||||
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
||||
json data = {
|
||||
{"content", "Bad encoding token"},
|
||||
{"tokens_predicted", 0}};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
} });
|
||||
|
||||
svr.Post("/tokenize", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
json body = json::parse(req.body);
|
||||
json data = {
|
||||
{"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
});
|
||||
|
||||
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
if(!llama.params.embedding) {
|
||||
std::vector<float> empty;
|
||||
json data = {
|
||||
{"embedding", empty}};
|
||||
fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
json body = json::parse(req.body);
|
||||
std::string content = body["content"].get<std::string>();
|
||||
int threads = body["threads"].get<int>();
|
||||
json data = {
|
||||
{"embedding", llama.embedding(content, threads) } };
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
});
|
||||
|
||||
svr.Get("/next-token", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
if(llama.params.embedding) {
|
||||
res.set_content("{}", "application/json");
|
||||
return;
|
||||
}
|
||||
std::string result = "";
|
||||
if (req.has_param("stop")) {
|
||||
llama.has_next_token = false;
|
||||
} else {
|
||||
result = llama.doCompletion(); // inference next token
|
||||
}
|
||||
try {
|
||||
json data = {
|
||||
{"content", result },
|
||||
{"stop", !llama.has_next_token }};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
} catch (const json::exception &e) {
|
||||
// Some tokens have bad UTF-8 strings, the json parser is very sensitive
|
||||
json data = {
|
||||
{"content", "" },
|
||||
{"stop", !llama.has_next_token }};
|
||||
return res.set_content(data.dump(), "application/json");
|
||||
}
|
||||
});
|
||||
|
||||
fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
if(params.embedding) {
|
||||
fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
|
||||
}
|
||||
|
||||
// change hostname and port
|
||||
svr.listen(sparams.hostname, sparams.port);
|
||||
}
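
The handler signatures above (Server, Request, Response, res.set_content) look like cpp-httplib, so any HTTP client can exercise the /completion route. Below is a minimal client sketch; the host, port, and the "prompt" field are assumptions (only the "exclude" field of the request body is visible in this excerpt, and the server_params defaults are defined outside it).

// Minimal client sketch for the /completion endpoint above. Assumes the server
// listens on 127.0.0.1:8080 and that cpp-httplib is available; the "prompt"
// field is an assumption, since parse_options_completion() is only partially
// shown in this excerpt.
#include <iostream>
#include <string>
#include "httplib.h"

int main() {
    httplib::Client cli("127.0.0.1", 8080);

    const std::string body = R"({"prompt": "Hello", "exclude": ["###"]})";
    auto res = cli.Post("/completion", body, "application/json");

    if (res && res->status == 200) {
        // expected shape: {"model": ..., "content": ..., "tokens_predicted": ...}
        std::cout << res->body << std::endl;
    } else {
        std::cerr << "completion request failed" << std::endl;
    }
    return 0;
}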
|
ggml-cuda.cu (413 changed lines)
|
@ -42,19 +42,19 @@ typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y,
|
|||
#define QK4_0 32
|
||||
#define QR4_0 2
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
half d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0;
|
||||
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
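
For reference, the arithmetic behind the updated assert: with QK4_0 = 32 the block is a 2-byte fp16 delta followed by 16 packed nibbles, i.e. 18 bytes with no padding on the usual ABIs. A small sketch, assuming ggml_fp16_t is a 2-byte type as it is elsewhere in ggml:

#include <stdint.h>

typedef uint16_t fp16_sketch;                  // stand-in for ggml_fp16_t (assumed 2 bytes)
typedef struct {
    fp16_sketch d;                             // delta
    uint8_t     qs[32 / 2];                    // 16 bytes of packed 4-bit quants
} block_q4_0_sketch;

static_assert(sizeof(block_q4_0_sketch) == 2 + 16, "2-byte delta + 16 quant bytes = 18");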
|
||||
|
||||
#define QK4_1 32
|
||||
#define QR4_1 2
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float m; // min
|
||||
half d; // delta
|
||||
half m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
|
||||
#define QK5_0 32
|
||||
#define QR5_0 2
|
||||
|
@ -78,12 +78,33 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
|
|||
#define QK8_0 32
|
||||
#define QR8_0 1
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
half d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
||||
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
||||
|
||||
#define CUDA_DMMV_BLOCK_SIZE 32
|
||||
#define WARP_SIZE 32
|
||||
|
||||
#define CUDA_MUL_BLOCK_SIZE 256
|
||||
|
||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||
|
||||
// dmmv = dequantize_mul_mat_vec
|
||||
#ifndef GGML_CUDA_DMMV_X
|
||||
#define GGML_CUDA_DMMV_X 32
|
||||
#endif
|
||||
#ifndef GGML_CUDA_DMMV_Y
|
||||
#define GGML_CUDA_DMMV_Y 1
|
||||
#endif
|
||||
|
||||
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= kx) {
|
||||
return;
|
||||
}
|
||||
dst[i] = x[i] * y[i%ky];
|
||||
}
|
||||
|
||||
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
@ -170,216 +191,154 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
|
|||
v1 = __half2float(x[ib + 1]);
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
|
||||
static const int qk = QK4_0;
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
static __global__ void dequantize_block(const void * vx, float * y, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
|
||||
|
||||
const block_q4_0 * x = (const block_q4_0 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
||||
const int x1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
|
||||
static const int qk = QK4_1;
|
||||
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int x0 = (x[i].qs[j] & 0xf);
|
||||
const int x1 = (x[i].qs[j] >> 4);
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
|
||||
static const int qk = QK5_0;
|
||||
|
||||
const block_q5_0 * x = (const block_q5_0 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
|
||||
static const int qk = QK5_1;
|
||||
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
||||
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
|
||||
static const int qk = QK8_0;
|
||||
|
||||
const block_q8_0 * x = (const block_q8_0 *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
}
|
||||
}
|
||||
|
||||
template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
|
||||
const int row = blockIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
const int ib = i/qk; // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
__shared__ float tmp[block_size]; // separate sum for each thread
|
||||
tmp[tid] = 0;
|
||||
// dequantize
|
||||
float & v0 = y[iybs + iqs + 0];
|
||||
float & v1 = y[iybs + iqs + y_offset];
|
||||
dequantize_kernel(vx, ib, iqs, v0, v1);
|
||||
}
|
||||
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
|
||||
// qk = quantized weights per x block
|
||||
// qr = number of quantized weights per data value in x block
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
const int iter_stride = 2*GGML_CUDA_DMMV_X;
|
||||
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = 0; i < ncols; i += iter_stride) {
|
||||
const int col = i + vals_per_iter*tid;
|
||||
const int ib = (row*ncols + col)/qk; // x block index
|
||||
const int iqs = (col%qk)/qr; // x quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
|
||||
// dequantize
|
||||
float v0, v1;
|
||||
dequantize_kernel(vx, ib, iqs, v0, v1);
|
||||
// processing >2 values per i iter is faster for fast GPUs
|
||||
#pragma unroll
|
||||
for (int j = 0; j < vals_per_iter; j += 2) {
|
||||
// process 2 vals per j iter
|
||||
|
||||
// matrix multiplication
|
||||
tmp[tid] += v0 * y[iybs + iqs + 0];
|
||||
tmp[tid] += v1 * y[iybs + iqs + y_offset];
|
||||
// dequantize
|
||||
float v0, v1;
|
||||
dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
|
||||
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
|
||||
|
||||
// matrix multiplication
|
||||
tmp += v0 * y[iybs + iqs + j/qr + 0];
|
||||
tmp += v1 * y[iybs + iqs + j/qr + y_offset];
|
||||
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
||||
}
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
__syncthreads();
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||
}
|
||||
|
||||
if (tid == 0) {
|
||||
dst[row] = tmp[0];
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
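
The rewritten kernel replaces the shared-memory tree reduction with a register-only warp reduction. A standalone sketch of that reduction pattern, assuming the usual 32-lane warp (as WARP_SIZE above does):

// Each of the 32 lanes starts with a partial sum; after five butterfly steps
// with __shfl_xor_sync (mask 16, 8, 4, 2, 1) every lane holds the warp total,
// and lane 0 writes it out; this mirrors the tail of dequantize_mul_mat_vec.
__global__ void warp_sum_sketch(const float * x, float * out) {
    const int lane = threadIdx.x;              // one warp per block: 0..31
    float tmp = x[blockIdx.x * 32 + lane];     // this lane's partial value

#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
    }

    if (lane == 0) {
        out[blockIdx.x] = tmp;                 // reduced value for this block
    }
}
// launch sketch: warp_sum_sketch<<<num_rows, 32, 0, stream>>>(d_x, d_out);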
|
||||
|
||||
static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK4_0;
|
||||
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
|
||||
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
|
||||
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
||||
}
|
||||
|
||||
static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK4_1;
|
||||
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
|
||||
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK5_0;
|
||||
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
static void dequantize_row_q4_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<QK4_1, QR4_1, dequantize_q4_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK5_1;
|
||||
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
|
||||
static void dequantize_row_q5_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<QK5_0, QR5_0, dequantize_q5_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
|
||||
const int nb = k / QK8_0;
|
||||
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
|
||||
static void dequantize_row_q5_1_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<QK5_1, QR5_1, dequantize_q5_1><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<QK8_0, QR8_0, dequantize_q8_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
||||
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
|
||||
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
||||
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
|
||||
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
||||
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
|
||||
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
||||
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
|
||||
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
||||
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
|
||||
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
// TODO: optimize
|
||||
static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
|
||||
y[i] = __half2float(x[i]);
|
||||
}
|
||||
|
||||
static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
|
||||
convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
|
||||
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
|
||||
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
|
||||
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>
|
||||
<<<nrows/GGML_CUDA_DMMV_Y, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
|
@ -414,7 +373,7 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t
|
|||
case GGML_TYPE_Q8_0:
|
||||
return dequantize_mul_mat_vec_q8_0_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return dequantize_mul_mat_vec_q8_0_cuda;
|
||||
return convert_mul_mat_vec_f16_cuda;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -555,6 +514,67 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[2];
|
||||
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
size_t x_size, d_size;
|
||||
|
||||
float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
|
||||
float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
|
||||
float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
const int i0 = i03*ne02 + i02;
|
||||
float * c_X2 = d_X + i0*ne01*ne00;
|
||||
float * c_D2 = d_D + i0*ne01*ne00;
|
||||
|
||||
cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
|
||||
cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
|
||||
cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
|
||||
|
||||
// copy src0 to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
|
||||
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
|
||||
|
||||
// wait for data
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
|
||||
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
const int64_t i13 = i03%ne13;
|
||||
const int64_t i12 = i02%ne12;
|
||||
const int64_t i11 = i01%ne11;
|
||||
const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
|
||||
|
||||
float * c_X1 = c_X2 + i01*ne00;
|
||||
float * c_Y = d_Y + i1*ne10;
|
||||
float * c_D1 = c_D2 + i01*ne00;
|
||||
|
||||
// compute
|
||||
mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
|
||||
}
|
||||
}
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
ggml_cuda_pool_free(d_X, x_size);
|
||||
ggml_cuda_pool_free(d_D, d_size);
|
||||
}
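
ggml_cuda_mul_f32 above issues the host-to-device copy on one stream and gates the compute stream on an event recorded behind that copy. A minimal sketch of the same ordering pattern; the names and the kernel launcher argument are illustrative, not part of ggml:

#include <cuda_runtime.h>

// Copy on one stream, record an event, make the compute stream wait on it,
// then launch work on the compute stream.
void overlap_sketch(float * d_x, const float * h_x, size_t n,
                    void (*launch)(float *, cudaStream_t)) {
    cudaStream_t copy_stream, compute_stream;
    cudaEvent_t  copied;
    cudaStreamCreate(&copy_stream);
    cudaStreamCreate(&compute_stream);
    cudaEventCreate(&copied);

    cudaMemcpyAsync(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice, copy_stream);
    cudaEventRecord(copied, copy_stream);            // mark: data is on the device
    cudaStreamWaitEvent(compute_stream, copied, 0);  // compute waits only for that copy

    launch(d_x, compute_stream);                     // hypothetical kernel launcher

    cudaStreamSynchronize(compute_stream);
    cudaEventDestroy(copied);
    cudaStreamDestroy(copy_stream);
    cudaStreamDestroy(compute_stream);
}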
|
||||
|
||||
static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
|
@ -812,6 +832,11 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
|
|||
ggml_cuda_pool_free(d_Q, q_size);
|
||||
}
|
||||
|
||||
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_mul_f32(src0, src1, dst);
|
||||
}
|
||||
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
|
@ -885,14 +910,48 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
|
|||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
||||
|
||||
size_t q_size;
|
||||
char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
|
||||
char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
|
||||
|
||||
cudaStream_t cudaStream2 = g_cudaStreams2[0];
|
||||
|
||||
// copy tensor to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
int i = i3*ne2 + i2;
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
|
||||
}
|
||||
}
|
||||
|
||||
tensor->data = d_Q;
|
||||
tensor->data = dst;
|
||||
tensor->backend = GGML_BACKEND_CUDA;
|
||||
}
|
||||
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
|
||||
FILE * fp = fopen(fname, "rb");
|
||||
|
||||
const size_t size = ggml_nbytes(tensor);
|
||||
|
||||
void * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
void * buf_host = malloc(size);
|
||||
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
|
||||
#else
|
||||
int ret = fseek(fp, (long) offset, SEEK_SET);
|
||||
#endif
|
||||
GGML_ASSERT(ret == 0); // same
|
||||
|
||||
size_t ret2 = fread(buf_host, size, 1, fp);
|
||||
if (ret2 != 1) {
|
||||
fprintf(stderr, "unexpectedly reached end of file");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
tensor->data = buf;
|
||||
free(buf_host);
|
||||
fclose(fp);
|
||||
}
|
||||
|
|
|
ggml-cuda.h

@ -6,6 +6,7 @@ extern "C" {
|
|||
|
||||
void ggml_init_cublas(void);
|
||||
|
||||
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
@ -15,6 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
|
|||
void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
|
||||
void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
ggml-opencl.c (361 changed lines; file removed)
|
@ -1,361 +0,0 @@
|
|||
#include "ggml-opencl.h"
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#include <clblast_c.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
const char * clblast_dequant = MULTILINE_QUOTE(
|
||||
|
||||
typedef uchar uint8_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
struct block_q4_0
|
||||
{
|
||||
float d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
constant uint QK4_1 = 32;
|
||||
struct block_q4_1
|
||||
{
|
||||
float d;
|
||||
float m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
|
||||
constant uint QK5_0 = 32;
|
||||
struct __attribute__ ((packed)) block_q5_0
|
||||
{
|
||||
half d;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_0 / 2];
|
||||
};
|
||||
|
||||
constant uint QK5_1 = 32;
|
||||
struct block_q5_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_1 / 2];
|
||||
};
|
||||
|
||||
constant uint QK8_0 = 32;
|
||||
struct block_q8_0
|
||||
{
|
||||
float d;
|
||||
uint8_t qs[QK8_0];
|
||||
};
|
||||
|
||||
|
||||
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
|
||||
constant uint qk = QK4_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) - 8;
|
||||
const int x1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
|
||||
constant uint qk = QK4_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf);
|
||||
const int x1 = (x[i].qs[j] >> 4);
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
|
||||
constant uint qk = QK5_0;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
|
||||
uint32_t qh = x[i].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
|
||||
constant uint qk = QK5_1;
|
||||
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = vload_half(0, (__global half*) &x[i].d);
|
||||
const float m = vload_half(0, (__global half*) &x[i].m);
|
||||
|
||||
uint32_t qh = x[i].qh;
|
||||
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
|
||||
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
|
||||
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
|
||||
constant uint qk = QK8_0;
|
||||
const uint i = get_global_id(0) / qk;
|
||||
const uint j = get_local_id(0);
|
||||
|
||||
const float d = x[i].d;
|
||||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
#define CL_CHECK(err, name) \
|
||||
do { \
|
||||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static cl_platform_id platform;
|
||||
static cl_device_id device;
|
||||
static cl_context context;
|
||||
static cl_command_queue queue;
|
||||
static cl_program program;
|
||||
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
|
||||
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
|
||||
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
cl_program p;
|
||||
char *program_log;
|
||||
size_t program_size, log_size;
|
||||
int err;
|
||||
|
||||
program_size = strlen(program_buffer);
|
||||
|
||||
p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
|
||||
if(err < 0) {
|
||||
fprintf(stderr, "OpenCL error creating program");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
|
||||
if(err < 0) {
|
||||
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
program_log = (char*) malloc(log_size + 1);
|
||||
program_log[log_size] = '\0';
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
|
||||
printf("%s\n", program_log);
|
||||
free(program_log);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void ggml_cl_init(void) {
|
||||
cl_int err = 0;
|
||||
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
|
||||
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
|
||||
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
|
||||
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
|
||||
printf("\nInitializing CLBlast (First Run)...");
|
||||
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
|
||||
cl_uint num_platforms;
|
||||
clGetPlatformIDs(0, NULL, &num_platforms);
|
||||
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
|
||||
clGetPlatformIDs(num_platforms, platforms, NULL);
|
||||
platform = platforms[plat_num];
|
||||
char platform_buffer[1024];
|
||||
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
|
||||
cl_uint num_devices;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
|
||||
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
|
||||
device = devices[dev_num];
|
||||
char device_buffer[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
|
||||
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||
CL_CHECK(err, "clCreateContext");
|
||||
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
||||
CL_CHECK(err, "clCreateCommandQueue");
|
||||
|
||||
free(platforms);
|
||||
free(devices);
|
||||
|
||||
program = build_program_from_source(context, device, clblast_dequant);
|
||||
|
||||
// Prepare dequantize kernels
|
||||
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
|
||||
CL_CHECK(err, "clCreateKernel");
|
||||
}
|
||||
|
||||
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
if (req_size <= *cur_size) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Reallocate buffer with enough space
|
||||
if (*cur_size > 0) {
|
||||
clReleaseMemObject(*buf);
|
||||
}
|
||||
cl_int err;
|
||||
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
|
||||
*cur_size = req_size;
|
||||
CL_CHECK(err, "clCreateBuffer");
|
||||
}
|
||||
|
||||
void ggml_cl_sgemm_wrapper(
|
||||
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
|
||||
const int m, const int n, const int k,
|
||||
const float alpha, const void *host_a, const int lda,
|
||||
const float *host_b, const int ldb, const float beta,
|
||||
float *host_c, const int ldc, const int btype) {
|
||||
cl_int err = 0;
|
||||
|
||||
cl_kernel kernel;
|
||||
size_t global = n * k, local, size_qb;
|
||||
bool dequant;
|
||||
|
||||
switch (btype) {
|
||||
case GGML_TYPE_F32:
|
||||
dequant = false;
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_0;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q4_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q5_0;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
dequant = true;
|
||||
kernel = kernel_q5_1;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
dequant = true;
|
||||
kernel = kernel_q8_0;
|
||||
local = 32;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
|
||||
abort();
|
||||
}
|
||||
|
||||
const size_t size_a = m * k * sizeof(float);
|
||||
const size_t size_b = n * k * sizeof(float);
|
||||
const size_t size_c = m * n * sizeof(float);
|
||||
|
||||
// Prepare buffers
|
||||
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
if (dequant) {
|
||||
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
}
|
||||
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
|
||||
cl_event ev_a, ev_qb, ev_b;
|
||||
|
||||
if (dequant) {
|
||||
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
|
||||
CL_CHECK(err, "clSetKernelArg");
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer qb");
|
||||
} else {
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer b");
|
||||
}
|
||||
|
||||
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
|
||||
CL_CHECK(err, "clEnqueueWriteBuffer a");
|
||||
if (dequant) {
|
||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
|
||||
CL_CHECK(err, "clEnqueueNDRangeKernel");
|
||||
clReleaseEvent(ev_qb);
|
||||
}
|
||||
clWaitForEvents(1, &ev_a);
|
||||
clWaitForEvents(1, &ev_b);
|
||||
clReleaseEvent(ev_a);
|
||||
clReleaseEvent(ev_b);
|
||||
|
||||
cl_event ev_sgemm;
|
||||
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
|
||||
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
|
||||
m, n, k,
|
||||
alpha,
|
||||
cl_buffer_a, 0, lda,
|
||||
cl_buffer_b, 0, ldb,
|
||||
beta,
|
||||
cl_buffer_c, 0, ldc,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != CLBlastSuccess) {
|
||||
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
|
||||
abort();
|
||||
}
|
||||
|
||||
cl_event ev_c;
|
||||
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
|
||||
|
||||
// Wait for completion
|
||||
clWaitForEvents(1, &ev_c);
|
||||
clReleaseEvent(ev_sgemm);
|
||||
clReleaseEvent(ev_c);
|
||||
}
|
ggml-opencl.cpp (new file, 1028 lines; diff suppressed because it is too large)
|
ggml-opencl.h

@ -1,23 +1,21 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
|
||||
enum ggml_blas_order {
|
||||
GGML_BLAS_ORDER_ROW_MAJOR = 101,
|
||||
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
|
||||
};
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
enum ggml_blas_op {
|
||||
GGML_BLAS_OP_N = 111,
|
||||
GGML_BLAS_OP_T = 112,
|
||||
GGML_BLAS_OP_C = 113,
|
||||
};
|
||||
void * ggml_cl_host_malloc(size_t size);
|
||||
void ggml_cl_host_free(void * ptr);
|
||||
|
||||
void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
|
||||
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
ggml.h (44 changed lines)
|
@ -190,11 +190,15 @@
|
|||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||
#define GGML_FILE_VERSION 1
|
||||
|
||||
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
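
A hedged note on how these two constants are typically combined: elsewhere in the tree (outside this diff) the quantization version is folded into the stored file type as ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR, and readers split it back out. Sketch of the reader side, with illustrative names:

#include <stdint.h>

// stored = ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR
// e.g. ftype 2 written under quantization version 2 is stored as 2002.
static void split_stored_ftype(uint32_t stored, uint32_t * qnt_version, uint32_t * ftype) {
    *qnt_version = stored / 1000;   // GGML_QNT_VERSION_FACTOR
    *ftype       = stored % 1000;
}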
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 256
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_MAX_NAME 32
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
|
@ -246,6 +250,7 @@ extern "C" {
|
|||
enum ggml_backend {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_CUDA = 1,
|
||||
GGML_BACKEND_CL = 2,
|
||||
};
|
||||
|
||||
// model file types
|
||||
|
@ -313,6 +318,7 @@ extern "C" {
|
|||
GGML_OP_ROPE,
|
||||
GGML_OP_ROPE_BACK,
|
||||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D_1S,
|
||||
GGML_OP_CONV_1D_2S,
|
||||
|
||||
|
@ -344,7 +350,7 @@ extern "C" {
|
|||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_type type;
|
||||
enum ggml_backend backend;
|
||||
|
||||
int n_dims;
|
||||
|
@ -374,11 +380,13 @@ extern "C" {
|
|||
|
||||
void * data;
|
||||
|
||||
char name[32];
|
||||
char name[GGML_MAX_NAME];
|
||||
|
||||
char padding[9]; // TODO: remove and add padding to name?
|
||||
char padding[16];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int n_nodes;
|
||||
|
@ -431,6 +439,7 @@ extern "C" {
|
|||
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
|
||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
|
@ -439,6 +448,9 @@ extern "C" {
|
|||
// TODO: temporary until model loading of ggml examples is refactored
|
||||
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||
|
||||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
|
@ -446,7 +458,11 @@ extern "C" {
|
|||
|
||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
||||
|
||||
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
||||
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -486,6 +502,8 @@ extern "C" {
|
|||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
@ -871,7 +889,7 @@ extern "C" {
|
|||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
@ -930,7 +948,16 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_head);
|
||||
int n_head,
|
||||
float bias_max);
|
||||
|
||||
// clamp
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_clamp(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float min,
|
||||
float max);
|
||||
|
||||
// padding = 1
|
||||
// TODO: we don't support extra parameters for now
|
||||
|
@ -1013,6 +1040,11 @@ extern "C" {
|
|||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
||||
|
||||
// print info and performance information for the graph
|
||||
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
|
||||
|
|
llama-util.h (46 changed lines)
|
@ -101,12 +101,12 @@ struct llama_file {
|
|||
LLAMA_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
void read_raw(void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
std::size_t ret = std::fread(ptr, len, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
|
@ -127,12 +127,12 @@ struct llama_file {
|
|||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
void write_raw(const void * ptr, size_t len) const {
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
size_t ret = std::fwrite(ptr, len, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
|
@ -172,7 +172,7 @@ struct llama_mmap {
|
|||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
|
@ -184,9 +184,9 @@ struct llama_mmap {
|
|||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
if (prefetch > 0) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
|
@ -267,9 +267,9 @@ struct llama_mlock {
|
|||
}
|
||||
}
|
||||
|
||||
void init(void * addr) {
|
||||
LLAMA_ASSERT(this->addr == NULL && this->size == 0);
|
||||
this->addr = addr;
|
||||
void init(void * ptr) {
|
||||
LLAMA_ASSERT(addr == NULL && size == 0);
|
||||
addr = ptr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
|
@ -340,14 +340,14 @@ struct llama_mlock {
|
|||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * addr, size_t size) {
|
||||
bool raw_lock(void * ptr, size_t len) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(addr, size)) {
|
||||
if (VirtualLock(ptr, len)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
size, this->size, llama_format_win_err(GetLastError()).c_str());
|
||||
len, size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -363,7 +363,7 @@ struct llama_mlock {
|
|||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = size + 1048576;
|
||||
size_t increment = len + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
|
@ -375,8 +375,8 @@ struct llama_mlock {
|
|||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (!VirtualUnlock(addr, size)) {
|
||||
void raw_unlock(void * ptr, size_t len) {
|
||||
if (!VirtualUnlock(ptr, len)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
|
@ -388,12 +388,12 @@ struct llama_mlock {
|
|||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
bool raw_lock(const void * addr, size_t len) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t size) {}
|
||||
void raw_unlock(const void * addr, size_t len) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -404,10 +404,10 @@ struct llama_buffer {
|
|||
|
||||
llama_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
void resize(size_t len) {
|
||||
delete[] addr;
|
||||
addr = new uint8_t[size];
|
||||
this->size = size;
|
||||
addr = new uint8_t[len];
|
||||
size = len;
|
||||
}
|
||||
|
||||
~llama_buffer() {
|
||||
|
|
llama.cpp (288 changed lines)
|
@ -1,6 +1,7 @@
|
|||
// Defines fileno on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
@ -11,6 +12,8 @@
|
|||
#include "ggml.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
|
@ -45,6 +48,7 @@ enum e_model {
|
|||
MODEL_65B,
|
||||
};
|
||||
|
||||
|
||||
static const size_t MB = 1024*1024;
|
||||
|
||||
// computed for n_ctx == 2048
|
||||
|
@ -110,7 +114,7 @@ struct llama_hparams {
|
|||
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
return memcmp(this, &other, sizeof(llama_hparams));
|
||||
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -406,6 +410,7 @@ enum llama_file_version {
|
|||
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
||||
LLAMA_FILE_VERSION_GGJT_V1, // added padding
|
||||
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
||||
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
|
||||
};
|
||||
|
||||
struct llama_file_loader {
|
||||
|
@ -424,24 +429,30 @@ struct llama_file_loader {
|
|||
}
|
||||
void read_magic() {
|
||||
uint32_t magic = file.read_u32();
|
||||
uint32_t version = 0;
|
||||
|
||||
if (magic != 'ggml') {
|
||||
version = file.read_u32();
|
||||
}
|
||||
|
||||
if (magic == 'ggml' && version == 0) {
|
||||
if (magic == LLAMA_FILE_MAGIC_GGML) {
|
||||
file_version = LLAMA_FILE_VERSION_GGML;
|
||||
} else if (magic == 'ggmf' && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGMF_V1;
|
||||
} else if (magic == 'ggjt' && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
||||
} else if (magic == 'ggjt' && version == 2) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
||||
} else {
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t version = file.read_u32();
|
||||
|
||||
switch (magic) {
|
||||
case LLAMA_FILE_MAGIC_GGMF:
|
||||
switch (version) {
|
||||
case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
|
||||
}
|
||||
break;
|
||||
case LLAMA_FILE_MAGIC_GGJT:
|
||||
switch (version) {
|
||||
case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
|
||||
case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
|
||||
case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
|
||||
}
|
||||
}
|
||||
|
||||
throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
}
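
For reference, the LLAMA_FILE_MAGIC_* names used above are defined in llama.h (outside this diff) as four ASCII bytes packed into a uint32_t; the first one matches GGML_FILE_MAGIC (0x67676d6c) from the ggml.h hunk earlier in this diff. A sketch of the values, with illustrative macro names:

// Reference values only; the authoritative definitions live in llama.h.
#define LLAMA_FILE_MAGIC_GGML_SKETCH 0x67676d6cu   // "ggml"
#define LLAMA_FILE_MAGIC_GGMF_SKETCH 0x67676d66u   // "ggmf"
#define LLAMA_FILE_MAGIC_GGJT_SKETCH 0x67676a74u   // "ggjt"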
|
||||
void read_hparams() {
|
||||
hparams.n_vocab = file.read_u32();
|
||||
|
@ -499,7 +510,7 @@ struct llama_file_loader {
|
|||
|
||||
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
||||
// skip to the next multiple of 32 bytes
|
||||
file.seek(-file.tell() & 31, SEEK_CUR);
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
}
|
||||
shard.file_idx = file_idx;
|
||||
shard.file_off = file.tell();
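
A worked example of the 32-byte alignment seek used in this hunk (and again in llama_file_saver below): the amount to skip forward is -tell() & 31, and the ptrdiff_t cast avoids applying unary minus to an unsigned value, which some compilers warn about.

#include <cstddef>

// e.g. tell() == 100: -(ptrdiff_t)100 & 31 == 28, and 100 + 28 == 128,
// the next multiple of 32. The result is always in the range [0, 31].
static size_t bytes_to_next_32(size_t off) {
    return static_cast<size_t>(-static_cast<ptrdiff_t>(off) & 31);
}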
|
||||
|
@ -574,7 +585,7 @@ struct llama_file_saver {
|
|||
file.write_u32(new_type);
|
||||
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
||||
file.write_raw(tensor.name.data(), tensor.name.size());
|
||||
file.seek(-file.tell() & 31, SEEK_CUR);
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
|
||||
file.write_raw(new_data, new_size);
|
||||
}
|
||||
|
@ -641,7 +652,7 @@ struct llama_model_loader {
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
||||
auto it = tensors_map.name_to_idx.find(name);
|
||||
if (it == tensors_map.name_to_idx.end()) {
|
||||
throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
||||
|
@ -652,10 +663,10 @@ struct llama_model_loader {
|
|||
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
||||
}
|
||||
|
||||
return get_tensor_for(lt);
|
||||
return get_tensor_for(lt, backend);
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
|
||||
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
|
||||
struct ggml_tensor * tensor;
|
||||
if (lt.ne.size() == 2) {
|
||||
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
||||
|
@ -665,6 +676,7 @@ struct llama_model_loader {
|
|||
}
|
||||
ggml_set_name(tensor, lt.name.c_str());
|
||||
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
||||
tensor->backend = backend;
|
||||
lt.ggml_tensor = tensor;
|
||||
num_ggml_tensors_created++;
|
||||
return tensor;
|
||||
|
@ -678,12 +690,16 @@ struct llama_model_loader {
|
|||
|
||||
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
||||
size_t data_size = 0;
|
||||
size_t prefetch_size = 0;
|
||||
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
||||
data_size += lt.size;
|
||||
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
||||
prefetch_size += lt.size;
|
||||
}
|
||||
}
|
||||
|
||||
if (use_mmap) {
|
||||
mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
|
||||
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
|
||||
if (!lmlock) {
|
||||
// Don't call the callback since the actual loading will be lazy
|
||||
// and we can't measure it.
|
||||
|
@ -696,6 +712,9 @@ struct llama_model_loader {
|
|||
|
||||
size_t done_size = 0;
|
||||
for (llama_load_tensor & lt : tensors_map.tensors) {
|
||||
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
|
||||
continue;
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
||||
}
|
||||
|
@ -708,9 +727,6 @@ struct llama_model_loader {
|
|||
lmlock->grow_to(done_size);
|
||||
}
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0f, progress_callback_user_data);
|
||||
}
|
||||
}
|
||||
|
||||
void load_data_for(llama_load_tensor & lt) {
|
||||
|
@ -812,10 +828,9 @@ static bool kv_cache_init(
|
|||
struct llama_context_params llama_context_default_params() {
|
||||
struct llama_context_params result = {
|
||||
/*.n_ctx =*/ 512,
|
||||
/*.n_parts =*/ -1,
|
||||
/*.gpu_layers =*/ 0,
|
||||
/*.seed =*/ -1,
|
||||
/*.f16_kv =*/ false,
|
||||
/*.f16_kv =*/ true,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
|
@ -836,6 +851,21 @@ bool llama_mlock_supported() {
|
|||
return llama_mlock::SUPPORTED;
|
||||
}
|
||||
|
||||
void llama_init_backend() {
|
||||
ggml_time_init();
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t llama_time_us() {
|
||||
return ggml_time_us();
|
||||
}
|
||||
|
||||
//
|
||||
// model loading
|
||||
//
|
||||
|
@ -845,7 +875,8 @@ static const char *llama_file_version_name(llama_file_version version) {
|
|||
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
|
@@ -931,11 +962,19 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
}

if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
}
}

if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
}
}
@@ -948,27 +987,7 @@ static void llama_model_load_internal(
size_t ctx_size;
size_t mmapped_size;
ml->calc_sizes(&ctx_size, &mmapped_size);
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);

// print memory requirements
{
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
mmapped_size +
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);

// this is the memory required by one llama_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type);

fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
}
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);

// create the ggml context
{
@@ -990,7 +1009,14 @@ static void llama_model_load_internal(
}
}

#ifdef GGML_USE_CUBLAS
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
#else
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
#endif

// prepare memory for the weights
size_t vram_total = 0;
{
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
@@ -998,33 +1024,87 @@ static void llama_model_load_internal(

ml->ggml_ctx = ctx;

model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
model.norm = ml->get_tensor("norm.weight", {n_embd});
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);

// "output" tensor
{
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) { // NOLINT
backend_output = LLAMA_BACKEND_OFFLOAD;
} else {
backend_output = GGML_BACKEND_CPU;
}

model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
}

const int i_gpu_start = n_layer - n_gpu_layers;

model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;

auto & layer = model.layers[i];

std::string layers_i = "layers." + std::to_string(i);

layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);

layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);

layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);

layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);

if (backend == GGML_BACKEND_CUDA) {
vram_total +=
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
}
}
}

ml->done_getting_tensors();

// print memory requirements
{
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
mmapped_size - vram_total + // weights in VRAM not in memory
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);

// this is the memory required by one llama_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type);

fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

#ifdef GGML_USE_CUBLAS
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
}
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
#elif !defined(GGML_USE_CLBLAST)
(void) n_gpu_layers;
#endif
}

// populate `tensors_by_name`
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1032,37 +1112,61 @@ static void llama_model_load_internal(

ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

model.mapping = std::move(ml->mapping);
#ifdef GGML_USE_CUBLAS
{
size_t done_size = 0;
size_t data_size = 0;
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
data_size += lt.size;
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
done_size += lt.size;
}
}
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
continue;
}
if (progress_callback) {
progress_callback((float) done_size / data_size, progress_callback_user_data);
}
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
done_size += lt.size;
}
}
#elif defined(GGML_USE_CLBLAST)
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);

size_t vram_total = 0;

for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];

ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
}
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
}

fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
}
#else
(void) n_gpu_layers;
#endif

if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}

model.mapping = std::move(ml->mapping);

// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
@@ -1160,10 +1264,8 @@ static bool llama_eval_internal(
{
cur = ggml_rms_norm(ctx0, inpL);

// cur = attention_norm*cur
cur = ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
cur);
// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
}

// self-attention
@@ -1270,10 +1372,8 @@ static bool llama_eval_internal(
{
cur = ggml_rms_norm(ctx0, inpFF);

// cur = ffn_norm*cur
cur = ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
cur);
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
}

struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1310,10 +1410,8 @@ static bool llama_eval_internal(

inpL = ggml_rms_norm(ctx0, inpL);

// inpL = norm*inpL
inpL = ggml_mul(ctx0,
ggml_repeat(ctx0, model.norm, inpL),
inpL);
// inpL = inpL*norm(broadcasted)
inpL = ggml_mul(ctx0, inpL, model.norm);

embeddings = inpL;
}
@@ -2141,7 +2239,7 @@ struct llama_context * llama_init_from_file(
unsigned * cur_percentage_p = (unsigned *) ctx;
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
++*cur_percentage_p;
*cur_percentage_p = percentage;
fprintf(stderr, ".");
fflush(stderr);
if (percentage >= 100) {
@@ -2234,7 +2332,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
{
uint32_t magic;
fin.read((char *) &magic, sizeof(magic));
if (magic != 'ggla') {
if (magic != LLAMA_FILE_MAGIC_GGLA) {
fprintf(stderr, "%s: bad file magic\n", __func__);
return 1;
}
@@ -2298,7 +2396,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

// maybe this should in llama_model_loader
if (model_loader->use_mmap) {
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
}
}
@@ -2391,7 +2489,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
}
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt);
lt.ggml_tensor->data = lt.data;
@@ -2617,8 +2715,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
}

// Sets the state reading from the specified source address
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
const uint8_t * inp = src;
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
uint8_t * inp = src;

// set rng
{
53 llama.h
@@ -19,12 +19,23 @@
#  define LLAMA_API
#endif

#define LLAMA_FILE_VERSION 2
#define LLAMA_FILE_MAGIC 'ggjt'
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
#define LLAMA_SESSION_MAGIC 'ggsn'
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

#define LLAMA_FILE_VERSION 3
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 1

#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif

#ifdef __cplusplus
extern "C" {
#endif
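The hunk above replaces the multi-character literals ('ggjt', 'ggml', ...) with explicit unsigned constants of the same value, which avoids relying on implementation-defined character-literal behavior. A small sketch of how a reader of the format might validate a header against these values; check_header is a hypothetical helper written for illustration, not code from this commit:

#include <cstdint>
#include <cstdio>

// Same values as the constants introduced above.
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION    3

// check_header is a hypothetical helper for illustration only.
static bool check_header(uint32_t magic, uint32_t version) {
    if (magic != LLAMA_FILE_MAGIC_GGJT) {
        fprintf(stderr, "bad file magic: 0x%08x\n", magic);
        return false;
    }
    if (version != LLAMA_FILE_VERSION) {
        fprintf(stderr, "unsupported file version %u (expected %u)\n", version, (unsigned) LLAMA_FILE_VERSION);
        return false;
    }
    return true;
}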
@@ -40,9 +51,9 @@ extern "C" {
typedef int llama_token;

typedef struct llama_token_data {
llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token
llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token
} llama_token_data;

typedef struct llama_token_data_array {
@@ -55,7 +66,6 @@ extern "C" {

struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int n_gpu_layers; // number of layers to store in VRAM
int seed; // RNG seed, -1 for random
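With n_gpu_layers added to llama_context_params and LLAMA_SUPPORTS_GPU_OFFLOAD defined above, a caller can request partial GPU offload at load time. A sketch only; the helper name, layer count, and path handling are placeholders, not from this commit:

#include "llama.h"

// open_model_with_offload is a placeholder helper, not part of llama.cpp.
static struct llama_context * open_model_with_offload(const char * path) {
    llama_context_params params = llama_context_default_params();
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    params.n_gpu_layers = 20; // assumed value: layers kept in VRAM; the rest stay on the CPU
#endif
    return llama_init_from_file(path, params);
}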
@@ -74,16 +84,16 @@ extern "C" {

// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};

LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,6 +101,13 @@ extern "C" {
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();

// TODO: not great API - very likely to change
// Initialize the llama + ggml backend
// Call once at the start of the program
LLAMA_API void llama_init_backend();

LLAMA_API int64_t llama_time_us();

// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
@@ -139,7 +156,7 @@ extern "C" {

// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

// Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
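The signature change above drops const from the source pointer of llama_set_state_data. A save/restore sketch; it assumes llama_get_state_size() from the same header (not shown in this hunk), and the helper names are placeholders rather than library API:

#include "llama.h"

#include <vector>

// snapshot/restore are placeholder helpers; llama_get_state_size() is assumed
// to be declared elsewhere in llama.h (it is not part of this hunk).
static std::vector<uint8_t> snapshot(struct llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    buf.resize(llama_copy_state_data(ctx, buf.data())); // shrink to bytes actually written
    return buf;
}

static void restore(struct llama_context * ctx, std::vector<uint8_t> & buf) {
    // The buffer must be non-const to match the signature change above.
    llama_set_state_data(ctx, buf.data());
}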
@@ -1,6 +1,10 @@
#include "llama.h"
#include "ggml.h"
#include <cassert>
#include "llama.h"

#ifdef NDEBUG
#undef NDEBUG
#endif

#include <cmath>
#include <numeric>
#include <cassert>
@@ -8,7 +12,6 @@
#include <vector>
#include <algorithm>


void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) {
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);