From 60ecf099eddfe70fec797ef6790572e452054add Mon Sep 17 00:00:00 2001
From: Martin Schwaighofer <mschwaig@users.noreply.github.com>
Date: Sun, 28 Jan 2024 12:59:43 +0100
Subject: [PATCH 1/5] add Vulkan support to Nix flake

---
 .devops/nix/package.nix | 21 +++++++++++++++++----
 flake.nix               |  1 +
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index a868a9a61..ad23f7dd7 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -13,18 +13,22 @@
   cudaPackages,
   darwin,
   rocmPackages,
+  vulkan-headers,
+  vulkan-loader,
   clblast,
   useBlas ? builtins.all (x: !x) [
     useCuda
     useMetalKit
     useOpenCL
     useRocm
+    useVulkan
   ],
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
   useMpi ? false, # Increases the runtime closure size by ~700M
   useOpenCL ? false,
   useRocm ? config.rocmSupport,
+  useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
 }@inputs:
 
@@ -48,7 +52,8 @@ let
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
     ++ lib.optionals useOpenCL [ "OpenCL" ]
-    ++ lib.optionals useRocm [ "ROCm" ];
+    ++ lib.optionals useRocm [ "ROCm" ]
+    ++ lib.optionals useVulkan [ "Vulkan" ];
 
   pnameSuffix =
     strings.optionalString (suffices != [ ])
@@ -108,6 +113,11 @@ let
     hipblas
     rocblas
   ];
+
+  vulkanBuildInputs = [
+    vulkan-headers
+    vulkan-loader
+  ];
 in
 
 effectiveStdenv.mkDerivation (
@@ -164,7 +174,8 @@ effectiveStdenv.mkDerivation (
       ++ optionals useCuda cudaBuildInputs
       ++ optionals useMpi [ mpi ]
       ++ optionals useOpenCL [ clblast ]
-      ++ optionals useRocm rocmBuildInputs;
+      ++ optionals useRocm rocmBuildInputs
+      ++ optionals useVulkan vulkanBuildInputs;
 
     cmakeFlags =
       [
@@ -178,6 +189,7 @@ effectiveStdenv.mkDerivation (
         (cmakeBool "LLAMA_HIPBLAS" useRocm)
         (cmakeBool "LLAMA_METAL" useMetalKit)
         (cmakeBool "LLAMA_MPI" useMpi)
+        (cmakeBool "LLAMA_VULKAN" useVulkan)
       ]
       ++ optionals useCuda [
         (
@@ -218,6 +230,7 @@ effectiveStdenv.mkDerivation (
         useMpi
         useOpenCL
         useRocm
+        useVulkan
         ;
 
       shell = mkShell {
@@ -242,11 +255,11 @@ effectiveStdenv.mkDerivation (
       # Configurations we don't want even the CI to evaluate. Results in the
       # "unsupported platform" messages. This is mostly a no-op, because
       # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+      badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
 
       # Configurations that are known to result in build failures. Can be
       # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
+      broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
 
       description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
       homepage = "https://github.com/ggerganov/llama.cpp/";
diff --git a/flake.nix b/flake.nix
index a776ba024..ad2f9b295 100644
--- a/flake.nix
+++ b/flake.nix
@@ -157,6 +157,7 @@
 
                 mpi-cpu = config.packages.default.override { useMpi = true; };
                 mpi-cuda = config.packages.default.override { useMpi = true; };
+                vulkan = config.packages.default.override { useVulkan = true; };
               }
               // lib.optionalAttrs (system == "x86_64-linux") {
                 rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;

From 3cc5ed353c07201d8d5b98b0a4713ab633da6d04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 3 Feb 2024 20:14:59 +0100
Subject: [PATCH 2/5] make: fix nvcc optimization flags for host code (#5309)

---
 Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index a55d15888..40b16e0ea 100644
--- a/Makefile
+++ b/Makefile
@@ -109,6 +109,7 @@ MK_NVCCFLAGS  += -O3
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
+MK_NVCCFLAGS  += -O3
 endif
 
 # clock_gettime came in POSIX.1b (1993)
@@ -365,7 +366,7 @@ ifdef LLAMA_CUBLAS
 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
-	MK_NVCCFLAGS  = -use_fast_math
+	MK_NVCCFLAGS += -use_fast_math
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT

From 3c0d25c4756742ebf15ad44700fabc0700c638bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Sat, 3 Feb 2024 20:15:13 +0100
Subject: [PATCH 3/5] make: add nvcc info print (#5310)

---
 Makefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 40b16e0ea..21d5e15ba 100644
--- a/Makefile
+++ b/Makefile
@@ -553,8 +553,11 @@ $(info I CFLAGS:    $(CFLAGS))
 $(info I CXXFLAGS:  $(CXXFLAGS))
 $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
-$(info I CC:        $(shell $(CC) --version | head -n 1))
-$(info I CXX:       $(shell $(CXX) --version | head -n 1))
+$(info I CC:        $(shell $(CC)   --version | head -n 1))
+$(info I CXX:       $(shell $(CXX)  --version | head -n 1))
+ifdef LLAMA_CUBLAS
+$(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
+endif # LLAMA_CUBLAS
 $(info )
 
 #

From 277fad30c60ef3559dc2d01b19d05e659d40a824 Mon Sep 17 00:00:00 2001
From: Welby Seely <welbyseely@gmail.com>
Date: Sat, 3 Feb 2024 23:18:51 -0500
Subject: [PATCH 4/5] cmake : use set() for LLAMA_WIN_VER (#5298)

option() is specifically for booleans.

Fixes #5158
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c156c4824..8c04e4c19 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,7 +79,7 @@ if (NOT MSVC)
 endif()
 
 if (WIN32)
-    option(LLAMA_WIN_VER                     "llama: Windows Version"                           0x602)
+    set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
 endif()
 
 # 3rd party libs

From 5ed26e1fc9fab4ce96ecf2d84183fe45bdcab0d4 Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Sun, 4 Feb 2024 10:39:58 +0200
Subject: [PATCH 5/5] Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 examples/imatrix/imatrix.cpp | 116 +++++++++++++++++++++++++++++++++--
 1 file changed, 112 insertions(+), 4 deletions(-)

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index ea06fcdbf..bc9f6fa68 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -36,6 +36,8 @@ public:
     void set_parameters(StatParams&& params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix() const;
+    bool load_imatrix(const char * file_name, bool add);
+    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
 private:
     std::unordered_map<std::string, Stats> m_stats;
     StatParams                             m_params;
@@ -189,6 +191,100 @@ void IMatrixCollector::save_imatrix(const char * fname) const {
     }
 }
 
+// Reads the importance-matrix file imatrix_file and merges it into imatrix_data.
+//
+// Layout consumed below: an int entry count, then for each entry an int name
+// length, the name bytes (no terminator), an int ncall, an int nval and nval
+// floats.
+//
+// An entry whose name is new (or whose stored values are empty) takes the
+// values and ncall read from the file. An entry that already holds values is
+// accumulated instead: values are added element-wise and ncall is summed, so
+// that loading several files in a row, as --combine does, merges them rather
+// than keeping only the last file read.
+// NOTE(review): summing assumes the stored values are running sums over ncall
+// calls -- confirm against save_imatrix(), whose body is outside this hunk.
+//
+// Returns false if the file cannot be opened or holds no entries. Also returns
+// false, after clearing imatrix_data, if an entry is truncated or malformed or
+// if its value count differs from an already loaded entry of the same name.
+bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
+    std::ifstream in(imatrix_file, std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n",__func__,imatrix_file);
+        return false;
+    }
+    int n_entries = 0;
+    in.read((char*)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file);
+        return false;
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len = 0;
+        in.read((char *)&len, sizeof(len));
+        // Validate the length before it is used as a buffer size and an index:
+        // a failed read or a negative value would otherwise write out of bounds.
+        if (in.fail() || len < 1) {
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+            imatrix_data = {};
+            return false;
+        }
+        std::vector<char> name_as_vec(size_t(len) + 1);
+        in.read(name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+            imatrix_data = {};
+            return false;
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        int ncall = 0;
+        in.read((char*)&ncall, sizeof(ncall));
+        int nval = 0;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n",__func__,i+1);
+            imatrix_data = {};
+            return false;
+        }
+        // Read into a scratch buffer so that an existing entry is only modified
+        // once the data for this entry is known to be complete.
+        std::vector<float> values(nval);
+        in.read((char*)values.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n",__func__,i+1);
+            imatrix_data = {};
+            return false;
+        }
+        auto& e = imatrix_data[std::move(name)];
+        if (e.values.empty()) {
+            e.values = std::move(values);
+            e.ncall  = ncall;
+        } else if (e.values.size() != values.size()) {
+            printf("%s: entry %d has %d values but %d were already loaded under the same name\n",__func__,i+1,nval,int(e.values.size()));
+            imatrix_data = {};
+            return false;
+        } else {
+            for (size_t j = 0; j < values.size(); ++j) {
+                e.values[j] += values[j];
+            }
+            e.ncall += ncall;
+        }
+    }
+    return true;
+}
+
+// Loads file_name into this collector's statistics (m_stats). With add == false
+// the existing statistics are discarded first; with add == true the file is
+// merged into what is already held. Returns the static overload's result.
+bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
+    if (!add) {
+        m_stats.clear();
+    }
+    return load_imatrix(file_name, m_stats);
+}
+
 static IMatrixCollector g_collector;
 
 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -269,7 +322,7 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
 
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
@@ -282,6 +335,15 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
+    if (from_chunk > 0) {
+        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+            return false;
+        }
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+    }
+
     if (int(tokens.size()) < 2*n_ctx) {
         fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
                 n_ctx);
@@ -402,7 +464,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 int main(int argc, char ** argv) {
 
     StatParams sparams;
+    std::string prev_result_file;
+    std::string combine_files;
     bool compute_ppl = true;
+    int  from_chunk  = 0;
     std::vector<char*> args;
     args.push_back(argv[0]);
     int iarg = 1;
@@ -423,6 +488,13 @@ int main(int argc, char ** argv) {
             compute_ppl = false;
         } else if (arg == "--keep-imatrix") {
             sparams.keep_every = std::stoi(argv[++iarg]);
+        } else if (arg == "--continue-from") {
+            prev_result_file = argv[++iarg];
+        } else if (arg == "--combine") {
+            combine_files = argv[++iarg];
+        }
+        else if (arg == "--from-chunk") {
+            from_chunk = std::stoi(argv[++iarg]);
         } else {
             args.push_back(argv[iarg]);
         }
@@ -436,14 +508,50 @@ int main(int argc, char ** argv) {
         }
     }
 
+    g_collector.set_parameters(std::move(sparams));
+
+    if (!combine_files.empty()) {
+        std::vector<std::string> files;
+        size_t pos = 0;
+        while (true) {
+            auto new_pos = combine_files.find(',', pos);
+            if (new_pos != std::string::npos) {
+                files.emplace_back(combine_files.substr(pos, new_pos - pos));
+                pos = new_pos + 1;
+            } else {
+                files.emplace_back(combine_files.substr(pos));
+                break;
+            }
+        }
+        if (files.size() < 2) {
+            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+            return 1;
+        }
+        printf("Combining the following %d files\n", int(files.size()));
+        for (auto& file : files) {
+            printf("    %s\n", file.c_str());
+            if (!g_collector.load_imatrix(file.c_str(), true)) {
+                fprintf(stderr, "Failed to load %s\n", file.c_str());
+                return 1;
+            }
+        }
+        g_collector.save_imatrix();
+        return 0;
+    }
+
+    if (!prev_result_file.empty()) {
+        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
+            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
+            return 1;
+        }
+    }
+
     gpt_params params;
     params.n_batch = 512;
     if (!gpt_params_parse(args.size(), args.data(), params)) {
         return 1;
     }
 
-    g_collector.set_parameters(std::move(sparams));
-
     params.logits_all = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
@@ -495,7 +603,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", get_system_info(params).c_str());
     }
 
-    bool OK = compute_imatrix(ctx, params, compute_ppl);
+    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
     if (!OK) {
         return 1;
     }