From 0fd6c1f015f6cccf3b527f7dbd8386a434728126 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 10:12:29 +0200
Subject: [PATCH 01/31] embedding : print cosine similarity (#899)

---
 common/common.cpp                | 13 +++++++++++++
 common/common.h                  |  1 +
 examples/embedding/embedding.cpp | 21 ++++++++++++++++-----
 examples/gritlm/gritlm.cpp       | 26 ++++++--------------------
 4 files changed, 36 insertions(+), 25 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index 73b1b61ba..58fbd05aa 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1877,3 +1877,16 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+    double sum  = 0.0;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum  += embd1[i] * embd2[i];
+        sum1 += embd1[i] * embd1[i];
+        sum2 += embd2[i] * embd2[i];
+    }
+
+    return sum / (sqrt(sum1) * sqrt(sum2));
+}
diff --git a/common/common.h b/common/common.h
index 0f178b9eb..d250eef8b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -268,3 +268,4 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 
 void llama_embd_normalize(const float * inp, float * out, int n);
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 49302a199..f390c4061 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -168,14 +168,25 @@ int main(int argc, char ** argv) {
     batch_decode(ctx, batch, out, s, n_embd);
 
     // print first 3 embeddings
+    fprintf(stdout, "\n");
     for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+        fprintf(stdout, "embedding %d: ", j);
+        for (int i = 0; i < std::min(16, n_embd); i++) {
+            fprintf(stdout, "%f ", emb[j * n_embd + i]);
         }
-        fprintf(stderr, "\n\n");
+        fprintf(stdout, "\n");
+    }
+
+    // print cosine similarity matrix
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
+        }
+        fprintf(stdout, "\n");
     }
-    fprintf(stderr, "\n");
 
     // clean up
     llama_print_timings(ctx);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 3d4b085d6..52fd719b3 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -6,22 +6,6 @@
 
 // #define GRIT_DEBUG
 
-static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
-    float dot = 0.0f;
-    for (uint64_t i = 0; i < v1.size(); ++i) {
-        dot += v1[i] * v2[i];
-    }
-    return dot;
-}
-
-static float norm(const std::vector<float> & v) {
-    return std::sqrt(dot_product(v, v));
-}
-
-static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
-    return dot_product(v1, v2) / (norm(v1) * norm(v2));
-}
-
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     std::vector<std::vector<float>> result;
 
@@ -203,10 +187,12 @@ int main(int argc, char * argv[]) {
         const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
         const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));
 
-        const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-        const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-        const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-        const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+        const int n_embd = llama_n_embd(mdl);
+
+        const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
 
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);

From 381da2d9f0940d7009e3e918bed36338c8ff2fbb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 11:55:23 +0200
Subject: [PATCH 02/31] metal : build metallib + fix embed path (#6015)

* metal : build metallib + fix embed path

ggml-ci

* metal : fix embed build + update library load logic

ggml-ci

* metal : fix embeded library build

ggml-ci

* ci : fix iOS builds to use embedded library
---
 .github/workflows/build.yml |  2 ++
 .gitignore                  |  2 ++
 CMakeLists.txt              | 70 ++++++++++++++++++++-----------------
 Makefile                    | 15 ++++----
 ggml-metal.m                | 49 +++++++++++++++++---------
 ggml-metal.metal            |  3 --
 6 files changed, 83 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d39cd6bc3..0da01d5ba 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -333,6 +333,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
@@ -361,6 +362,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
             -DLLAMA_BUILD_SERVER=OFF \
diff --git a/.gitignore b/.gitignore
index d28f4d1b8..1ad8d929b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,8 @@
 .vscode/
 .idea/
 
+ggml-metal-embed.metal
+
 lcov-report/
 gcovr-report/
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a8abf4088..3ac2804a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -200,9 +200,6 @@ if (LLAMA_METAL)
         add_compile_definitions(GGML_METAL_NDEBUG)
     endif()
 
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
     # copy ggml-common.h and ggml-metal.metal to bin directory
     configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
@@ -211,53 +208,62 @@ if (LLAMA_METAL)
         enable_language(ASM)
         add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
 
+        set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
         set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+
         file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
+
+        # merge ggml-common.h and ggml-metal.metal into a single file
+        set(METALLIB_EMBED_ASM    "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+        set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
 
         add_custom_command(
-            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-            DEPENDS ${METALLIB_SOURCE}
+            OUTPUT ${METALLIB_EMBED_ASM}
+            COMMAND echo "Embedding Metal library"
+            COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+            COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+            COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
+            COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
+            DEPENDS ggml-metal.metal ggml-common.h
             COMMENT "Generate assembly for embedded Metal library"
         )
 
-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-    endif()
-
-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
+    else()
+        if (LLAMA_METAL_SHADER_DEBUG)
+            # custom command to do the following:
+            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
+            #
+            # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+            #       disabling fast math is needed in order to pass tests/test-backend-ops
+            # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+            # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+            #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+            set(XC_FLAGS -fno-fast-math -fno-inline -g)
+        else()
+            set(XC_FLAGS -O3)
         endif()
 
         add_custom_command(
             OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
             COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
             COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+            DEPENDS ggml-metal.metal ggml-common.h
             COMMENT "Compiling Metal kernels"
-        )
+            )
 
         add_custom_target(
             ggml-metal ALL
             DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        )
-    endif()
+            )
+    endif() # LLAMA_METAL_EMBED_LIBRARY
 
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
         ${FOUNDATION_LIBRARY}
diff --git a/Makefile b/Makefile
index db9968efb..cb597b209 100644
--- a/Makefile
+++ b/Makefile
@@ -557,15 +557,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
+ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@echo ".section __DATA, __ggml_metallib"   >  $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start"        >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:"              >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end"          >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:"                >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
diff --git a/ggml-metal.m b/ggml-metal.m
index 3a5476c52..c3451a79b 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     id<MTLLibrary> metal_library;
 
     // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
     {
         NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #else
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
+
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
+
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (try_metallib && path_lib != nil) {
             // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
+
             metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             extern const char ggml_metallib_start[];
             extern const char ggml_metallib_end[];
 
-            NSString * src  = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
+            NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
 
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
+            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
 
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"];
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
             } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+                path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
             }
-            if (sourcePath == nil) {
+
+            if (path_source == nil) {
                 GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal.metal";
+                path_source = @"ggml-metal.metal";
             }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY
 
             @autoreleasepool {
                 // dictionary of preprocessor macros
diff --git a/ggml-metal.metal b/ggml-metal.metal
index ebf2f5b47..63de56325 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -4,9 +4,6 @@
 
 #include <metal_stdlib>
 
-#define GGML_COMMON_IMPL_METAL
-#include "ggml-common.h"
-
 using namespace metal;
 
 #define MAX(x, y) ((x) > (y) ? (x) : (y))

From 68265ebfc6a1bed022973ea0c3145be1450b7e70 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 12:37:20 +0200
Subject: [PATCH 03/31] embedding : print all resulting embeddings (#899)

---
 examples/embedding/embedding.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index f390c4061..895469a31 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -167,9 +167,9 @@ int main(int argc, char ** argv) {
     float * out = emb + p * n_embd;
     batch_decode(ctx, batch, out, s, n_embd);
 
-    // print first 3 embeddings
+    // print the first part of the embeddings
     fprintf(stdout, "\n");
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
+    for (int j = 0; j < n_prompts; j++) {
         fprintf(stdout, "embedding %d: ", j);
         for (int i = 0; i < std::min(16, n_embd); i++) {
             fprintf(stdout, "%f ", emb[j * n_embd + i]);

From 3fe8d7a17f84bd721cd4d8db35365da44b69f68b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 12:38:37 +0200
Subject: [PATCH 04/31] ggml : designate enum vals for integer types (#6050)

---
 ggml.h | 64 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/ggml.h b/ggml.h
index 1171088a9..ab26c8f59 100644
--- a/ggml.h
+++ b/ggml.h
@@ -337,24 +337,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
+        GGML_TYPE_F32     = 0,
+        GGML_TYPE_F16     = 1,
+        GGML_TYPE_Q4_0    = 2,
+        GGML_TYPE_Q4_1    = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
+        // GGML_TYPE_Q4_3 = 5, support has been removed
+        GGML_TYPE_Q5_0    = 6,
+        GGML_TYPE_Q5_1    = 7,
+        GGML_TYPE_Q8_0    = 8,
+        GGML_TYPE_Q8_1    = 9,
+        GGML_TYPE_Q2_K    = 10,
+        GGML_TYPE_Q3_K    = 11,
+        GGML_TYPE_Q4_K    = 12,
+        GGML_TYPE_Q5_K    = 13,
+        GGML_TYPE_Q6_K    = 14,
+        GGML_TYPE_Q8_K    = 15,
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
@@ -363,9 +363,9 @@ extern "C" {
         GGML_TYPE_IQ3_S   = 21,
         GGML_TYPE_IQ2_S   = 22,
         GGML_TYPE_IQ4_XS  = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8      = 24,
+        GGML_TYPE_I16     = 25,
+        GGML_TYPE_I32     = 26,
         GGML_TYPE_COUNT,
     };
 
@@ -383,20 +383,20 @@ extern "C" {
 
     // model file types
     enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_UNKNOWN        = -1,
+        GGML_FTYPE_ALL_F32        = 0,
+        GGML_FTYPE_MOSTLY_F16     = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0    = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1    = 3,  // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0    = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0    = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1    = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K    = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K    = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K    = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K    = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K    = 14, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors

From 3ca23481dd309bd51cc31c73a4cc34f922cc372f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= <ondrej@certik.us>
Date: Thu, 14 Mar 2024 04:40:14 -0600
Subject: [PATCH 05/31] gguf-py : add support for I8, I16 and I32 (#6045)

* Refactor dtype handling to be extensible

This code is equivalent as before, but now it is prepared to easily add
more NumPy dtypes.

* Add support for I8, I16 and I32

These types are allowed in the GGUF specification.

* Add support for I8, I16 and I32 to gguf_writer

* Add support for I8, I16, I32 to gguf_reader
---
 gguf-py/gguf/constants.py   |  6 ++++++
 gguf-py/gguf/gguf_reader.py |  9 +++++++++
 gguf-py/gguf/gguf_writer.py | 16 ++++++++++++----
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b23badb10..99f71f0a1 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -661,6 +661,9 @@ class GGMLQuantizationType(IntEnum):
     IQ3_S   = 21
     IQ2_S   = 22
     IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
 
 
 class GGUFEndian(IntEnum):
@@ -727,6 +730,9 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
 }
 
 
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 5b6d4ba6b..1c10f5753 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -248,6 +248,15 @@ class GGUFReader:
             elif ggml_type == GGMLQuantizationType.F16:
                 item_count = n_elems
                 item_type = np.float16
+            elif ggml_type == GGMLQuantizationType.I8:
+                item_count = n_elems
+                item_type = np.int8
+            elif ggml_type == GGMLQuantizationType.I16:
+                item_count = n_elems
+                item_type = np.int16
+            elif ggml_type == GGMLQuantizationType.I32:
+                item_count = n_elems
+                item_type = np.int32
             else:
                 item_count = n_bytes
                 item_type = np.uint8
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index e49c5db68..9c1eeac31 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -196,9 +196,6 @@ class GGUFWriter:
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
-
         encoded_name = name.encode("utf8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
@@ -207,7 +204,18 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_shape == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            else:
+                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)

From 2c4fb69246834503db7b78bcbedcef506bbc60c4 Mon Sep 17 00:00:00 2001
From: Michael Podvitskiy <podvitskiymichael@gmail.com>
Date: Thu, 14 Mar 2024 11:56:48 +0100
Subject: [PATCH 06/31] llama : optimize defrag moves + fix fragmentation
 calculation (#6037)

* attempt to reduce the impact of a worst-case scenario

* fragmentation calculation fix

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 llama.cpp | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 38e7036a7..ff467c575 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9036,8 +9036,8 @@ static int llama_decode_internal(
     //llama_synchronize(&lctx);
 
     // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
 
         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
@@ -9069,6 +9069,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;
 
+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
@@ -9095,15 +9100,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }
 
-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;
 
@@ -9133,11 +9129,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
@@ -9164,6 +9168,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;

From a44bc969e4cd62ca9f4332e17fe3c51f2093e7c6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 13:13:06 +0200
Subject: [PATCH 07/31] llama : fix typo

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index ff467c575..eb48b1e90 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3932,7 +3932,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: causal attm      = %d\n",     __func__, hparams.causal_attn);
+    LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);

From 43241adf22e8231ffaf3827d2c9310cc0ffd5ac5 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Date: Thu, 14 Mar 2024 12:15:39 +0100
Subject: [PATCH 08/31] server: disable debug release type sanitizer, simplify
 trigger (#6047)

- increase time out for server
 - do not fail fast
---
 .github/workflows/server.yml                  | 17 ++++++++---------
 examples/server/tests/features/steps/steps.py |  9 ++++++++-
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index e385e03f3..5e38b3547 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -25,17 +25,14 @@ jobs:
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
+          - build_type: Debug
             sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
+            disabled_on_pr: true
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
     container:
       image: ubuntu:latest
@@ -81,13 +78,14 @@ jobs:
 
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
@@ -124,13 +122,14 @@ jobs:
 
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           behave.exe --stop --no-skipped --no-capture --tags slow
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index cfa9f96ec..a59a52d21 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -119,6 +119,10 @@ def step_server_metrics(context):
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
     while True:
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
             result = sock.connect_ex((context.server_fqdn, context.server_port))
@@ -126,7 +130,7 @@ def step_start_server(context):
                 print("\x1b[33;46mserver started!\x1b[0m")
                 return
             attempts += 1
-            if attempts > 20:
+            if attempts > max_attempts:
                 assert False, "server not started"
             print(f"waiting for server to start, connect error code = {result}...")
             time.sleep(0.1)
@@ -943,6 +947,9 @@ async def wait_for_health_status(context,
         print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
     interval = 0.5
     counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
     async with aiohttp.ClientSession() as session:
         while True:
             async with await session.get(f'{base_url}/health', params=params) as health_response:

From 15a333260ab637a040ed0864c206a2ceaf806bb8 Mon Sep 17 00:00:00 2001
From: Jian Liao <jianliao@users.noreply.github.com>
Date: Thu, 14 Mar 2024 04:18:23 -0700
Subject: [PATCH 09/31] readme : improve readme for Llava-1.6 example (#6044)

Co-authored-by: Jian Liao <jianliao@adobe.com>
---
 examples/llava/README.md | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/examples/llava/README.md b/examples/llava/README.md
index 35e6d9e5d..67cb0f22b 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```
 
-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
 
-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
 python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```

From 77178eedc83d49f31bf757d8e12315d76460be78 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 13:32:14 +0200
Subject: [PATCH 10/31] gguf-py : fix dtype check (#6045)

---
 gguf-py/gguf/gguf_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 9c1eeac31..4d389be95 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -204,7 +204,7 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            if tensor_shape == np.float32:
+            if tensor_dtype == np.float32:
                 dtype = GGMLQuantizationType.F32
             elif tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16

From 044ec4b2a567f649459ccd20af2f387c784faa51 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 15:14:14 +0200
Subject: [PATCH 11/31] embedding : add EOS token if not present (#899)

---
 examples/embedding/embedding.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 895469a31..cbf9aa2b5 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
             inp.resize(n_batch);
         }
         inputs.push_back(inp);
     }
 
+    // add eos if not present
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
@@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
     for (int j = 0; j < n_prompts; j++) {
         fprintf(stdout, "embedding %d: ", j);
         for (int i = 0; i < std::min(16, n_embd); i++) {
-            fprintf(stdout, "%f ", emb[j * n_embd + i]);
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
         }
         fprintf(stdout, "\n");
     }

From 69ff61397d2b7b550dcdda4a35b35128892408b0 Mon Sep 17 00:00:00 2001
From: Michael Podvitskiy <podvitskiymichael@gmail.com>
Date: Thu, 14 Mar 2024 17:21:56 +0100
Subject: [PATCH 12/31] llama : support models without vocabulary (#5798)

* additional methods to read model and ctx parameters

* vocab size as a part of a model metadata

* models without vocabulary, convert.py part

* models without vocabulary, llama.cpp part

* PR clean up

* converter scrypt fixes

* llama_vocab_type update (renamed the new key)

* pr review fixes

* revert function renaming

* one more NoVocab assert
---
 convert.py                  | 126 +++++++++++++++++++++---------------
 gguf-py/gguf/constants.py   |   2 +
 gguf-py/gguf/gguf_writer.py |   3 +
 llama.cpp                   |  92 +++++++++++++++++---------
 llama.h                     |   7 +-
 5 files changed, 142 insertions(+), 88 deletions(-)

diff --git a/convert.py b/convert.py
index c15f8c47e..161430f3e 100755
--- a/convert.py
+++ b/convert.py
@@ -332,6 +332,9 @@ class Params:
 #
 
 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ class BpeVocab:
 
 
 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ class SentencePieceVocab:
 
 
 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ class HfVocab:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
 
 
 #
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
@@ -977,6 +996,7 @@ class OutputFile:
             name = str(params.path_model.parent).split('/')[-1]
 
         self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
@@ -1013,21 +1033,9 @@ class OutputFile:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ class OutputFile:
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,6 +1079,26 @@ class OutputFile:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()
 
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
 
@@ -1082,7 +1107,7 @@ class OutputFile:
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1120,8 +1145,11 @@ class OutputFile:
 
         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
 
         # tensor info
         for name, lazy_tensor in model.items():
@@ -1131,24 +1159,7 @@ class OutputFile:
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)
 
         of.close()
 
@@ -1309,8 +1320,8 @@ class VocabFactory:
                 return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1319,30 +1330,34 @@ class VocabFactory:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
+    parser.add_argument("--no-vocab",     action="store_true",    help="store model without the vocab")
     parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
 
     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"
 
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return
 
-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
 
     print(f"Vocab info: {vocab}")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 99f71f0a1..2d7cf16c1 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -32,6 +32,7 @@ class Keys:
         FILE_TYPE            = "general.file_type"
 
     class LLM:
+        VOCAB_SIZE            = "{arch}.vocab_size"
         CONTEXT_LENGTH        = "{arch}.context_length"
         EMBEDDING_LENGTH      = "{arch}.embedding_length"
         BLOCK_COUNT           = "{arch}.block_count"
@@ -752,6 +753,7 @@ KEY_GENERAL_SOURCE_HF_REPO       = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE            = Keys.General.FILE_TYPE
 
 # LLM
+KEY_VOCAB_SIZE            = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH        = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH      = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT           = Keys.LLM.BLOCK_COUNT
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 4d389be95..81b2eb884 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -321,6 +321,9 @@ class GGUFWriter:
         self.data_alignment = alignment
         self.add_uint32(Keys.General.ALIGNMENT, alignment)
 
+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
 
diff --git a/llama.cpp b/llama.cpp
index eb48b1e90..10fd53469 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -258,6 +258,7 @@ enum llm_kv {
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,
 
+    LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
@@ -321,6 +322,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,            "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO,        "general.source.huggingface.repository" },
 
+    { LLM_KV_VOCAB_SIZE,                    "%s.vocab_size"            },
     { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"        },
     { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"      },
     { LLM_KV_BLOCK_COUNT,                   "%s.block_count"           },
@@ -3242,10 +3244,11 @@ static const char * llama_model_type_name(e_model type) {
 
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default:                   return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        default:                    return "unknown";
     }
 }
 
@@ -3277,14 +3280,14 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,           hparams.n_vocab,       false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -3645,30 +3648,25 @@ static void llm_load_vocab(
 
     const auto kv = LLM_KV(model.arch);
 
-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
     // determine vocab type
     {
         std::string tokenizer_name;
 
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
-        if (tokenizer_name == "llama") {
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id    = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -3734,6 +3732,23 @@ static void llm_load_vocab(
         }
     }
 
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
     const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
 
     vocab.id_to_token.resize(n_vocab);
@@ -5023,7 +5038,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
         llm_load_print_meta(ml, model);
 
-        if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+        if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
+            model.hparams.n_vocab != model.vocab.id_to_token.size()) {
             throw std::runtime_error("vocab size mismatch");
         }
 
@@ -9361,26 +9377,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }
 
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
@@ -9401,6 +9423,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
@@ -10232,6 +10255,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
             } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
     }
 
     return output;
@@ -13138,7 +13163,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 }
 
 int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
+    return model->hparams.n_vocab;
 }
 
 int32_t llama_n_ctx_train(const struct llama_model * model) {
@@ -13962,14 +13987,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
 }
 
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].text.c_str();
 }
 
 float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].score;
 }
 
 llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].type;
 }
 
diff --git a/llama.h b/llama.h
index 2d16cc9b9..90aa5372e 100644
--- a/llama.h
+++ b/llama.h
@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope

From 727107707a73b3dc8a497cf9fc9405722c16dd2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= <ondrej@certik.us>
Date: Thu, 14 Mar 2024 11:57:31 -0600
Subject: [PATCH 13/31] gguf-py : bump version to 0.8.0 (#6060)

---
 gguf-py/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 9789c2c87..96396e04e 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.7.0"
+version = "0.8.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [

From 6e0438da3cc95b89cdbf55f45fa4e324d9076792 Mon Sep 17 00:00:00 2001
From: Steve Grubb <ausearch.1@gmail.com>
Date: Thu, 14 Mar 2024 14:29:32 -0400
Subject: [PATCH 14/31] gguf : fix resource leaks (#6061)

There several places where a gguf context is allocated. A call to gguf_free
is missing in some error paths. Also on linux, llama-bench was missing a
fclose.
---
 examples/gguf/gguf.cpp                                       | 1 +
 examples/llama-bench/llama-bench.cpp                         | 1 +
 examples/llava/clip.cpp                                      | 4 ++++
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 1 +
 4 files changed, 7 insertions(+)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index e67be4fb2..5444503a5 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
                 for (int j = 0; j < ggml_nelements(cur); ++j) {
                     if (data[j] != 100 + i) {
                         fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        gguf_free(ctx);
                         return false;
                     }
                 }
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index bf94e7e7a..d6e5e0497 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -103,6 +103,7 @@ static std::string get_cpu_info() {
                 }
             }
         }
+        fclose(f);
     }
 #endif
     // TODO: other platforms
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 6653b815d..2035554ea 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         if (!new_clip->ctx_data) {
             fprintf(stderr, "%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
 
@@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         if (!fin) {
             printf("cannot open model file for loading tensors\n");
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
 
@@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             if (!fin) {
                 printf("%s: failed to seek for tensor %s\n", __func__, name);
                 clip_free(new_clip);
+                gguf_free(ctx);
                 return nullptr;
             }
             int num_bytes = ggml_nbytes(cur);
@@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 break;
             default:
                 printf("Please use an input file in f32 or f16\n");
+                gguf_free(ctx_out);
                 return false;
             }
 
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 7eafe8515..7d06e401b 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model *
 
     load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
 
+    gguf_free(fctx);
     return true;
 }
 

From 4755afd1cbd40d93c017e5b98c39796f52345314 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 Mar 2024 22:58:41 +0200
Subject: [PATCH 15/31] llama : fix integer overflow during quantization
 (#6063)

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 10fd53469..2c3841974 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11977,7 +11977,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
     int counter = 0;
     size_t new_size = 0;

From b0bc9f4a9da7c19f4779106ea83b23feca747566 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 15 Mar 2024 09:22:24 +0100
Subject: [PATCH 16/31] llama-bench : use random tokens to improve accuracy
 with mixtral (#6069)

---
 examples/llama-bench/llama-bench.cpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index d6e5e0497..32eea7869 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -8,6 +8,7 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cstdlib>
 #include <iterator>
 #include <map>
 #include <numeric>
@@ -1123,15 +1124,19 @@ struct sql_printer : public printer {
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    //std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
-    //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
-    //GGML_UNUSED(n_batch);
+    const llama_model * model = llama_get_model(ctx);
+    const int32_t n_vocab = llama_n_vocab(model);
+
+    std::vector<llama_token> tokens(n_batch);
 
-    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
     int n_processed = 0;
 
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+        for (int i = 1; i < n_tokens; i++) {
+            tokens[i] = std::rand() % n_vocab;
+        }
         llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
         n_processed += n_tokens;
     }
@@ -1142,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    llama_token token = llama_token_bos(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const int32_t n_vocab = llama_n_vocab(model);
+
+    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
 
     for (int i = 0; i < n_gen; i++) {
         llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
         llama_synchronize(ctx);
+        token = std::rand() % n_vocab;
     }
 }
 

From aab606a11fc0a9740a7f297521c3eef851dfb351 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Fri, 15 Mar 2024 09:44:57 +0100
Subject: [PATCH 17/31] llama : add Orion chat template (#6066)

---
 llama.cpp                    | 20 ++++++++++++++++++++
 tests/test-chat-template.cpp |  4 ++++
 2 files changed, 24 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 2c3841974..b8a8d2723 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -14242,6 +14242,26 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<start_of_turn>model\n";
         }
+    } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+        // OrionStarAI/Orion-14B-Chat
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // there is no system message support, we will merge it with user prompt
+                system_prompt = message->content;
+                continue;
+            } else if (role == "user") {
+                ss << "Human: ";
+                if (!system_prompt.empty()) {
+                    ss << system_prompt << "\n\n";
+                    system_prompt = "";
+                }
+                ss << message->content << "\n\nAssistant: </s>";
+            } else {
+                ss << message->content << "</s>";
+            }
+        }
     } else {
         // template not supported
         return -1;
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index fa2eb577b..6e9e4bd1e 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -31,6 +31,8 @@ int main(void) {
         "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
         // google/gemma-7b-it
         "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+        // OrionStarAI/Orion-14B-Chat
+        "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -45,6 +47,8 @@ int main(void) {
         "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
         // google/gemma-7b-it
         "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+        // OrionStarAI/Orion-14B-Chat
+        "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;

From 7ce2c77f88e1ca66ec48417e56f91746bac018c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20=C4=8Cert=C3=ADk?= <ondrej@certik.us>
Date: Fri, 15 Mar 2024 02:46:51 -0600
Subject: [PATCH 18/31] gguf : add support for I64 and F64 arrays (#6062)

* gguf : add support for I64 and F64 arrays

GGML currently does not support I64 or F64 arrays and they are not often
used in machine learning, however if in the future the need arises, it
would be nice to add them now, so that the types are next to the other
types I8, I16, I32 in the enums, and it also reserves their type number.

Furthermore, with this addition the GGUF format becomes very usable for
most computational applications of NumPy (being compatible with the most
common NumPy dtypes: i8, i16, i32, i64, f32, f64), providing a faster,
and more versatile alternative to the `npz` format, and a simpler
alternative to the `hdf5` format.

The change in this PR seems small, not significantly increasing the
maintenance burden. I tested this from Python using GGUFWriter/Reader
and `gguf-dump`, as well as from C, everything seems to work.

* Fix compiler warnings
---
 ggml.c                      | 17 +++++++++++++++++
 ggml.h                      |  2 ++
 gguf-py/gguf/constants.py   |  4 ++++
 gguf-py/gguf/gguf_reader.py | 14 ++++++++++----
 gguf-py/gguf/gguf_writer.py | 12 ++++++++----
 5 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/ggml.c b/ggml.c
index fbc66f65b..c94006e51 100644
--- a/ggml.c
+++ b/ggml.c
@@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size                = sizeof(int32_t),
         .is_quantized             = false,
     },
+    [GGML_TYPE_I64] = {
+        .type_name                = "i64",
+        .blck_size                = 1,
+        .type_size                = sizeof(int64_t),
+        .is_quantized             = false,
+    },
+    [GGML_TYPE_F64] = {
+        .type_name                = "f64",
+        .blck_size                = 1,
+        .type_size                = sizeof(double),
+        .is_quantized             = false,
+        .nrows                    = 1,
+    },
     [GGML_TYPE_F32] = {
         .type_name                = "f32",
         .blck_size                = 1,
@@ -12418,6 +12431,8 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);
@@ -12504,6 +12519,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);
diff --git a/ggml.h b/ggml.h
index ab26c8f59..c937d4a53 100644
--- a/ggml.h
+++ b/ggml.h
@@ -366,6 +366,8 @@ extern "C" {
         GGML_TYPE_I8      = 24,
         GGML_TYPE_I16     = 25,
         GGML_TYPE_I32     = 26,
+        GGML_TYPE_I64     = 27,
+        GGML_TYPE_F64     = 28,
         GGML_TYPE_COUNT,
     };
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 2d7cf16c1..458a641dc 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -665,6 +665,8 @@ class GGMLQuantizationType(IntEnum):
     I8      = 24
     I16     = 25
     I32     = 26
+    I64     = 27
+    F64     = 28
 
 
 class GGUFEndian(IntEnum):
@@ -734,6 +736,8 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.I8:      (1, 1),
     GGMLQuantizationType.I16:     (1, 2),
     GGMLQuantizationType.I32:     (1, 4),
+    GGMLQuantizationType.I64:     (1, 8),
+    GGMLQuantizationType.F64:     (1, 8),
 }
 
 
diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py
index 1c10f5753..33afac552 100644
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -242,12 +242,15 @@ class GGUFReader:
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
             item_type: npt.DTypeLike
-            if ggml_type == GGMLQuantizationType.F32:
-                item_count = n_elems
-                item_type = np.float32
-            elif ggml_type == GGMLQuantizationType.F16:
+            if ggml_type == GGMLQuantizationType.F16:
                 item_count = n_elems
                 item_type = np.float16
+            elif ggml_type == GGMLQuantizationType.F32:
+                item_count = n_elems
+                item_type = np.float32
+            elif ggml_type == GGMLQuantizationType.F64:
+                item_count = n_elems
+                item_type = np.float64
             elif ggml_type == GGMLQuantizationType.I8:
                 item_count = n_elems
                 item_type = np.int8
@@ -257,6 +260,9 @@ class GGUFReader:
             elif ggml_type == GGMLQuantizationType.I32:
                 item_count = n_elems
                 item_type = np.int32
+            elif ggml_type == GGMLQuantizationType.I64:
+                item_count = n_elems
+                item_type = np.int64
             else:
                 item_count = n_bytes
                 item_type = np.uint8
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 81b2eb884..1967b633c 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -204,18 +204,22 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            if tensor_dtype == np.float32:
-                dtype = GGMLQuantizationType.F32
-            elif tensor_dtype == np.float16:
+            if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float64:
+                dtype = GGMLQuantizationType.F64
             elif tensor_dtype == np.int8:
                 dtype = GGMLQuantizationType.I8
             elif tensor_dtype == np.int16:
                 dtype = GGMLQuantizationType.I16
             elif tensor_dtype == np.int32:
                 dtype = GGMLQuantizationType.I32
+            elif tensor_dtype == np.int64:
+                dtype = GGMLQuantizationType.I64
             else:
-                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
+                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)

From 753e36f650fa2a5869f89188d9ee745dc74cf14b Mon Sep 17 00:00:00 2001
From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
Date: Fri, 15 Mar 2024 09:26:20 +0000
Subject: [PATCH 19/31] [SYCL] Fix non-intel device selection (#6042)

* Fix non-intel device selection

* Update ggml-sycl.cpp

Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>

* Update ggml-sycl.cpp

Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Co-authored-by: Neo Zhang Jianyu <jianyu.zhang@intel.com>
---
 ggml-sycl.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 9f6506383..a1ca6aba5 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -3451,7 +3451,7 @@ class sycl_gpu_mgr {
                 dpct::device_info prop;
                 dpct::get_device_info(prop, device);
                 if (max_compute_units == prop.get_max_compute_units() &&
-                    prop.get_major_version() == 1) {
+                    is_ext_oneapi_device(device)) {
                     gpus.push_back(id);
                     devices.push_back(device);
                     work_group_size = prop.get_max_work_group_size();
@@ -3484,6 +3484,15 @@ class sycl_gpu_mgr {
             assert(false);
             return -1;
         }
+
+        bool is_ext_oneapi_device(const sycl::device &dev) {
+            sycl::backend dev_backend = dev.get_backend();
+            if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
+                dev_backend == sycl::backend::ext_oneapi_cuda ||
+                dev_backend == sycl::backend::ext_oneapi_hip)
+                return true;
+            return false;
+        }
 };
 
 static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL;

From 131b0584096ee9df4d07cb28759dfea6efe6475f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 15 Mar 2024 11:36:50 +0200
Subject: [PATCH 20/31] make : ggml-metal.o depends on ggml.h

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index cb597b209..c0f125036 100644
--- a/Makefile
+++ b/Makefile
@@ -553,7 +553,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY

From 46acb3676718b983157058aecf729a2064fc7d34 Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Fri, 15 Mar 2024 18:53:53 +0800
Subject: [PATCH 21/31] fix set main gpu error (#6073)

---
 examples/sycl/build.sh      |   5 +-
 examples/sycl/run-llama2.sh |  16 +-
 ggml-sycl.cpp               | 332 ++++++++++++++++++++++++++----------
 ggml-sycl.h                 |   5 +
 llama.cpp                   |  23 ++-
 5 files changed, 282 insertions(+), 99 deletions(-)

diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
index 26ad2f7da..f20391d7a 100755
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
 #for FP32
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
-#build example/main only
+#build example/main
 #cmake --build . --config Release --target main
 
+#build example/llama-bench
+#cmake --build . --config Release --target llama-bench
+
 #build all binary
 cmake --build . --config Release -v
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index 52f7c01a4..c979a52f6 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh
 
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
+    GGML_SYCL_SINGLE_GPU=1
 else
     GGML_SYCL_DEVICE=0
 fi
-echo "use $GGML_SYCL_DEVICE as main GPU"
+
 #export GGML_SYCL_DEBUG=1
 
 
 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
 
-#use all GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+    echo "use $GGML_SYCL_DEVICE as main GPU"
+    #use signle GPU only
+    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+fi
 
 #use main GPU only
 #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 
+#use multiple GPUs with same max compute units
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index a1ca6aba5..6dc5eb20c 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <float.h>
 #include <limits>
 #include <stdint.h>
@@ -24,10 +25,9 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>
-
 #include <stdio.h>
 #include <stdlib.h>
-
+#include <regex>
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
 #define __dpct_noinline__ __attribute__((noinline))
 #endif
 
+
+std::string get_device_type_name(const sycl::device &Device) {
+    auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+    switch (DeviceType) {
+    case sycl::info::device_type::cpu:
+        return "cpu";
+    case sycl::info::device_type::gpu:
+        return "gpu";
+    case sycl::info::device_type::host:
+        return "host";
+    case sycl::info::device_type::accelerator:
+        return "acc";
+    default:
+        return "unknown";
+    }
+}
+
+std::string get_device_backend_and_type(const sycl::device &device) {
+    std::stringstream device_type;
+    sycl::backend backend = device.get_backend();
+    device_type <<  backend << ":" << get_device_type_name(device);
+    return device_type.str();
+}
+
 namespace dpct
 {
     typedef sycl::queue *queue_ptr;
@@ -942,17 +966,65 @@ namespace dpct
 
     private:
         mutable std::recursive_mutex m_mutex;
+        static bool compare_dev(sycl::device &device1, sycl::device &device2)
+        {
+            dpct::device_info prop1;
+            dpct::get_device_info(prop1, device1);
+            dpct::device_info prop2;
+            dpct::get_device_info(prop2, device2);
+            return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+        }
+        static int convert_backend_index(std::string & backend) {
+            if (backend == "ext_oneapi_level_zero:gpu") return 0;
+            if (backend == "opencl:gpu") return 1;
+            if (backend == "opencl:cpu") return 2;
+            if (backend == "opencl:acc") return 3;
+            printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+            GGML_ASSERT(false);
+        }
+        static bool compare_backend(std::string &backend1, std::string &backend2) {
+            return convert_backend_index(backend1) < convert_backend_index(backend2);
+        }
         dev_mgr()
         {
             sycl::device default_device =
                 sycl::device(sycl::default_selector_v);
             _devs.push_back(std::make_shared<device_ext>(default_device));
 
-            std::vector<sycl::device> sycl_all_devs =
-                sycl::device::get_devices(sycl::info::device_type::all);
+            std::vector<sycl::device> sycl_all_devs;
             // Collect other devices except for the default device.
             if (default_device.is_cpu())
                 _cpu_device = 0;
+
+            auto Platforms = sycl::platform::get_platforms();
+            // Keep track of the number of devices per backend
+            std::map<sycl::backend, size_t> DeviceNums;
+            std::map<std::string, std::vector<sycl::device>> backend_devices;
+
+            while (!Platforms.empty()) {
+                auto Platform = Platforms.back();
+                Platforms.pop_back();
+                auto devices = Platform.get_devices();
+                std::string backend_type = get_device_backend_and_type(devices[0]);
+                for (const auto &device : devices) {
+                    backend_devices[backend_type].push_back(device);
+                }
+            }
+
+            std::vector<std::string> keys;
+            for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+                keys.push_back(it->first);
+            }
+            std::sort(keys.begin(), keys.end(), compare_backend);
+
+            for (auto &key : keys) {
+                std::vector<sycl::device> devs = backend_devices[key];
+                std::sort(devs.begin(), devs.end(), compare_dev);
+                for (const auto &dev : devs) {
+                    sycl_all_devs.push_back(dev);
+                }
+            }
+
             for (auto &dev : sycl_all_devs)
             {
                 if (dev == default_device)
@@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+enum ggml_sycl_backend_gpu_mode {
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
+};
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
@@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
         int work_group_size = 0;
         std::string gpus_list = "";
 
+        /*
+        Use all GPUs with same top max compute units
+        */
         sycl_gpu_mgr() {
             detect_sycl_gpu_list_with_max_cu();
             get_allow_gpus();
             create_context_with_gpus();
         }
 
+        /*
+        Only use the assigned GPU
+        */
+        sycl_gpu_mgr(int main_gpu_id) {
+            sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+            dpct::device_info prop;
+            dpct::get_device_info(prop, device);
+            gpus.push_back(main_gpu_id);
+            devices.push_back(device);
+            work_group_size = prop.get_max_work_group_size();
+            max_compute_units = prop.get_max_compute_units();
+
+            get_allow_gpus();
+            create_context_with_gpus();
+        }
+
         void create_context_with_gpus() {
             sycl::context ctx = sycl::context(devices);
             assert(gpus.size() > 0);
@@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
                 gpus_list += std::to_string(gpus[i]);
                 gpus_list += ",";
             }
-            if (gpus_list.length() > 2) {
+            if (gpus_list.length() > 1) {
                 gpus_list.pop_back();
             }
         }
@@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
                 if (gpus[i] == id)
                     return i;
             }
-            assert(false);
-            return -1;
+            printf("miss to get device index by id=%d\n", id);
+            GGML_ASSERT(false);
         }
 
         int get_next_index(int id) {
@@ -3481,8 +3577,7 @@ class sycl_gpu_mgr {
                 if (gpus[i] == id)
                     return i;
             }
-            assert(false);
-            return -1;
+            GGML_ASSERT(false);
         }
 
         bool is_ext_oneapi_device(const sycl::device &dev) {
@@ -3500,11 +3595,14 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
 static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
+
 struct sycl_device_capabilities {
     int     cc;                 // compute capability
     bool    vmm;                // virtual memory support
@@ -13008,17 +13106,20 @@ bool ggml_sycl_loaded(void) {
     return g_sycl_loaded;
 }
 
-void print_device_detail(int id) {
+void print_device_detail(int id, sycl::device &device, std::string device_type) {
+
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
-    sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+        dpct::get_device_info(prop, device)));
+
     std::string version;
     version += std::to_string(prop.get_major_version());
     version += ".";
     version += std::to_string(prop.get_minor_version());
 
-    fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
+    device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+
+    fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
             prop.get_name(), version.c_str(), prop.get_max_compute_units(),
             prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
             prop.get_global_mem_size());
@@ -13026,19 +13127,35 @@ void print_device_detail(int id) {
 
 void ggml_backend_sycl_print_sycl_devices() {
     int device_count = dpct::dev_mgr::instance().device_count();
+    std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "|ID| Name                                        |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
-    fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
+    fprintf(stderr, "|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |\n");
+    fprintf(stderr, "|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|\n");
+    fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
     for (int id = 0; id < device_count; ++id) {
-        print_device_detail(id);
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        sycl::backend backend = device.get_backend();
+        std::string backend_type = get_device_backend_and_type(device);
+        int type_id=DeviceNums[backend_type]++;
+        std::stringstream device_type;
+        device_type << "[" <<  backend_type << ":" << std::to_string(type_id) << "]";
+        print_device_detail(id, device, device_type.str());
     }
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
-            g_sycl_gpu_mgr->get_gpu_count(),
-            g_sycl_gpu_mgr->gpus_list.c_str(),
-            g_sycl_gpu_mgr->max_compute_units);
+    GGML_ASSERT(g_sycl_gpu_mgr);
+
+    char* hint=NULL;
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
+        hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
+    } else {
+        hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+    }
+    fprintf(stderr, hint,
+        g_sycl_gpu_mgr->get_gpu_count(),
+        g_sycl_gpu_mgr->gpus_list.c_str(),
+        g_sycl_gpu_mgr->max_compute_units);
 }
 
 int get_sycl_env(const char *env_name, int default_val) {
@@ -13074,23 +13191,6 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
-        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
-                            dpct::dev_mgr::instance().device_count()) != 0) {
-            initialized = true;
-            g_sycl_loaded = false;
-            return;
-        }
-        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-        ggml_backend_sycl_print_sycl_devices();
-
-        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-
-        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
-
-        print_gpu_device_list();
-
-        int64_t total_vram = 0;
 
 /* NOT REMOVE, keep it for next optimize for XMX.
 #if defined(SYCL_USE_XMX)
@@ -13099,49 +13199,15 @@ void ggml_init_sycl() try {
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
-        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-            g_device_caps[id].vmm = 0;
-            g_device_caps[id].device_id = -1;
-            g_device_caps[id].cc = 0;
-            g_tensor_split[id] = 0;
-            g_default_tensor_split[id] = 0;
+
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+                            dpct::dev_mgr::instance().device_count()) != 0) {
+            initialized = true;
+            g_sycl_loaded = false;
+            return;
         }
-
-        for (int i = 0; i < g_device_count; ++i) {
-            int device_id = g_sycl_gpu_mgr->gpus[i];
-            g_device_caps[i].vmm = 0;
-
-            dpct::device_info prop;
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(device_id))));
-
-            g_default_tensor_split[i] = total_vram;
-            total_vram += prop.get_global_mem_size();
-
-            g_device_caps[i].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-        }
-
-        for (int i = 0; i < g_device_count; ++i) {
-            g_default_tensor_split[i] /= total_vram;
-        }
-
-        for (int i = 0; i < g_device_count; ++i) {
-            SYCL_CHECK(ggml_sycl_set_device(i));
-
-            // create sycl streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_syclStreams[i][is] =
-                        dpct::get_current_device().create_queue(
-                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-            }
-
-            const dpct::queue_ptr stream = g_syclStreams[i][0];
-            // create sycl handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
-        }
-
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+        ggml_backend_sycl_print_sycl_devices();
         initialized = true;
         g_sycl_loaded = true;
     }
@@ -13152,6 +13218,63 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
+void ggml_init_by_gpus(int device_count) try {
+    g_device_count = device_count;
+    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
+
+    int64_t total_vram = 0;
+
+    print_gpu_device_list();
+
+    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+        g_device_caps[id].vmm = 0;
+        g_device_caps[id].device_id = -1;
+        g_device_caps[id].cc = 0;
+        g_tensor_split[id] = 0;
+        g_default_tensor_split[id] = 0;
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        int device_id = g_sycl_gpu_mgr->gpus[i];
+        g_device_caps[i].vmm = 0;
+
+        dpct::device_info prop;
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(device_id))));
+
+        g_default_tensor_split[i] = total_vram;
+        total_vram += prop.get_global_mem_size();
+
+        g_device_caps[i].cc =
+            100 * prop.get_major_version() + 10 * prop.get_minor_version();
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        g_default_tensor_split[i] /= total_vram;
+    }
+
+    for (int i = 0; i < g_device_count; ++i) {
+        SYCL_CHECK(ggml_sycl_set_device(i));
+
+        // create sycl streams
+        for (int is = 0; is < MAX_STREAMS; ++is) {
+            SYCL_CHECK(CHECK_TRY_ERROR(
+                g_syclStreams[i][is] =
+                    dpct::get_current_device().create_queue(
+                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+        }
+
+        const dpct::queue_ptr stream = g_syclStreams[i][0];
+        // create sycl handle
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
 void *ggml_sycl_host_malloc(size_t size) try {
     if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
         return nullptr;
@@ -16551,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host          = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+    if (device_index>=g_device_count or device_index<0) {
+        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+            device_index, g_device_count-1);
+        GGML_ASSERT(device_index<g_device_count);
+    }
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    static bool ggml_backend_sycl_buffer_type_initialized = false;
-
-    if (!ggml_backend_sycl_buffer_type_initialized) {
+    if (!g_ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        ggml_backend_sycl_buffer_type_initialized = true;
+        g_ggml_backend_sycl_buffer_type_initialized = true;
     }
-
-    return &ggml_backend_sycl_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device_index];
 }
 
 // sycl split buffer type
@@ -17319,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+    return g_sycl_gpu_mgr->gpus[device_index];
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+    fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+    g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+    if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
+        return;
+    }
+
+    fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
+
+    if (g_sycl_gpu_mgr) {
+        delete g_sycl_gpu_mgr;
+    }
+    g_sycl_gpu_mgr = new sycl_gpu_mgr();
+    g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
+    ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+    g_ggml_backend_sycl_buffer_type_initialized = false;
+}
+
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {
-    if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+    ggml_backend_sycl_set_mul_device_mode();
     assert(g_device_count>0);
     for (int i = 0; i < g_device_count; i++) {
         int id = g_sycl_gpu_mgr->gpus[i];
diff --git a/ggml-sycl.h b/ggml-sycl.h
index bf5b11b36..c549a64a1 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
 
+// TODO: these are temporary
+//       ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
 #ifdef  __cplusplus
 }
 #endif
diff --git a/llama.cpp b/llama.cpp
index b8a8d2723..8e185d4bf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5064,6 +5064,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         }
 #endif
 
+#ifdef GGML_USE_SYCL
+        if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+            ggml_backend_sycl_set_single_device_mode(params.main_gpu);
+            //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
+            params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
+        } else {
+            ggml_backend_sycl_set_mul_device_mode();
+        }
+#endif
+
         if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
@@ -12921,23 +12931,22 @@ struct llama_context * llama_new_context_with_model(
         if (model->n_gpu_layers > 0) {
             // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
             if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
-                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
+                    int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
             } else {
                 // LLAMA_SPLIT_LAYER requires a backend for each GPU
-                int id_list[GGML_SYCL_MAX_DEVICES];
-                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
                 for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
-                    int device_id = id_list[i];
                     ggml_backend_t backend = ggml_backend_sycl_init(i);
                     if (backend == nullptr) {
-                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
+                        int id_list[GGML_SYCL_MAX_DEVICES];
+                        ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
                         llama_free(ctx);
                         return nullptr;
                     }

From 3020327f6cd6d2ce50528dd65f4b199d2ea8b1ae Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 15 Mar 2024 13:24:03 +0100
Subject: [PATCH 22/31] cuda : disable unused cudaLaunchHostFunc code (#6078)

---
 ggml-cuda.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index d1b5e52ba..db595409a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -11541,6 +11541,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
     if (ggml_backend_is_cuda(event->backend)) {
         CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
     } else {
+#if 0
         // untested
         auto wait_fn = [](void * user_data) {
             ggml_backend_event_t event = (ggml_backend_event_t)user_data;
@@ -11548,6 +11549,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
         };
 
         CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+#endif
+        GGML_ASSERT(false);
     }
 }
 

From 4e9a7f7f7fb6acbddd1462909c8d696e38edbfcc Mon Sep 17 00:00:00 2001
From: Ting Lou <ting.lou@gmail.com>
Date: Fri, 15 Mar 2024 22:31:05 +0800
Subject: [PATCH 23/31] llava : change API to pure C style for Rust FFI bindgen
 (#6079)

Co-authored-by: Lou Ting <louting.t@alibaba-inc.com>
---
 examples/llava/clip.cpp  | 36 ++++++++++++++++++------------------
 examples/llava/clip.h    |  6 +++---
 examples/llava/llava.cpp |  2 +-
 examples/llava/llava.h   |  4 ++--
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 2035554ea..a0ed82d7e 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() {
 
 void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
 
@@ -1497,7 +1497,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
@@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         pad_to_square = false;
     }
     // free the previous res_imgs if any set
-    if (res_imgs.size > 0) {
+    if (res_imgs->size > 0) {
         clip_image_f32_batch_free(res_imgs);
     }
-    res_imgs.data = nullptr;
-    res_imgs.size = 0;
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
             patches.insert(patches.begin(), image_original_resize);
             // clip_image_f32_batch_init(patches.size());
-            res_imgs.size = patches.size();
-            res_imgs.data = new clip_image_f32[res_imgs.size];
+            res_imgs->size = patches.size();
+            res_imgs->data = new clip_image_f32[res_imgs->size];
             int num=0;
             for (auto& patch : patches) {
-                normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+                normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
                 num++;
             }
 
@@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     // }
     // res_imgs.push_back(res);
 
-    res_imgs.size = 1;
-    res_imgs.data = new clip_image_f32[res_imgs.size];
-    res_imgs.data[0] = *res;
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
     clip_image_f32_free(res);
 
     return true;
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index e5bd54924..45bdad689 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  & batch);
-CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
+CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
@@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
 /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 980128166..29764757a 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     clip_image_f32_batch img_res_v;
     img_res_v.size = 0;
     img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index 2d40f3f1d..19212f6e9 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -29,9 +29,9 @@ struct llava_image_embed {
 };
 
 /** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
 
-LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);

From 12247f4c69a173b9482f68aaa174ec37fc909ccf Mon Sep 17 00:00:00 2001
From: Andrew Canis <andrew.canis@gmail.com>
Date: Fri, 15 Mar 2024 16:41:22 -0400
Subject: [PATCH 24/31] llama : add Command-R support (#6033)

Information about the Command-R 35B model (128k context) can be found at:
	https://huggingface.co/CohereForAI/c4ai-command-r-v01

Based on the llama2 model with a few changes:

1) New hyper parameter to scale output logits (logit_scale)
2) Uses LayerNorm instead of RMSNorm
3) Transfomer layers have a single shared LayerNorm that feeds into both the
   self-attention and FFN layers in parallel. There is no post-attention LayerNorm.
4) No support for Rotary Position Embeddings (RoPE) scaling
5) No biases used

Find GGUF files here:
	https://huggingface.co/andrewcanis/c4ai-command-r-v01-GGUF

To convert model to GGUF format yourself:

1) Download Command-R Hugging Face safetensors:
	git lfs install
	git clone https://huggingface.co/CohereForAI/c4ai-command-r-v01

2) Run:
	python3 convert-hf-to-gguf.py --outtype f16 ./c4ai-command-r-v01
---
 README.md                   |   1 +
 convert-hf-to-gguf.py       |  17 ++++
 gguf-py/gguf/constants.py   |  15 +++
 gguf-py/gguf/gguf_writer.py |   3 +
 llama.cpp                   | 183 ++++++++++++++++++++++++++++++++++++
 5 files changed, 219 insertions(+)

diff --git a/README.md b/README.md
index 61bedc3f8..5cbdf7e47 100644
--- a/README.md
+++ b/README.md
@@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
 
 **Multimodal models:**
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5eee32016..cf1f98d66 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1965,6 +1965,23 @@ class MambaModel(Model):
             self.gguf_writer.add_tensor(new_name, data)
 
 
+@Model.register("CohereForCausalLM")
+class CommandR2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COMMAND_R
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # max_position_embeddings = 8192 in config.json but model was actually
+        # trained on 128k context length
+        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 ###### CONVERSION LOGIC ######
 
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 458a641dc..4a4facb06 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -42,6 +42,7 @@ class Keys:
         EXPERT_COUNT          = "{arch}.expert_count"
         EXPERT_USED_COUNT     = "{arch}.expert_used_count"
         POOLING_TYPE          = "{arch}.pooling_type"
+        LOGIT_SCALE           = "{arch}.logit_scale"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum):
     GEMMA      = auto()
     STARCODER2 = auto()
     MAMBA      = auto()
+    COMMAND_R  = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -187,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GEMMA:          "gemma",
     MODEL_ARCH.STARCODER2:     "starcoder2",
     MODEL_ARCH.MAMBA:          "mamba",
+    MODEL_ARCH.COMMAND_R:      "command-r",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -579,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.SSM_D,
         MODEL_TENSOR.SSM_OUT,
     ],
+    MODEL_ARCH.COMMAND_R: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 1967b633c..2ae6c814b 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -361,6 +361,9 @@ class GGUFWriter:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
 
+    def add_logit_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
+
     def add_expert_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
 
diff --git a/llama.cpp b/llama.cpp
index 8e185d4bf..fc5dd5cb4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -214,6 +214,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
+    LLM_ARCH_COMMAND_R,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA,           "gemma"      },
     { LLM_ARCH_STARCODER2,      "starcoder2" },
     { LLM_ARCH_MAMBA,           "mamba"      },
+    { LLM_ARCH_COMMAND_R,       "command-r"  },
     { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
 
@@ -268,6 +270,7 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
+    LLM_KV_LOGIT_SCALE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -332,6 +335,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"          },
     { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"     },
     { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"          },
+    { LLM_KV_LOGIT_SCALE,                   "%s.logit_scale"           },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv"          },
@@ -838,6 +842,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_SSM_OUT,         "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_COMMAND_R,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1597,6 +1616,7 @@ enum e_model {
     MODEL_20B,
     MODEL_30B,
     MODEL_34B,
+    MODEL_35B,
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
@@ -1643,6 +1663,7 @@ struct llama_hparams {
 
     float f_clamp_kqv      = 0.0f;
     float f_max_alibi_bias = 0.0f;
+    float f_logit_scale    = 0.0f;
 
     bool causal_attn = true;
     bool need_kq_pos = false;
@@ -3231,6 +3252,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_20B:    return "20B";
         case MODEL_30B:    return "30B";
         case MODEL_34B:    return "34B";
+        case MODEL_35B:    return "35B";
         case MODEL_40B:    return "40B";
         case MODEL_65B:    return "65B";
         case MODEL_70B:    return "70B";
@@ -3623,6 +3645,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_35B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -3944,6 +3975,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
     LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
+    LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
@@ -4918,6 +4950,37 @@ static bool llm_load_tensors(
                         layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
                     }
                 } break;
+            case LLM_ARCH_COMMAND_R:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        // init output from the input tok embed
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -8315,6 +8378,121 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_command_r() {
+
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+            struct ggml_tensor * ffn_inp = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, ffn_inp,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -8497,6 +8675,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_mamba();
             } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                result = llm.build_command_r();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -13147,6 +13329,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_COMMAND_R:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2

From 877b4d0c628cc70dddb5df72ed8fc14d126ca7e8 Mon Sep 17 00:00:00 2001
From: Theia Vogel <theia@vgel.me>
Date: Fri, 15 Mar 2024 13:43:02 -0700
Subject: [PATCH 25/31] llama : add support for control vectors (#5970)

* control vector api and implementation

* control-vectors : minor code style updates

* disable control vector when data == nullptr

use -1 for disabled range (also on init) in case we ever support controlling layer 0 (embeddings)

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/common.cpp | 215 ++++++++++++++++++++++++++++++++++++++++++++++
 common/common.h   |  31 ++++++-
 llama.cpp         | 128 +++++++++++++++++++++++++++
 llama.h           |  23 ++++-
 4 files changed, 392 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 58fbd05aa..4912237e0 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -568,6 +568,34 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--control-vector") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ 1.0f, argv[i], });
+        } else if (arg == "--control-vector-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * fname = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ std::stof(argv[i]), fname, });
+        } else if (arg == "--control-vector-layer-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_start = std::stoi(argv[i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_end = std::stoi(argv[i]);
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1095,6 +1123,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  --control-vector FNAME\n");
+    printf("                        add a control vector\n");
+    printf("  --control-vector-scaled FNAME S\n");
+    printf("                        add a control vector with user defined scaling S\n");
+    printf("  --control-vector-layer-range START END\n");
+    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
     printf("  -md FNAME, --model-draft FNAME\n");
@@ -1360,6 +1394,30 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             params.control_vector_layer_start,
+                                             params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -1890,3 +1948,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 
     return sum / (sqrt(sum1) * sqrt(sum2));
 }
+
+//
+// Control vector utils
+//
+
+static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+    int32_t n_tensors;
+
+    size_t n_bytes = 0;
+
+    uint32_t max_direction_layer = 0;
+
+    llama_control_vector_data result = { -1, {} };
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+            ggml_free(meta_ctx);
+            return result;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return result;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return result;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            if (result.n_embd == -1) {
+                result.n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    if (n_tensors == 0) {
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        return result;
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+        ggml_free(ctx);
+        return result;
+    }
+
+    // do not store data for layer 0 (it's not used)
+    result.data.resize(result.n_embd * max_direction_layer);
+
+    for (uint32_t il = 1; il <= max_direction_layer; il++) {
+        const std::string name = "direction." + std::to_string(il);
+        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+
+        float * dst = result.data.data() + result.n_embd * (il - 1);
+
+        if (tensor) {
+            const float * src = (const float *) tensor->data;
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = src[j] * load_info.strength;
+            }
+        } else {
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = 0.0f;
+            }
+        }
+    }
+
+    return result;
+}
+
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+    llama_control_vector_data result = { -1, {} };
+
+    for (const auto & info : load_infos) {
+        auto cur = llama_control_vector_load_one(info);
+
+        if (cur.n_embd == -1) {
+            return result;
+        }
+        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
+            return result;
+        }
+
+        if (result.n_embd == -1) {
+            result = std::move(cur);
+        } else {
+            for (size_t i = 0; i < cur.data.size(); i++) {
+                result.data[i] += cur.data[i];
+            }
+        }
+    }
+
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: no vectors passed\n", __func__);
+    }
+
+    return result;
+}
diff --git a/common/common.h b/common/common.h
index d250eef8b..687f3425e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;
 
+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
 //
 // CLI argument parsing
 //
-int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
@@ -103,6 +106,11 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base  = "";                              // base model path for the lora adapter
 
+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end   = -1; // layer range for control vector
+
     int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                     //                                       (which is more convenient to use for plotting)
@@ -269,3 +277,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 void llama_embd_normalize(const float * inp, float * out, int n);
 
 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+//
+// Control vector utils
+//
+
+struct llama_control_vector_data {
+    int n_embd;
+
+    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+    std::vector<float> data;
+};
+
+struct llama_control_vector_load_info {
+    float strength;
+
+    std::string fname;
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
diff --git a/llama.cpp b/llama.cpp
index fc5dd5cb4..52bd718ba 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1894,6 +1894,31 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_control_vector {
+    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_backend_buffer_t> bufs;
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    ggml_tensor * tensor_for(int il) const {
+        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+            return nullptr;
+        }
+        return tensors[il];
+    }
+
+    ~llama_control_vector() {
+        for (struct ggml_context * ctx : ctxs) {
+            ggml_free(ctx);
+        }
+        for (ggml_backend_buffer_t buf : bufs) {
+            ggml_backend_buffer_free(buf);
+        }
+    }
+};
+
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
@@ -2108,6 +2133,9 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask;    // F32 [1, kv_size]
     struct ggml_tensor * inp_s_seq;     // I32 [kv_size, n_batch]
 
+    // control vectors
+    struct llama_control_vector cvec;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -5931,6 +5959,12 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
             cb(cur, "l_out", il);
 
             // input for next layer
@@ -13366,6 +13400,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
@@ -13465,6 +13503,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }
 
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // count layer buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+        buft_layer_count[model.buft_layer[i].buft]++;
+    }
+
+    // allocate contexts
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (auto & it : buft_layer_count) {
+        int n_layers = it.second;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return 1;
+        }
+        ctx_map[it.first] = ctx;
+    }
+
+    // make tensors
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.ctxs.push_back(ctx);
+        cvec.bufs.push_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+    const llama_model & model = lctx->model;
+    llama_control_vector & cvec = lctx->cvec;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) model.hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells            = */ 0,
diff --git a/llama.h b/llama.h
index 90aa5372e..40dcf54e3 100644
--- a/llama.h
+++ b/llama.h
@@ -388,6 +388,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -435,10 +436,24 @@ extern "C" {
     // Returns 0 on success
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads);
+                          const char * path_lora,
+                               float   scale,
+                          const char * path_base_model,
+                             int32_t   n_threads);
+
+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+                     const float * data,
+                          size_t   len,
+                         int32_t   n_embd,
+                         int32_t   il_start,
+                         int32_t   il_end);
 
     //
     // KV cache

From d84c48505f60bcd358b82a751d40418c4d235643 Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Fri, 15 Mar 2024 22:14:16 +0100
Subject: [PATCH 26/31] llama : fix Baichuan2 13B (#6092)

---
 llama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 52bd718ba..e4db288dd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6000,7 +6000,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -6050,7 +6050,6 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);
 
-
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);

From a56d09a4407f29c21e149b44fd5308f83aa1cb09 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Date: Sat, 16 Mar 2024 13:20:53 +0100
Subject: [PATCH 27/31] ci : close inactive issue with workflow (#6053)

* issues: ci - close inactive issue with workflow

* ci: close issue, change workflow schedule time
---
 .github/workflows/close-issue.yml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 .github/workflows/close-issue.yml

diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
new file mode 100644
index 000000000..bc08a72d0
--- /dev/null
+++ b/.github/workflows/close-issue.yml
@@ -0,0 +1,22 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "42 0 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          repo-token: ${{ secrets.GITHUB_TOKEN }}

From 15961ec04dbd59d21d8984d42e4c0f7e7e7d320a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?= <dranger003@gmail.com>
Date: Sat, 16 Mar 2024 11:39:15 -0400
Subject: [PATCH 28/31] common : refactor nested if causing error C1061 on MSVC
 (#6101)

* Refactor nested if causing error C1061 on MSVC.

* Revert back and remove else's.

* Add flag to track found arguments.
---
 common/common.cpp | 475 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 356 insertions(+), 119 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 4912237e0..1b0ba8493 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -151,13 +151,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
 
+        bool arg_found = false;
         if (arg == "-s" || arg == "--seed") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.seed = std::stoul(argv[i]);
-        } else if (arg == "-t" || arg == "--threads") {
+        }
+        if (arg == "-t" || arg == "--threads") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -166,7 +170,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads <= 0) {
                 params.n_threads = std::thread::hardware_concurrency();
             }
-        } else if (arg == "-tb" || arg == "--threads-batch") {
+        }
+        if (arg == "-tb" || arg == "--threads-batch") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -175,7 +181,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads_batch <= 0) {
                 params.n_threads_batch = std::thread::hardware_concurrency();
             }
-        } else if (arg == "-td" || arg == "--threads-draft") {
+        }
+        if (arg == "-td" || arg == "--threads-draft") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -184,7 +192,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads_draft <= 0) {
                 params.n_threads_draft = std::thread::hardware_concurrency();
             }
-        } else if (arg == "-tbd" || arg == "--threads-batch-draft") {
+        }
+        if (arg == "-tbd" || arg == "--threads-batch-draft") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -193,25 +203,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads_batch_draft <= 0) {
                 params.n_threads_batch_draft = std::thread::hardware_concurrency();
             }
-        } else if (arg == "-p" || arg == "--prompt") {
+        }
+        if (arg == "-p" || arg == "--prompt") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.prompt = argv[i];
-        } else if (arg == "-e" || arg == "--escape") {
+        }
+        if (arg == "-e" || arg == "--escape") {
+            arg_found = true;
             params.escape = true;
-        } else if (arg == "--prompt-cache") {
+        }
+        if (arg == "--prompt-cache") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.path_prompt_cache = argv[i];
-        } else if (arg == "--prompt-cache-all") {
+        }
+        if (arg == "--prompt-cache-all") {
+            arg_found = true;
             params.prompt_cache_all = true;
-        } else if (arg == "--prompt-cache-ro") {
+        }
+        if (arg == "--prompt-cache-ro") {
+            arg_found = true;
             params.prompt_cache_ro = true;
-        } else if (arg == "-bf" || arg == "--binary-file") {
+        }
+        if (arg == "-bf" || arg == "--binary-file") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -228,7 +250,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             ss << file.rdbuf();
             params.prompt = ss.str();
             fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
-        } else if (arg == "-f" || arg == "--file") {
+        }
+        if (arg == "-f" || arg == "--file") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -245,51 +269,67 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (!params.prompt.empty() && params.prompt.back() == '\n') {
                 params.prompt.pop_back();
             }
-        } else if (arg == "-n" || arg == "--n-predict") {
+        }
+        if (arg == "-n" || arg == "--n-predict") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top-k") {
+        }
+        if (arg == "--top-k") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx-size") {
+        }
+        if (arg == "-c" || arg == "--ctx-size") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-n" || arg == "-gan") {
+        }
+        if (arg == "--grp-attn-n" || arg == "-gan") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
 
             params.grp_attn_n = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
+        }
+        if (arg == "--grp-attn-w" || arg == "-gaw") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
 
             params.grp_attn_w = std::stoi(argv[i]);
-        } else if (arg == "--rope-freq-base") {
+        }
+        if (arg == "--rope-freq-base") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_base = std::stof(argv[i]);
-        } else if (arg == "--rope-freq-scale") {
+        }
+        if (arg == "--rope-freq-scale") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_scale = std::stof(argv[i]);
-        } else if (arg == "--rope-scaling") {
+        }
+        if (arg == "--rope-scaling") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -299,43 +339,57 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
             else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { invalid_param = true; break; }
-        } else if (arg == "--rope-scale") {
+        }
+        if (arg == "--rope-scale") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
-        } else if (arg == "--yarn-orig-ctx") {
+        }
+        if (arg == "--yarn-orig-ctx") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.yarn_orig_ctx = std::stoi(argv[i]);
-        } else if (arg == "--yarn-ext-factor") {
+        }
+        if (arg == "--yarn-ext-factor") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.yarn_ext_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-attn-factor") {
+        }
+        if (arg == "--yarn-attn-factor") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.yarn_attn_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-fast") {
+        }
+        if (arg == "--yarn-beta-fast") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.yarn_beta_fast = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-slow") {
+        }
+        if (arg == "--yarn-beta-slow") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--pooling") {
+        }
+        if (arg == "--pooling") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -345,118 +399,156 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
             else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
             else { invalid_param = true; break; }
-        } else if (arg == "--defrag-thold" || arg == "-dt") {
+        }
+        if (arg == "--defrag-thold" || arg == "-dt") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.defrag_thold = std::stof(argv[i]);
-        } else if (arg == "--samplers") {
+        }
+        if (arg == "--samplers") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             const auto sampler_names = string_split(argv[i], ';');
             sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
-        } else if (arg == "--sampling-seq") {
+        }
+        if (arg == "--sampling-seq") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
-        } else if (arg == "--top-p") {
+        }
+        if (arg == "--top-p") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
-        } else if (arg == "--min-p") {
+        }
+        if (arg == "--min-p") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.min_p = std::stof(argv[i]);
-        } else if (arg == "--temp") {
+        }
+        if (arg == "--temp") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.temp = std::stof(argv[i]);
             sparams.temp = std::max(sparams.temp, 0.0f);
-        } else if (arg == "--tfs") {
+        }
+        if (arg == "--tfs") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.tfs_z = std::stof(argv[i]);
-        } else if (arg == "--typical") {
+        }
+        if (arg == "--typical") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.typical_p = std::stof(argv[i]);
-        } else if (arg == "--repeat-last-n") {
+        }
+        if (arg == "--repeat-last-n") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.penalty_last_n = std::stoi(argv[i]);
             sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
-        } else if (arg == "--repeat-penalty") {
+        }
+        if (arg == "--repeat-penalty") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.penalty_repeat = std::stof(argv[i]);
-        } else if (arg == "--frequency-penalty") {
+        }
+        if (arg == "--frequency-penalty") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.penalty_freq = std::stof(argv[i]);
-        } else if (arg == "--presence-penalty") {
+        }
+        if (arg == "--presence-penalty") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.penalty_present = std::stof(argv[i]);
-        } else if (arg == "--dynatemp-range") {
+        }
+        if (arg == "--dynatemp-range") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.dynatemp_range = std::stof(argv[i]);
-        } else if (arg == "--dynatemp-exp") {
+        }
+        if (arg == "--dynatemp-exp") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.dynatemp_exponent = std::stof(argv[i]);
-        } else if (arg == "--mirostat") {
+        }
+        if (arg == "--mirostat") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat-lr") {
+        }
+        if (arg == "--mirostat-lr") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat-ent") {
+        }
+        if (arg == "--mirostat-ent") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "--cfg-negative-prompt") {
+        }
+        if (arg == "--cfg-negative-prompt") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.cfg_negative_prompt = argv[i];
-        } else if (arg == "--cfg-negative-prompt-file") {
+        }
+        if (arg == "--cfg-negative-prompt-file") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -471,86 +563,114 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
                 sparams.cfg_negative_prompt.pop_back();
             }
-        } else if (arg == "--cfg-scale") {
+        }
+        if (arg == "--cfg-scale") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.cfg_scale = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch-size") {
+        }
+        if (arg == "-b" || arg == "--batch-size") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
-        } else if (arg == "-ub" || arg == "--ubatch-size") {
+        }
+        if (arg == "-ub" || arg == "--ubatch-size") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_ubatch = std::stoi(argv[i]);
-        } else if (arg == "--keep") {
+        }
+        if (arg == "--keep") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
-        } else if (arg == "--draft") {
+        }
+        if (arg == "--draft") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_draft = std::stoi(argv[i]);
-        } else if (arg == "--chunks") {
+        }
+        if (arg == "--chunks") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_chunks = std::stoi(argv[i]);
-        } else if (arg == "-np" || arg == "--parallel") {
+        }
+        if (arg == "-np" || arg == "--parallel") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_parallel = std::stoi(argv[i]);
-        } else if (arg == "-ns" || arg == "--sequences") {
+        }
+        if (arg == "-ns" || arg == "--sequences") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_sequences = std::stoi(argv[i]);
-        } else if (arg == "--p-split" || arg == "-ps") {
+        }
+        if (arg == "--p-split" || arg == "-ps") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.p_split = std::stof(argv[i]);
-        } else if (arg == "-m" || arg == "--model") {
+        }
+        if (arg == "-m" || arg == "--model") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.model = argv[i];
-        } else if (arg == "-md" || arg == "--model-draft") {
+        }
+        if (arg == "-md" || arg == "--model-draft") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.model_draft = argv[i];
-        } else if (arg == "-a" || arg == "--alias") {
+        }
+        if (arg == "-a" || arg == "--alias") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.model_alias = argv[i];
-        } else if (arg == "--lora") {
+        }
+        if (arg == "--lora") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.lora_adapter.emplace_back(argv[i], 1.0f);
             params.use_mmap = false;
-        } else if (arg == "--lora-scaled") {
+        }
+        if (arg == "--lora-scaled") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -562,19 +682,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
             params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
-        } else if (arg == "--lora-base") {
+        }
+        if (arg == "--lora-base") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.lora_base = argv[i];
-        } else if (arg == "--control-vector") {
+        }
+        if (arg == "--control-vector") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.control_vectors.push_back({ 1.0f, argv[i], });
-        } else if (arg == "--control-vector-scaled") {
+        }
+        if (arg == "--control-vector-scaled") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -585,7 +711,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.control_vectors.push_back({ std::stof(argv[i]), fname, });
-        } else if (arg == "--control-vector-layer-range") {
+        }
+        if (arg == "--control-vector-layer-range") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -596,49 +724,85 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.control_vector_layer_end = std::stoi(argv[i]);
-        } else if (arg == "--mmproj") {
+        }
+        if (arg == "--mmproj") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.mmproj = argv[i];
-        } else if (arg == "--image") {
+        }
+        if (arg == "--image") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.image = argv[i];
-        } else if (arg == "-i" || arg == "--interactive") {
+        }
+        if (arg == "-i" || arg == "--interactive") {
+            arg_found = true;
             params.interactive = true;
-        } else if (arg == "--embedding") {
+        }
+        if (arg == "--embedding") {
+            arg_found = true;
             params.embedding = true;
-        } else if (arg == "--interactive-first") {
+        }
+        if (arg == "--interactive-first") {
+            arg_found = true;
             params.interactive_first = true;
-        } else if (arg == "-ins" || arg == "--instruct") {
+        }
+        if (arg == "-ins" || arg == "--instruct") {
+            arg_found = true;
             params.instruct = true;
-        } else if (arg == "-cml" || arg == "--chatml") {
+        }
+        if (arg == "-cml" || arg == "--chatml") {
+            arg_found = true;
             params.chatml = true;
-        } else if (arg == "--infill") {
+        }
+        if (arg == "--infill") {
+            arg_found = true;
             params.infill = true;
-        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+        }
+        if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+            arg_found = true;
             params.dump_kv_cache = true;
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+        }
+        if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            arg_found = true;
             params.no_kv_offload = true;
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+        }
+        if (arg == "-ctk" || arg == "--cache-type-k") {
+            arg_found = true;
             params.cache_type_k = argv[++i];
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+        }
+        if (arg == "-ctv" || arg == "--cache-type-v") {
+            arg_found = true;
             params.cache_type_v = argv[++i];
-        } else if (arg == "--multiline-input") {
+        }
+        if (arg == "--multiline-input") {
+            arg_found = true;
             params.multiline_input = true;
-        } else if (arg == "--simple-io") {
+        }
+        if (arg == "--simple-io") {
+            arg_found = true;
             params.simple_io = true;
-        } else if (arg == "-cb" || arg == "--cont-batching") {
+        }
+        if (arg == "-cb" || arg == "--cont-batching") {
+            arg_found = true;
             params.cont_batching = true;
-        } else if (arg == "--color") {
+        }
+        if (arg == "--color") {
+            arg_found = true;
             params.use_color = true;
-        } else if (arg == "--mlock") {
+        }
+        if (arg == "--mlock") {
+            arg_found = true;
             params.use_mlock = true;
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+        }
+        if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -648,7 +812,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
-        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+        }
+        if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -658,7 +824,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
-        } else if (arg == "--main-gpu" || arg == "-mg") {
+        }
+        if (arg == "--main-gpu" || arg == "-mg") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -667,7 +835,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 #ifndef GGML_USE_CUBLAS_SYCL
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL
-        } else if (arg == "--split-mode" || arg == "-sm") {
+        }
+        if (arg == "--split-mode" || arg == "-sm") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -691,7 +861,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL
 
-        } else if (arg == "--tensor-split" || arg == "-ts") {
+        }
+        if (arg == "--tensor-split" || arg == "-ts") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -716,9 +888,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 #ifndef GGML_USE_CUBLAS_SYCL_VULKAN
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL
-        } else if (arg == "--no-mmap") {
+        }
+        if (arg == "--no-mmap") {
+            arg_found = true;
             params.use_mmap = false;
-        } else if (arg == "--numa") {
+        }
+        if (arg == "--numa") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -728,17 +904,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { invalid_param = true; break; }
-        } else if (arg == "--verbose-prompt") {
+        }
+        if (arg == "--verbose-prompt") {
+            arg_found = true;
             params.verbose_prompt = true;
-        } else if (arg == "--no-display-prompt") {
+        }
+        if (arg == "--no-display-prompt") {
+            arg_found = true;
             params.display_prompt = false;
-        } else if (arg == "-r" || arg == "--reverse-prompt") {
+        }
+        if (arg == "-r" || arg == "--reverse-prompt") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.antiprompt.emplace_back(argv[i]);
-        } else if (arg == "-ld" || arg == "--logdir") {
+        }
+        if (arg == "-ld" || arg == "--logdir") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -748,63 +932,93 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                 params.logdir += DIRECTORY_SEPARATOR;
             }
-        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+        }
+        if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.logits_file = argv[i];
-        } else if (arg == "--perplexity" || arg == "--all-logits") {
+        }
+        if (arg == "--perplexity" || arg == "--all-logits") {
+            arg_found = true;
             params.logits_all = true;
-        } else if (arg == "--ppl-stride") {
+        }
+        if (arg == "--ppl-stride") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.ppl_stride = std::stoi(argv[i]);
-        } else if (arg == "-ptc" || arg == "--print-token-count") {
+        }
+        if (arg == "-ptc" || arg == "--print-token-count") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_print = std::stoi(argv[i]);
-        } else if (arg == "--ppl-output-type") {
+        }
+        if (arg == "--ppl-output-type") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.ppl_output_type = std::stoi(argv[i]);
-        } else if (arg == "--hellaswag") {
+        }
+        if (arg == "--hellaswag") {
+            arg_found = true;
             params.hellaswag = true;
-        } else if (arg == "--hellaswag-tasks") {
+        }
+        if (arg == "--hellaswag-tasks") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
-        } else if (arg == "--winogrande") {
+        }
+        if (arg == "--winogrande") {
+            arg_found = true;
             params.winogrande = true;
-        } else if (arg == "--winogrande-tasks") {
+        }
+        if (arg == "--winogrande-tasks") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.winogrande_tasks = std::stoi(argv[i]);
-        } else if (arg == "--multiple-choice") {
+        }
+        if (arg == "--multiple-choice") {
+            arg_found = true;
             params.multiple_choice = true;
-        } else if (arg == "--multiple-choice-tasks") {
+        }
+        if (arg == "--multiple-choice-tasks") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.multiple_choice_tasks = std::stoi(argv[i]);
-        } else if (arg == "--kl-divergence") {
+        }
+        if (arg == "--kl-divergence") {
+            arg_found = true;
             params.kl_divergence = true;
-        } else if (arg == "--ignore-eos") {
+        }
+        if (arg == "--ignore-eos") {
+            arg_found = true;
             params.ignore_eos = true;
-        } else if (arg == "--no-penalize-nl") {
+        }
+        if (arg == "--no-penalize-nl") {
+            arg_found = true;
             sparams.penalize_nl = false;
-        } else if (arg == "-l" || arg == "--logit-bias") {
+        }
+        if (arg == "-l" || arg == "--logit-bias") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -823,36 +1037,51 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-        } else if (arg == "-h" || arg == "--help") {
+        }
+        if (arg == "-h" || arg == "--help") {
+            arg_found = true;
             return false;
-
-        } else if (arg == "--version") {
+        }
+        if (arg == "--version") {
+            arg_found = true;
             fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
             fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
             exit(0);
-        } else if (arg == "--random-prompt") {
+        }
+        if (arg == "--random-prompt") {
+            arg_found = true;
             params.random_prompt = true;
-        } else if (arg == "--in-prefix-bos") {
+        }
+        if (arg == "--in-prefix-bos") {
+            arg_found = true;
             params.input_prefix_bos = true;
-        } else if (arg == "--in-prefix") {
+        }
+        if (arg == "--in-prefix") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.input_prefix = argv[i];
-        } else if (arg == "--in-suffix") {
+        }
+        if (arg == "--in-suffix") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.input_suffix = argv[i];
-        } else if (arg == "--grammar") {
+        }
+        if (arg == "--grammar") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.grammar = argv[i];
-        } else if (arg == "--grammar-file") {
+        }
+        if (arg == "--grammar-file") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -868,7 +1097,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 std::istreambuf_iterator<char>(),
                 std::back_inserter(sparams.grammar)
             );
-        } else if (arg == "--override-kv") {
+        }
+        if (arg == "--override-kv") {
+            arg_found = true;
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -911,10 +1142,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
         // Parse args for logging parameters
-        } else if ( log_param_single_parse( argv[i] ) ) {
+        }
+        if ( log_param_single_parse( argv[i] ) ) {
+            arg_found = true;
             // Do nothing, log_param_single_parse automatically does it's thing
             //  and returns if a match was found and parsed.
-        } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
+        }
+        if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
+            arg_found = true;
             // We have a matching known parameter requiring an argument,
             //  now we need to check if there is anything after this argv
             //  and flag invalid_param or parse it.
@@ -928,7 +1163,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
         // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
-        } else {
+        }
+
+        if (!arg_found) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
     }

From dfbfdd60f90207404039c6578d709231496831d9 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Sat, 16 Mar 2024 16:42:08 +0100
Subject: [PATCH 29/31] readme : add wllama as a wasm binding (#6100)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5cbdf7e47..c2f3342f0 100644
--- a/README.md
+++ b/README.md
@@ -134,6 +134,7 @@ Typically finetunes of the base models below are supported as well.
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
+- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)

From b5f4ae09c3244ae1644b67c03ed9f4227ab25ad2 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Sat, 16 Mar 2024 16:46:29 +0100
Subject: [PATCH 30/31] gritlm : add initial README.md (#6086)

* gritlm: add initial README.md to examples/gritlm

This commit adds a suggestion for an initial README.md for the gritlm
example.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! gritlm: add initial README.md to examples/gritlm

Use the `scripts/hf.sh` script to download the model file.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* squash! gritlm: add initial README.md to examples/gritlm

Fix editorconfig-checker error in examples/gritlm/README.md.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/gritlm/README.md | 62 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 examples/gritlm/README.md

diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md
new file mode 100644
index 000000000..64cc19204
--- /dev/null
+++ b/examples/gritlm/README.md
@@ -0,0 +1,62 @@
+## Generative Representational Instruction Tuning (GRIT) Example
+[gritlm] a model which can generate embeddings as well as "normal" text
+generation depending on the instructions in the prompt.
+
+* Paper: https://arxiv.org/pdf/2402.09906.pdf
+
+### Retrieval-Augmented Generation (RAG) use case
+One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
+that we take documents that we want to use as context, to ground the large
+language model (LLM), and we create token embeddings for them. We then store
+these token embeddings in a vector database.
+
+When we perform a query, prompt the LLM, we will first create token embeddings
+for the query and then search the vector database to retrieve the most
+similar vectors, and return those documents so they can be passed to the LLM as
+context. Then the query and the context will be passed to the LLM which will
+have to _again_ create token embeddings for the query. But because gritlm is used
+the first query can be cached and the second query tokenization generation does
+not have to be performed at all.
+
+### Running the example
+Download a Grit model:
+```console
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+```
+
+Run the example using the downloaded model:
+```console
+$ ./gritlm -m gritlm-7b_q4_1.gguf
+
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
+Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
+Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+When shadows lurk and ghosts do roam,
+And darkness reigns, a fearsome sight.
+
+Thou didst set out, with heart aglow,
+To conquer this mountain, so high,
+And reach the summit, where the stars do glow,
+And the moon shines bright, up in the sky.
+
+Through the mist and fog, thou didst press on,
+With steadfast courage, and a steadfast will,
+Through the darkness, thou didst not be gone,
+But didst climb on, with a steadfast skill.
+
+At last, thou didst reach the summit's crest,
+And gazed upon the world below,
+And saw the beauty of the night's best,
+And felt the peace, that only nature knows.
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+Thou art a hero, in the eyes of all,
+For thou didst conquer this mountain, so bright.
+```
+
+[gritlm]: https://github.com/ContextualAI/gritlm

From c47cf414efafb8f60596edc7edb5a2d68065e992 Mon Sep 17 00:00:00 2001
From: AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
Date: Sat, 16 Mar 2024 11:52:02 -0400
Subject: [PATCH 31/31] ggml : add AVX512F SIMD (#6088)

---
 ggml.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/ggml.c b/ggml.c
index c94006e51..fa23cb3c4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -931,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
 #endif
 
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                                    \
+do {                                                                  \
+    int offset = GGML_F32_ARR >> 1;                                   \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    res = _mm512_reduce_add_ps(x[0]);                                 \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)                               \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    res = _mm512_reduce_add_ps(x[0]);                             \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)
 
 #define GGML_SIMD