From 4be8fb18ed1f5cafaa0ee62a90bbcf47d1d7ac42 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Thu, 29 Feb 2024 09:50:41 -0600
Subject: [PATCH 01/10] add gritlm example

---
 Makefile                       |   4 +
 examples/CMakeLists.txt        |   1 +
 examples/gritlm/CMakeLists.txt |   5 +
 examples/gritlm/gritlm.cpp     | 168 +++++++++++++++++++++++++++++++++
 4 files changed, 178 insertions(+)
 create mode 100644 examples/gritlm/CMakeLists.txt
 create mode 100644 examples/gritlm/gritlm.cpp
diff --git a/Makefile b/Makefile
index 4f26c0463..64a2d5bad 100644
--- a/Makefile
+++ b/Makefile
@@ -720,6 +720,10 @@ embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gritlm: examples/gritlm/gritlm.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 653abc73a..e762cf8b9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(finetune)
+    add_subdirectory(gritlm)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt
new file mode 100644
index 000000000..ac4a5ae79
--- /dev/null
+++ b/examples/gritlm/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET gritlm)
+add_executable(${TARGET} gritlm.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
new file mode 100644
index 000000000..462d6e620
--- /dev/null
+++ b/examples/gritlm/gritlm.cpp
@@ -0,0 +1,168 @@
+#include "common.h"
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <format>
+
+static float dot_product(const std::vector<float>& v1, const std::vector<float>& v2) {
+	float dot = 0.0f;
+	for (uint64_t i = 0; i < v1.size(); ++i) {
+		dot += v1[i] * v2[i];
+    }
+	return dot;
+}
+
+static float norm(const std::vector<float>& v) {
+	return std::sqrt(dot_product(v, v));
+}
+
+static float cosine_similarity(const std::vector<float>& v1, const std::vector<float>& v2) {
+	return dot_product(v1, v2) / (norm(v1) * norm(v2));
+}
+
+static void normalize(std::vector<float> in, float* out) {
+	float inorm = norm(in);
+	for (uint64_t i = 0; i < in.size(); i++) {
+		out[i] = in[i] / inorm;
+    }
+}
+
+static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vector<std::string>& sentences, const std::string& instruction) {
+	auto result = std::vector<std::vector<float>>{};
+
+	auto mdl = llama_get_model(ctx);
+
+	for (uint64_t i = 0; i < sentences.size(); i++) {
+		auto batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+		// testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
+        std::string input_string = instruction + sentences[i];
+        // std::string input_string = sentences[i];
+		auto inputs = llama_tokenize(mdl, input_string, true, false);
+		// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
+		// inputs.push_back(llama_token_eos(mdl));
+
+		// debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
+		std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
+            std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
+        });
+		std::printf("\n");
+
+        // add input to batch (this increments n_tokens)
+		for (uint64_t j = 0; j < inputs.size(); j++) {
+			llama_batch_add(batch, inputs[j], j, { 0 }, false);
+        }
+
+		// clear previous kv_cache values (irrelevant for embeddings)
+        llama_kv_cache_clear(ctx);
+
+		// run model
+		llama_decode(ctx, batch);
+
+        // get embedding dimensions
+        int n_toks = inputs.size();
+        int n_embd = llama_n_embd(mdl);
+
+        // allocate embedding output
+        std::vector<float> emb_unorm(n_embd, 0.0f);
+
+        // sum up all token embeddings
+        for (int k = 0; k < n_toks; k++) {
+            float * emb = llama_get_embeddings_ith(ctx, k);
+            for (int j = 0; j < n_embd; j++) {
+                emb_unorm[j] += emb[j];
+            }
+        }
+
+        // divide by number of tokens (mean pooling)
+        for (int j = 0; j < n_embd; j++) {
+            emb_unorm[j] /= n_toks;
+        }
+
+		auto emb_norm = std::vector<float>(emb_unorm.size());
+		normalize(emb_unorm, emb_norm.data());
+		result.push_back(emb_norm);
+
+        // print out emb_norm
+        std::printf("embedding %ld: ", i);
+        for (int j = 0; j < n_embd; j++) {
+            std::printf("%.5f ", emb_norm[j]);
+        }
+        std::printf("\n");
+
+		llama_batch_free(batch);
+	}
+
+	return result;
+}
+
+// ./embeddings -m ggml-gritlm-7b-q8_0.gguf -ngl 33
+int main(int argc, char* argv[])
+{
+	gpt_params params;
+	if (!gpt_params_parse(argc, argv, params))
+		return 1;
+
+	auto mparams = llama_model_params_from_gpt_params(params);
+	auto cparams = llama_context_params_from_gpt_params(params);
+
+	mparams.progress_callback = [](std::float_t progress, void* state) {
+        std::printf(
+            "%s\rLoading model... %u%%\r",
+            std::string(32, ' ').c_str(),
+            static_cast<std::uint8_t>(progress * 100)
+        );
+        return true;
+    };
+	cparams.embedding = true;
+    // cparams.do_pooling = false;
+
+	llama_backend_init();
+
+	auto mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+	auto ctx = llama_new_context_with_model(mdl, cparams);
+	auto bat = llama_batch_init(llama_n_ctx(ctx), 0, 1);
+
+	// ### Embedding/Representation ### taken sample from here:
+	// https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
+	{
+		auto instruction = std::string{ "Given a scientific paper title, retrieve the paper's abstract" };
+
+		auto queries = std::vector<std::string>{
+            // "hello world",
+			"Bitcoin: A Peer-to-Peer Electronic Cash System",
+			"Generative Representational Instruction Tuning",
+		};
+
+		auto documents = std::vector<std::string>{
+			"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
+			"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
+		};
+
+		auto gritlm_instruction = [](const std::string& instruction) -> std::string {
+            return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
+        };
+
+		// No need to add instruction for retrieval documents
+		auto d_rep = encode(ctx, documents, gritlm_instruction(""));
+		auto q_rep = encode(ctx, queries, gritlm_instruction(instruction));
+
+		auto cosine_sim_q0_d0 = 1 - cosine_similarity(q_rep[0], d_rep[0]);
+		auto cosine_sim_q0_d1 = 1 - cosine_similarity(q_rep[0], d_rep[1]);
+		auto cosine_sim_q1_d0 = 1 - cosine_similarity(q_rep[1], d_rep[0]);
+		auto cosine_sim_q1_d1 = 1 - cosine_similarity(q_rep[1], d_rep[1]);
+
+		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
+		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
+		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
+		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
+	}
+
+	llama_batch_free(bat);
+	llama_free(ctx);
+	llama_free_model(mdl);
+	llama_backend_free();
+
+	return 0;
+}

From e79195fc531302d8c88ec169c8a2da488447c85c Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Sun, 3 Mar 2024 23:59:28 -0600
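Subject: [PATCH 02/10] gritlm results match

Mean-pool over the sentence tokens only (instruction tokens are skipped),
request unpooled embeddings via LLAMA_POOLING_TYPE_NONE, report the cosine
similarity itself instead of 1 - similarity, and set causal_attn from the
embedding flag when creating the context.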

---
 examples/gritlm/gritlm.cpp | 32 ++++++++++++++++++--------------
 llama.cpp                  |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 462d6e620..5b1999729 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -38,11 +38,14 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
 
 		// testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
         std::string input_string = instruction + sentences[i];
-        // std::string input_string = sentences[i];
 		auto inputs = llama_tokenize(mdl, input_string, true, false);
 		// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
 		// inputs.push_back(llama_token_eos(mdl));
 
+		// we want to ignore instruction tokens for mean pooling
+		auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
+		int n_inst = inputs_instruct.size();
+
 		// debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
 		std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
@@ -68,7 +71,7 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         std::vector<float> emb_unorm(n_embd, 0.0f);
 
         // sum up all token embeddings
-        for (int k = 0; k < n_toks; k++) {
+        for (int k = n_inst; k < n_toks; k++) {
             float * emb = llama_get_embeddings_ith(ctx, k);
             for (int j = 0; j < n_embd; j++) {
                 emb_unorm[j] += emb[j];
@@ -76,8 +79,9 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         }
 
         // divide by number of tokens (mean pooling)
+		int n_sent = n_toks - n_inst;
         for (int j = 0; j < n_embd; j++) {
-            emb_unorm[j] /= n_toks;
+            emb_unorm[j] /= n_sent;
         }
 
 		auto emb_norm = std::vector<float>(emb_unorm.size());
@@ -97,12 +101,13 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
 	return result;
 }
 
-// ./embeddings -m ggml-gritlm-7b-q8_0.gguf -ngl 33
+// ./gritlm -m ggml-gritlm-7b-q8_0.gguf -ngl 99
 int main(int argc, char* argv[])
 {
 	gpt_params params;
-	if (!gpt_params_parse(argc, argv, params))
+	if (!gpt_params_parse(argc, argv, params)) {
 		return 1;
+	}
 
 	auto mparams = llama_model_params_from_gpt_params(params);
 	auto cparams = llama_context_params_from_gpt_params(params);
@@ -116,7 +121,7 @@ int main(int argc, char* argv[])
         return true;
     };
 	cparams.embedding = true;
-    // cparams.do_pooling = false;
+    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
 
 	llama_backend_init();
 
@@ -127,15 +132,14 @@ int main(int argc, char* argv[])
 	// ### Embedding/Representation ### taken sample from here:
 	// https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
 	{
-		auto instruction = std::string{ "Given a scientific paper title, retrieve the paper's abstract" };
+		std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
 
-		auto queries = std::vector<std::string>{
-            // "hello world",
+		std::vector<std::string> queries = {
 			"Bitcoin: A Peer-to-Peer Electronic Cash System",
 			"Generative Representational Instruction Tuning",
 		};
 
-		auto documents = std::vector<std::string>{
+		std::vector<std::string> documents = {
 			"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
 			"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
 		};
@@ -148,10 +152,10 @@ int main(int argc, char* argv[])
 		auto d_rep = encode(ctx, documents, gritlm_instruction(""));
 		auto q_rep = encode(ctx, queries, gritlm_instruction(instruction));
 
-		auto cosine_sim_q0_d0 = 1 - cosine_similarity(q_rep[0], d_rep[0]);
-		auto cosine_sim_q0_d1 = 1 - cosine_similarity(q_rep[0], d_rep[1]);
-		auto cosine_sim_q1_d0 = 1 - cosine_similarity(q_rep[1], d_rep[0]);
-		auto cosine_sim_q1_d1 = 1 - cosine_similarity(q_rep[1], d_rep[1]);
+		auto cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
+		auto cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
+		auto cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
+		auto cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
 
 		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
 		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
diff --git a/llama.cpp b/llama.cpp
index e9192b4fa..1442dd4d2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1684,7 +1684,6 @@ struct llama_cparams {
 
     bool embeddings;
     bool offload_kqv;
-
     enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
@@ -12145,6 +12144,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.pooling_type     = params.pooling_type;
+    cparams.causal_attn      = !params.embedding;
 
     cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
     cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;

From a71842d7efce86dd6fb6625b277b8b22a90cd572 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Mon, 4 Mar 2024 00:16:29 -0600
Subject: [PATCH 03/10] tabs to spaces

---
 examples/gritlm/gritlm.cpp | 154 ++++++++++++++++++-------------------
 1 file changed, 77 insertions(+), 77 deletions(-)

diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 5b1999729..9ab0d5875 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -6,62 +6,62 @@
 #include <format>
 
 static float dot_product(const std::vector<float>& v1, const std::vector<float>& v2) {
-	float dot = 0.0f;
-	for (uint64_t i = 0; i < v1.size(); ++i) {
-		dot += v1[i] * v2[i];
+    float dot = 0.0f;
+    for (uint64_t i = 0; i < v1.size(); ++i) {
+        dot += v1[i] * v2[i];
     }
-	return dot;
+    return dot;
 }
 
 static float norm(const std::vector<float>& v) {
-	return std::sqrt(dot_product(v, v));
+    return std::sqrt(dot_product(v, v));
 }
 
 static float cosine_similarity(const std::vector<float>& v1, const std::vector<float>& v2) {
-	return dot_product(v1, v2) / (norm(v1) * norm(v2));
+    return dot_product(v1, v2) / (norm(v1) * norm(v2));
 }
 
 static void normalize(std::vector<float> in, float* out) {
-	float inorm = norm(in);
-	for (uint64_t i = 0; i < in.size(); i++) {
-		out[i] = in[i] / inorm;
+    float inorm = norm(in);
+    for (uint64_t i = 0; i < in.size(); i++) {
+        out[i] = in[i] / inorm;
     }
 }
 
 static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vector<std::string>& sentences, const std::string& instruction) {
-	auto result = std::vector<std::vector<float>>{};
+    auto result = std::vector<std::vector<float>>{};
 
-	auto mdl = llama_get_model(ctx);
+    auto mdl = llama_get_model(ctx);
 
-	for (uint64_t i = 0; i < sentences.size(); i++) {
-		auto batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+    for (uint64_t i = 0; i < sentences.size(); i++) {
+        auto batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
-		// testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
+        // testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
         std::string input_string = instruction + sentences[i];
-		auto inputs = llama_tokenize(mdl, input_string, true, false);
-		// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
-		// inputs.push_back(llama_token_eos(mdl));
+        auto inputs = llama_tokenize(mdl, input_string, true, false);
+        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
+        // inputs.push_back(llama_token_eos(mdl));
 
-		// we want to ignore instruction tokens for mean pooling
-		auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
-		int n_inst = inputs_instruct.size();
+        // we want to ignore instruction tokens for mean pooling
+        auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
+        int n_inst = inputs_instruct.size();
 
-		// debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
-		std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
+        // debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
+        std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
         });
-		std::printf("\n");
+        std::printf("\n");
 
         // add input to batch (this increments n_tokens)
-		for (uint64_t j = 0; j < inputs.size(); j++) {
-			llama_batch_add(batch, inputs[j], j, { 0 }, false);
+        for (uint64_t j = 0; j < inputs.size(); j++) {
+            llama_batch_add(batch, inputs[j], j, { 0 }, false);
         }
 
-		// clear previous kv_cache values (irrelevant for embeddings)
+        // clear previous kv_cache values (irrelevant for embeddings)
         llama_kv_cache_clear(ctx);
 
-		// run model
-		llama_decode(ctx, batch);
+        // run model
+        llama_decode(ctx, batch);
 
         // get embedding dimensions
         int n_toks = inputs.size();
@@ -79,14 +79,14 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         }
 
         // divide by number of tokens (mean pooling)
-		int n_sent = n_toks - n_inst;
+        int n_sent = n_toks - n_inst;
         for (int j = 0; j < n_embd; j++) {
             emb_unorm[j] /= n_sent;
         }
 
-		auto emb_norm = std::vector<float>(emb_unorm.size());
-		normalize(emb_unorm, emb_norm.data());
-		result.push_back(emb_norm);
+        auto emb_norm = std::vector<float>(emb_unorm.size());
+        normalize(emb_unorm, emb_norm.data());
+        result.push_back(emb_norm);
 
         // print out emb_norm
         std::printf("embedding %ld: ", i);
@@ -95,24 +95,24 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         }
         std::printf("\n");
 
-		llama_batch_free(batch);
-	}
+        llama_batch_free(batch);
+    }
 
-	return result;
+    return result;
 }
 
 // ./gritlm -m ggml-gritlm-7b-q8_0.gguf -ngl 99
 int main(int argc, char* argv[])
 {
-	gpt_params params;
-	if (!gpt_params_parse(argc, argv, params)) {
-		return 1;
-	}
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
 
-	auto mparams = llama_model_params_from_gpt_params(params);
-	auto cparams = llama_context_params_from_gpt_params(params);
+    auto mparams = llama_model_params_from_gpt_params(params);
+    auto cparams = llama_context_params_from_gpt_params(params);
 
-	mparams.progress_callback = [](std::float_t progress, void* state) {
+    mparams.progress_callback = [](std::float_t progress, void* state) {
         std::printf(
             "%s\rLoading model... %u%%\r",
             std::string(32, ' ').c_str(),
@@ -120,53 +120,53 @@ int main(int argc, char* argv[])
         );
         return true;
     };
-	cparams.embedding = true;
+    cparams.embedding = true;
     cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
 
-	llama_backend_init();
+    llama_backend_init();
 
-	auto mdl = llama_load_model_from_file(params.model.c_str(), mparams);
-	auto ctx = llama_new_context_with_model(mdl, cparams);
-	auto bat = llama_batch_init(llama_n_ctx(ctx), 0, 1);
+    auto mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+    auto ctx = llama_new_context_with_model(mdl, cparams);
+    auto bat = llama_batch_init(llama_n_ctx(ctx), 0, 1);
 
-	// ### Embedding/Representation ### taken sample from here:
-	// https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
-	{
-		std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
+    // ### Embedding/Representation ### taken sample from here:
+    // https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
+    {
+        std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
 
-		std::vector<std::string> queries = {
-			"Bitcoin: A Peer-to-Peer Electronic Cash System",
-			"Generative Representational Instruction Tuning",
-		};
+        std::vector<std::string> queries = {
+            "Bitcoin: A Peer-to-Peer Electronic Cash System",
+            "Generative Representational Instruction Tuning",
+        };
 
-		std::vector<std::string> documents = {
-			"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
-			"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
-		};
+        std::vector<std::string> documents = {
+            "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
+            "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
+        };
 
-		auto gritlm_instruction = [](const std::string& instruction) -> std::string {
+        auto gritlm_instruction = [](const std::string& instruction) -> std::string {
             return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
         };
 
-		// No need to add instruction for retrieval documents
-		auto d_rep = encode(ctx, documents, gritlm_instruction(""));
-		auto q_rep = encode(ctx, queries, gritlm_instruction(instruction));
+        // No need to add instruction for retrieval documents
+        auto d_rep = encode(ctx, documents, gritlm_instruction(""));
+        auto q_rep = encode(ctx, queries, gritlm_instruction(instruction));
 
-		auto cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-		auto cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-		auto cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-		auto cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+        auto cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
+        auto cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
+        auto cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
+        auto cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
 
-		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
-		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
-		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
-		std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
-	}
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
+    }
 
-	llama_batch_free(bat);
-	llama_free(ctx);
-	llama_free_model(mdl);
-	llama_backend_free();
+    llama_batch_free(bat);
+    llama_free(ctx);
+    llama_free_model(mdl);
+    llama_backend_free();
 
-	return 0;
+    return 0;
 }

From 805ae529c460616a402da6b71b3ed23c835f9796 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Mon, 4 Mar 2024 00:18:41 -0600
Subject: [PATCH 04/10] comment out debug printing

---
 examples/gritlm/gritlm.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 9ab0d5875..bf9043750 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -46,11 +46,13 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
         int n_inst = inputs_instruct.size();
 
+		/*/
         // debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
         std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
         });
         std::printf("\n");
+		*/
 
         // add input to batch (this increments n_tokens)
         for (uint64_t j = 0; j < inputs.size(); j++) {
@@ -88,12 +90,14 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         normalize(emb_unorm, emb_norm.data());
         result.push_back(emb_norm);
 
+		/*
         // print out emb_norm
         std::printf("embedding %ld: ", i);
         for (int j = 0; j < n_embd; j++) {
             std::printf("%.5f ", emb_norm[j]);
         }
         std::printf("\n");
+		*/
 
         llama_batch_free(batch);
     }

From 97936078b79c9f249466a7c5791d80ab6d0fb66e Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Tue, 5 Mar 2024 23:23:17 -0600
Subject: [PATCH 05/10] rebase to new embed

---
 examples/gritlm/gritlm.cpp | 31 +++++++++++++------------------
 llama.cpp                  |  6 ++++--
 llama.h                    |  1 +
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index bf9043750..41e444901 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -39,24 +39,23 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         // testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
         std::string input_string = instruction + sentences[i];
         auto inputs = llama_tokenize(mdl, input_string, true, false);
+        uint64_t n_toks = inputs.size();
         // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
         // inputs.push_back(llama_token_eos(mdl));
 
         // we want to ignore instruction tokens for mean pooling
         auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
-        int n_inst = inputs_instruct.size();
+        uint64_t n_inst = inputs_instruct.size();
 
-		/*/
         // debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
         std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
         });
         std::printf("\n");
-		*/
 
         // add input to batch (this increments n_tokens)
-        for (uint64_t j = 0; j < inputs.size(); j++) {
-            llama_batch_add(batch, inputs[j], j, { 0 }, false);
+        for (uint64_t j = 0; j < n_toks; j++) {
+            llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
         }
 
         // clear previous kv_cache values (irrelevant for embeddings)
@@ -66,23 +65,22 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         llama_decode(ctx, batch);
 
         // get embedding dimensions
-        int n_toks = inputs.size();
-        int n_embd = llama_n_embd(mdl);
+        uint64_t n_embd = llama_n_embd(mdl);
 
         // allocate embedding output
         std::vector<float> emb_unorm(n_embd, 0.0f);
 
         // sum up all token embeddings
-        for (int k = n_inst; k < n_toks; k++) {
+        for (uint64_t k = n_inst; k < n_toks; k++) {
             float * emb = llama_get_embeddings_ith(ctx, k);
-            for (int j = 0; j < n_embd; j++) {
+            for (uint64_t j = 0; j < n_embd; j++) {
                 emb_unorm[j] += emb[j];
             }
         }
 
         // divide by number of tokens (mean pooling)
-        int n_sent = n_toks - n_inst;
-        for (int j = 0; j < n_embd; j++) {
+        uint64_t n_sent = n_toks - n_inst;
+        for (uint64_t j = 0; j < n_embd; j++) {
             emb_unorm[j] /= n_sent;
         }
 
@@ -90,14 +88,12 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         normalize(emb_unorm, emb_norm.data());
         result.push_back(emb_norm);
 
-		/*
         // print out emb_norm
         std::printf("embedding %ld: ", i);
-        for (int j = 0; j < n_embd; j++) {
+        for (uint64_t j = 0; j < 20; j++) {
             std::printf("%.5f ", emb_norm[j]);
         }
-        std::printf("\n");
-		*/
+        std::printf("\n\n");
 
         llama_batch_free(batch);
     }
@@ -124,14 +120,14 @@ int main(int argc, char* argv[])
         );
         return true;
     };
-    cparams.embedding = true;
+    cparams.embeddings = true;
+    cparams.causal_attn = false;
     cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
 
     llama_backend_init();
 
     auto mdl = llama_load_model_from_file(params.model.c_str(), mparams);
     auto ctx = llama_new_context_with_model(mdl, cparams);
-    auto bat = llama_batch_init(llama_n_ctx(ctx), 0, 1);
 
     // ### Embedding/Representation ### taken sample from here:
     // https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
@@ -167,7 +163,6 @@ int main(int argc, char* argv[])
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
     }
 
-    llama_batch_free(bat);
     llama_free(ctx);
     llama_free_model(mdl);
     llama_backend_free();
diff --git a/llama.cpp b/llama.cpp
index 1442dd4d2..fd0e58cca 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1684,6 +1684,7 @@ struct llama_cparams {
 
     bool embeddings;
     bool offload_kqv;
+    bool causal_attn;
     enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
@@ -8029,7 +8030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }
 
-    if (hparams.causal_attn) {
+    if (cparams.causal_attn) {
         const int64_t n_kv     = kv_self.n;
         const int64_t n_tokens = batch.n_tokens;
 
@@ -11992,6 +11993,7 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all                  =*/ false,
         /*.embeddings                  =*/ false,
         /*.offload_kqv                 =*/ true,
+        /*.causal_attn                 =*/ true,
         /*.abort_callback              =*/ nullptr,
         /*.abort_callback_data         =*/ nullptr,
     };
@@ -12143,8 +12145,8 @@ struct llama_context * llama_new_context_with_model(
     cparams.defrag_thold     = params.defrag_thold;
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
+    cparams.causal_attn      = params.causal_attn;
     cparams.pooling_type     = params.pooling_type;
-    cparams.causal_attn      = !params.embedding;
 
     cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
     cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
diff --git a/llama.h b/llama.h
index 3dc162b07..6265d6901 100644
--- a/llama.h
+++ b/llama.h
@@ -262,6 +262,7 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool causal_attn; // whether to use causal attention
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted

From 1ab6aeeeeecc151a0774c585a6f40d0aba6d9c68 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Thu, 7 Mar 2024 01:37:08 -0600
Subject: [PATCH 06/10] gritlm embeddings are back babeee

---
 common/common.cpp          | 1 +
 examples/gritlm/gritlm.cpp | 7 ++++++-
 llama.cpp                  | 7 ++++++-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c244db644..d8baf7782 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1304,6 +1304,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.pooling_type      = params.pooling_type;
     cparams.defrag_thold      = params.defrag_thold;
     cparams.offload_kqv       = !params.no_kv_offload;
+    cparams.causal_attn       = !params.embedding;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 41e444901..d06f64e24 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -47,11 +47,13 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
         uint64_t n_inst = inputs_instruct.size();
 
+        /*
         // debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
         std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
         });
         std::printf("\n");
+        */
 
         // add input to batch (this increments n_tokens)
         for (uint64_t j = 0; j < n_toks; j++) {
@@ -88,12 +90,14 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         normalize(emb_unorm, emb_norm.data());
         result.push_back(emb_norm);
 
+        /*
         // print out emb_norm
         std::printf("embedding %ld: ", i);
-        for (uint64_t j = 0; j < 20; j++) {
+        for (uint64_t j = 0; j < n_embd; j++) {
             std::printf("%.5f ", emb_norm[j]);
         }
         std::printf("\n\n");
+        */
 
         llama_batch_free(batch);
     }
@@ -120,6 +124,7 @@ int main(int argc, char* argv[])
         );
         return true;
     };
+
     cparams.embeddings = true;
     cparams.causal_attn = false;
     cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
diff --git a/llama.cpp b/llama.cpp
index fd0e58cca..04816ea9e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8057,6 +8057,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
     } else {
         // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
         const int64_t n_tokens = batch.n_tokens;
+        const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
 
         assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
 
@@ -8075,7 +8076,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                         }
                     }
 
-                    data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
+                    data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
+                }
+
+                for (int i = n_tokens; i < n_stride; ++i) {
+                    data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
                 }
             }
         }

From f618e5060a5fb1e8c2249715336f4ce50c2f1649 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Thu, 7 Mar 2024 01:38:30 -0600
Subject: [PATCH 07/10] add to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 62b6b8b1a..d28f4d1b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/gritlm
 /imatrix
 /infill
 /libllama.so

From bd3d9fbfed2fcb413fb5327ba7f4c01bb4d5b2a8 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Thu, 7 Mar 2024 11:55:27 -0600
Subject: [PATCH 08/10] allow to toggle embedding mode

---
 Makefile                   |  2 +-
 common/common.cpp          |  1 -
 examples/gritlm/gritlm.cpp |  7 +++----
 llama.cpp                  | 18 +++++++++++++-----
 llama.h                    |  5 ++++-
 5 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 64a2d5bad..223d37eb4 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
diff --git a/common/common.cpp b/common/common.cpp
index d8baf7782..c244db644 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1304,7 +1304,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.pooling_type      = params.pooling_type;
     cparams.defrag_thold      = params.defrag_thold;
     cparams.offload_kqv       = !params.no_kv_offload;
-    cparams.causal_attn       = !params.embedding;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index d06f64e24..7495e2894 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -125,15 +125,14 @@ int main(int argc, char* argv[])
         return true;
     };
 
-    cparams.embeddings = true;
-    cparams.causal_attn = false;
-    cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
-
     llama_backend_init();
 
     auto mdl = llama_load_model_from_file(params.model.c_str(), mparams);
     auto ctx = llama_new_context_with_model(mdl, cparams);
 
+    // set to embedding mode
+    llama_set_embeddings(ctx, true);
+
     // ### Embedding/Representation ### taken sample from here:
     // https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
     {
diff --git a/llama.cpp b/llama.cpp
index 04816ea9e..79171c749 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1684,7 +1684,6 @@ struct llama_cparams {
 
     bool embeddings;
     bool offload_kqv;
-    bool causal_attn;
     enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
@@ -8030,7 +8029,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }
 
-    if (cparams.causal_attn) {
+    GGML_ASSERT(
+        (hparams.causal_attn || cparams.embeddings) &&
+        "non-causal attention with generative models is not supported"
+    );
+
+    // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
+    // But if cparams.embeddings is set, the attention will be non-causal nonetheless.
+    if (!cparams.embeddings) {
         const int64_t n_kv     = kv_self.n;
         const int64_t n_tokens = batch.n_tokens;
 
@@ -8055,7 +8061,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             }
         }
     } else {
-        // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used)
+        // with causal attention, the mask needs to match the kv cache size
         const int64_t n_tokens = batch.n_tokens;
         const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;
 
@@ -11998,7 +12004,6 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all                  =*/ false,
         /*.embeddings                  =*/ false,
         /*.offload_kqv                 =*/ true,
-        /*.causal_attn                 =*/ true,
         /*.abort_callback              =*/ nullptr,
         /*.abort_callback_data         =*/ nullptr,
     };
@@ -12150,7 +12155,6 @@ struct llama_context * llama_new_context_with_model(
     cparams.defrag_thold     = params.defrag_thold;
     cparams.embeddings       = params.embeddings;
     cparams.offload_kqv      = params.offload_kqv;
-    cparams.causal_attn      = params.causal_attn;
     cparams.pooling_type     = params.pooling_type;
 
     cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
@@ -13165,6 +13169,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
 }
 
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 struct llama_batch llama_batch_get_one(
              llama_token * tokens,
                  int32_t   n_tokens,
diff --git a/llama.h b/llama.h
index 6265d6901..0fe7b0105 100644
--- a/llama.h
+++ b/llama.h
@@ -262,7 +262,6 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool causal_attn; // whether to use causal attention
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -642,6 +641,10 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Set whether the context runs in embeddings mode or not
+    // If set to true, embeddings are extracted and the attention is non-causal
+    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 

From 03acc82a85e60981457fe67a3e74eabcd4239871 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?DAN=E2=84=A2?= <dranger003@gmail.com>
Date: Sat, 9 Mar 2024 07:44:25 -0500
Subject: [PATCH 09/10] Clean-up GritLM sample code.

---
 examples/gritlm/gritlm.cpp | 148 +++++++++++++++++++++++++++----------
 1 file changed, 107 insertions(+), 41 deletions(-)

diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 7495e2894..2f98d45f4 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -3,7 +3,6 @@
 
 #include <string>
 #include <vector>
-#include <format>
 
 static float dot_product(const std::vector<float>& v1, const std::vector<float>& v2) {
     float dot = 0.0f;
@@ -21,7 +20,7 @@ static float cosine_similarity(const std::vector<float>& v1, const std::vector<f
     return dot_product(v1, v2) / (norm(v1) * norm(v2));
 }
 
-static void normalize(std::vector<float> in, float* out) {
+static void normalize(const std::vector<float>& in, float* out) {
     float inorm = norm(in);
     for (uint64_t i = 0; i < in.size(); i++) {
         out[i] = in[i] / inorm;
@@ -32,23 +31,25 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
     auto result = std::vector<std::vector<float>>{};
 
     auto mdl = llama_get_model(ctx);
+    auto batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
     for (uint64_t i = 0; i < sentences.size(); i++) {
-        auto batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+        llama_batch_clear(batch);
+
+        std::string input_string = instruction + sentences[i];
+        std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+        auto n_toks = (int32_t)inputs.size();
 
         // testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
-        std::string input_string = instruction + sentences[i];
-        auto inputs = llama_tokenize(mdl, input_string, true, false);
-        uint64_t n_toks = inputs.size();
         // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
         // inputs.push_back(llama_token_eos(mdl));
 
         // we want to ignore instruction tokens for mean pooling
-        auto inputs_instruct = llama_tokenize(mdl, instruction, true, false);
-        uint64_t n_inst = inputs_instruct.size();
+        std::vector<llama_token> inputs_instruct = llama_tokenize(mdl, instruction, true, false);
+        auto n_inst = (int32_t)inputs_instruct.size();
 
         /*
-        // debug tokens - these are matching as referenced in their sample so doesn't appear to be a token issue
+        // debug tokens - should be matching as referenced in the GritLM sample
         std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
         });
@@ -56,7 +57,7 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         */
 
         // add input to batch (this increments n_tokens)
-        for (uint64_t j = 0; j < n_toks; j++) {
+        for (int32_t j = 0; j < n_toks; j++) {
             llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
         }
 
@@ -73,8 +74,8 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         std::vector<float> emb_unorm(n_embd, 0.0f);
 
         // sum up all token embeddings
-        for (uint64_t k = n_inst; k < n_toks; k++) {
-            float * emb = llama_get_embeddings_ith(ctx, k);
+        for (int32_t k = n_inst; k < n_toks; k++) {
+            float* emb = llama_get_embeddings_ith(ctx, k);
             for (uint64_t j = 0; j < n_embd; j++) {
                 emb_unorm[j] += emb[j];
             }
@@ -98,14 +99,80 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         }
         std::printf("\n\n");
         */
+    }
 
-        llama_batch_free(batch);
+    llama_batch_free(batch);
+    return result;
+}
+
+static std::string aggregate_pieces(const std::vector<std::string>& pieces) {
+    // calculate total length required
+    size_t length = 0;
+    for (const auto& str : pieces) {
+        length += str.size();
+    }
+
+    // reserve memory
+    std::string result;
+    result.reserve(length);
+
+    // append pieces
+    for (const auto& str : pieces) {
+        result += str;
     }
 
     return result;
 }
 
-// ./gritlm -m ggml-gritlm-7b-q8_0.gguf -ngl 99
+static std::string generate(llama_context* ctx, const std::string& prompt, bool stream) {
+    std::vector<std::string> pieces;
+
+    const llama_model* mdl = llama_get_model(ctx);
+    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+    std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+    int32_t i_current_token = 0;
+
+    while (true) {
+        llama_batch_clear(bat);
+
+        for (auto i = 0; i < inputs.size(); i++)
+            llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == inputs.size() - 1);
+
+        inputs.clear();
+
+        llama_decode(ctx, bat);
+
+        auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
+
+        auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
+        for (auto token = 0; token < candidates.size(); token++)
+            candidates[token] = llama_token_data{ token, logits[token], 0.0f };
+
+        auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
+
+        llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
+        if (token == llama_token_eos(mdl))
+            break;
+
+        std::string piece = llama_token_to_piece(ctx, token);
+        if (stream) {
+            std::printf("%s", piece.c_str());
+        }
+
+        pieces.push_back(piece);
+        inputs.push_back(token);
+    }
+
+    llama_batch_free(bat);
+
+    return aggregate_pieces(pieces);
+}
+
+static std::string gritlm_instruction(const std::string& instruction) {
+    return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
+}
+
 int main(int argc, char* argv[])
 {
     gpt_params params;
@@ -113,27 +180,21 @@ int main(int argc, char* argv[])
         return 1;
     }
 
-    auto mparams = llama_model_params_from_gpt_params(params);
-    auto cparams = llama_context_params_from_gpt_params(params);
-
-    mparams.progress_callback = [](std::float_t progress, void* state) {
-        std::printf(
-            "%s\rLoading model... %u%%\r",
-            std::string(32, ' ').c_str(),
-            static_cast<std::uint8_t>(progress * 100)
-        );
-        return true;
-    };
+    llama_model_params mparams = llama_model_params_from_gpt_params(params);
+    llama_context_params cparams = llama_context_params_from_gpt_params(params);
 
     llama_backend_init();
 
-    auto mdl = llama_load_model_from_file(params.model.c_str(), mparams);
-    auto ctx = llama_new_context_with_model(mdl, cparams);
+    llama_model* mdl = llama_load_model_from_file(params.model.c_str(), mparams);
 
-    // set to embedding mode
-    llama_set_embeddings(ctx, true);
+    // create new context - set to embedding mode
+    llama_context* embd_ctx = llama_new_context_with_model(mdl, cparams);
+    llama_set_embeddings(embd_ctx, true);
 
-    // ### Embedding/Representation ### taken sample from here:
+    // create new context - default mode is causal
+    llama_context* causal_ctx = llama_new_context_with_model(mdl, cparams);
+
+    // ### Embedding/Representation ### samples taken from here:
     // https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
     {
         std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
@@ -148,18 +209,14 @@ int main(int argc, char* argv[])
             "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
         };
 
-        auto gritlm_instruction = [](const std::string& instruction) -> std::string {
-            return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
-        };
-
         // No need to add instruction for retrieval documents
-        auto d_rep = encode(ctx, documents, gritlm_instruction(""));
-        auto q_rep = encode(ctx, queries, gritlm_instruction(instruction));
+        std::vector<std::vector<float>> d_rep = encode(embd_ctx, documents, gritlm_instruction(""));
+        std::vector<std::vector<float>> q_rep = encode(embd_ctx, queries, gritlm_instruction(instruction));
 
-        auto cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-        auto cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-        auto cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-        auto cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+        float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
+        float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
+        float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
+        float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
 
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
@@ -167,7 +224,16 @@ int main(int argc, char* argv[])
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
     }
 
-    llama_free(ctx);
+    // ### Generation ###
+    // # GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
+    {
+        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
+        std::string response = generate(causal_ctx, prompt, true);
+    }
+
+    llama_free(embd_ctx);
+    llama_free(causal_ctx);
+
     llama_free_model(mdl);
     llama_backend_free();
 

From b54afce9f42652e59c81e978e7cd5f9861a7d4e9 Mon Sep 17 00:00:00 2001
From: Douglas Hanley <thesecretaryofwar@gmail.com>
Date: Sat, 9 Mar 2024 13:03:46 -0600
Subject: [PATCH 10/10] mostly style fixes; fix KQ_mask comment

---
 examples/gritlm/gritlm.cpp | 68 ++++++++++++++++++++------------------
 llama.cpp                  |  2 +-
 2 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 2f98d45f4..9b75d4f82 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -4,7 +4,9 @@
 #include <string>
 #include <vector>
 
-static float dot_product(const std::vector<float>& v1, const std::vector<float>& v2) {
+// #define GRIT_DEBUG
+
+static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
     float dot = 0.0f;
     for (uint64_t i = 0; i < v1.size(); ++i) {
         dot += v1[i] * v2[i];
@@ -12,22 +14,22 @@ static float dot_product(const std::vector<float>& v1, const std::vector<float>&
     return dot;
 }
 
-static float norm(const std::vector<float>& v) {
+static float norm(const std::vector<float> & v) {
     return std::sqrt(dot_product(v, v));
 }
 
-static float cosine_similarity(const std::vector<float>& v1, const std::vector<float>& v2) {
+static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
     return dot_product(v1, v2) / (norm(v1) * norm(v2));
 }
 
-static void normalize(const std::vector<float>& in, float* out) {
+static void normalize(const std::vector<float> & in, float * out) {
     float inorm = norm(in);
     for (uint64_t i = 0; i < in.size(); i++) {
         out[i] = in[i] / inorm;
     }
 }
 
-static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vector<std::string>& sentences, const std::string& instruction) {
+static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     auto result = std::vector<std::vector<float>>{};
 
     auto mdl = llama_get_model(ctx);
@@ -40,21 +42,21 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
         auto n_toks = (int32_t)inputs.size();
 
-        // testing with and without EOS - unexpected embeddings in both cases - GritLM seems to have EOS = ""
-        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L116
+        // GritLM seems to use an empty EOS string ("") in embedding mode, so no EOS token is appended here
+        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
         // inputs.push_back(llama_token_eos(mdl));
 
         // we want to ignore instruction tokens for mean pooling
         std::vector<llama_token> inputs_instruct = llama_tokenize(mdl, instruction, true, false);
         auto n_inst = (int32_t)inputs_instruct.size();
 
-        /*
+#ifdef GRIT_DEBUG
         // debug tokens - should be matching as referenced in the GritLM sample
         std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
             std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
         });
         std::printf("\n");
-        */
+#endif
 
         // add input to batch (this increments n_tokens)
         for (int32_t j = 0; j < n_toks; j++) {
@@ -75,7 +77,7 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
 
         // sum up all token embeddings
         for (int32_t k = n_inst; k < n_toks; k++) {
-            float* emb = llama_get_embeddings_ith(ctx, k);
+            float * emb = llama_get_embeddings_ith(ctx, k);
             for (uint64_t j = 0; j < n_embd; j++) {
                 emb_unorm[j] += emb[j];
             }
@@ -91,24 +93,24 @@ static std::vector<std::vector<float>> encode(llama_context* ctx, const std::vec
         normalize(emb_unorm, emb_norm.data());
         result.push_back(emb_norm);
 
-        /*
+#ifdef GRIT_DEBUG
         // print out emb_norm
         std::printf("embedding %ld: ", i);
         for (uint64_t j = 0; j < n_embd; j++) {
             std::printf("%.5f ", emb_norm[j]);
         }
         std::printf("\n\n");
-        */
+#endif
     }
 
     llama_batch_free(batch);
     return result;
 }
 
-static std::string aggregate_pieces(const std::vector<std::string>& pieces) {
+static std::string aggregate_pieces(const std::vector<std::string> & pieces) {
     // calculate total length required
     size_t length = 0;
-    for (const auto& str : pieces) {
+    for (const auto & str : pieces) {
         length += str.size();
     }
 
@@ -117,17 +119,18 @@ static std::string aggregate_pieces(const std::vector<std::string>& pieces) {
     result.reserve(length);
 
     // append pieces
-    for (const auto& str : pieces) {
+    for (const auto & str : pieces) {
         result += str;
     }
 
     return result;
 }
 
-static std::string generate(llama_context* ctx, const std::string& prompt, bool stream) {
+static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
     std::vector<std::string> pieces;
 
-    const llama_model* mdl = llama_get_model(ctx);
+    const llama_model * mdl = llama_get_model(ctx);
+    llama_token eos_token = llama_token_eos(mdl);
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -135,25 +138,24 @@ static std::string generate(llama_context* ctx, const std::string& prompt, bool
 
     while (true) {
         llama_batch_clear(bat);
-
-        for (auto i = 0; i < inputs.size(); i++)
+        for (auto i = 0; i < inputs.size(); i++) {
             llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == inputs.size() - 1);
-
+        }
         inputs.clear();
 
         llama_decode(ctx, bat);
-
         auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
 
         auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
-        for (auto token = 0; token < candidates.size(); token++)
+        for (auto token = 0; token < candidates.size(); token++) {
             candidates[token] = llama_token_data{ token, logits[token], 0.0f };
-
+        }
         auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
 
         llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
-        if (token == llama_token_eos(mdl))
+        if (token == eos_token) {
             break;
+        }
 
         std::string piece = llama_token_to_piece(ctx, token);
         if (stream) {
@@ -169,11 +171,11 @@ static std::string generate(llama_context* ctx, const std::string& prompt, bool
     return aggregate_pieces(pieces);
 }
 
-static std::string gritlm_instruction(const std::string& instruction) {
+static std::string gritlm_instruction(const std::string & instruction) {
     return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
 }
 
-int main(int argc, char* argv[])
+int main(int argc, char * argv[])
 {
     gpt_params params;
     if (!gpt_params_parse(argc, argv, params)) {
@@ -185,17 +187,17 @@ int main(int argc, char* argv[])
 
     llama_backend_init();
 
-    llama_model* mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+    llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
 
     // create new context - set to embedding mode
-    llama_context* embd_ctx = llama_new_context_with_model(mdl, cparams);
+    llama_context * embd_ctx = llama_new_context_with_model(mdl, cparams);
     llama_set_embeddings(embd_ctx, true);
 
     // create new context - default mode is causal
-    llama_context* causal_ctx = llama_new_context_with_model(mdl, cparams);
+    llama_context * causal_ctx = llama_new_context_with_model(mdl, cparams);
 
-    // ### Embedding/Representation ### samples taken from here:
-    // https://github.com/ContextualAI/gritlm?tab=readme-ov-file#basic
+    // samples taken from here: https://github.com/ContextualAI/gritlm#basic
+    // Embedding/Representation
     {
         std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
 
@@ -224,8 +226,8 @@ int main(int argc, char* argv[])
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
     }
 
-    // ### Generation ###
-    // # GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
+    // Generation
+    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {
         const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
         std::string response = generate(causal_ctx, prompt, true);
diff --git a/llama.cpp b/llama.cpp
index 79171c749..991e1e673 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8061,7 +8061,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
             }
         }
     } else {
-        // with causal attention, the mask needs to match the kv cache size
+        // for models using the kv cache, the mask needs to match the kv cache size
         const int64_t n_tokens = batch.n_tokens;
         const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens;