From 224e7d5b14cbabab7ae45c64db2cfde979c8455d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 2 Nov 2023 20:44:12 +0200
Subject: [PATCH 1/6] readme : add notice about #3912

---
 README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index b56ecaec7..9c9e36ad0 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@
 
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
 
-[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
-- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
 
 ----
 

From 51b2fc11f7f605fff49725a4540e9a6ef7b51b70 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 2 Nov 2023 15:40:31 -0400
Subject: [PATCH 2/6] cmake : fix relative path to git submodule index (#3915)

---
 common/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 0150114e3..ac594b2ca 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -11,7 +11,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
     if(NOT IS_DIRECTORY "${GIT_DIR}")
         file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
         string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
+        get_filename_component(GIT_DIR "${REAL_GIT_DIR}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/..") # gitdir: may be relative (submodule) or absolute (worktree); resolve against the repo root
     endif()
 
     set(GIT_INDEX "${GIT_DIR}/index")

From 629f917cd6b96ba1274c49a8aab163b1b189229d Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Thu, 2 Nov 2023 13:58:22 -0600
Subject: [PATCH 3/6] cuda : add ROCM aliases for CUDA pool stuff (#3918)

---
 ggml-cuda.cu | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 06c28f565..baf02df2b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -39,6 +39,10 @@
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceGetMemPool hipDeviceGetMemPool
+#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
+#define cudaMemPoolSetAttribute hipMemPoolSetAttribute
+#define cudaMemPool_t hipMemPool_t
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
@@ -48,6 +52,7 @@
 #define cudaEvent_t hipEvent_t
 #define cudaEventDestroy hipEventDestroy
 #define cudaFree hipFree
+#define cudaFreeAsync hipFreeAsync
 #define cudaFreeHost hipHostFree
 #define cudaGetDevice hipGetDevice
 #define cudaGetDeviceCount hipGetDeviceCount
@@ -55,6 +60,7 @@
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
 #define cudaMalloc hipMalloc
+#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpy2DAsync hipMemcpy2DAsync

From 3fdbe6b66b7b5c6ad3b2f245cbad1517c27ff776 Mon Sep 17 00:00:00 2001
From: cebtenzzre <cebtenzzre@gmail.com>
Date: Fri, 3 Nov 2023 02:31:58 -0400
Subject: [PATCH 4/6] llama : change yarn_ext_factor placeholder to -1 (#3922)

---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index bb60044b4..cc0211ceb 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7982,7 +7982,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
-        /*.yarn_ext_factor             =*/ NAN,
+        /*.yarn_ext_factor             =*/ -1.0f,
         /*.yarn_attn_factor            =*/ 1.0f,
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
@@ -8125,7 +8125,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
-    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
     }
 

From 05816027d649f977468fc804cdb54e99eac246d1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 3 Nov 2023 09:24:00 +0200
Subject: [PATCH 5/6] common : YAYF (yet another YARN fix) (#3925)

ggml-ci
---
 common/common.h | 44 ++++++++++++++++++++++----------------------
 llama.h         | 10 +++++-----
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/common/common.h b/common/common.h
index 72a49b890..9ad625633 100644
--- a/common/common.h
+++ b/common/common.h
@@ -43,29 +43,29 @@ extern char const *LLAMA_BUILD_TARGET;
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    uint32_t seed                           = -1;   // RNG seed
+    uint32_t seed                           = -1;    // RNG seed
     int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_threads_batch                 = -1;   // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_predict                       = -1;   // new tokens to predict
-    int32_t n_ctx                           = 512;  // context size
-    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
-    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel                      = 1;    // number of parallel sequences to decode
-    int32_t n_sequences                     = 1;    // number of sequences to decode
-    int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
-    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
-    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
-    float   rope_freq_base                  = 0.0f; // RoPE base frequency
-    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
-    float   yarn_ext_factor                 = NAN;  // YaRN extrapolation mix factor
-    float   yarn_attn_factor                = 1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast                  = 32.0f;// YaRN low correction dim
-    float   yarn_beta_slow                  = 1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx                   = 0;    // YaRN original context length
+    int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_predict                       = -1;    // new tokens to predict
+    int32_t n_ctx                           = 512;   // context size
+    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
+    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
+    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
+    int32_t n_sequences                     = 1;     // number of sequences to decode
+    int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
+    int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
+    float   rope_freq_base                  = 0.0f;  // RoPE base frequency
+    float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
+    float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
+    float   yarn_attn_factor                = 1.0f;  // YaRN magnitude scaling factor
+    float   yarn_beta_fast                  = 32.0f; // YaRN low correction dim
+    float   yarn_beta_slow                  = 1.0f;  // YaRN high correction dim
+    int32_t yarn_orig_ctx                   = 0;     // YaRN original context length
     int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
     // // sampling parameters
diff --git a/llama.h b/llama.h
index 3f1becd76..e8dc04bb5 100644
--- a/llama.h
+++ b/llama.h
@@ -175,11 +175,11 @@ extern "C" {
     };
 
     struct llama_context_params {
-        uint32_t seed;            // RNG seed, -1 for random
-        uint32_t n_ctx;           // text context, 0 = from model
-        uint32_t n_batch;         // prompt processing maximum batch size
-        uint32_t n_threads;       // number of threads to use for generation
-        uint32_t n_threads_batch; // number of threads to use for batch processing
+        uint32_t seed;              // RNG seed, -1 for random
+        uint32_t n_ctx;             // text context, 0 = from model
+        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_threads;         // number of threads to use for generation
+        uint32_t n_threads_batch;   // number of threads to use for batch processing
         int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054

From 8f961abdc4e134c83bf8c2ad618ab256b4cae0f9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 3 Nov 2023 09:41:17 +0200
Subject: [PATCH 6/6] speculative : change default p_accept to 0.5 + CLI args
 (#3919)

ggml-ci
---
 common/common.cpp                    | 14 ++++++++++++++
 common/common.h                      |  8 ++++++--
 examples/speculative/speculative.cpp |  8 +++++---
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index e938dee16..20cc4a081 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -403,6 +403,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_sequences = std::stoi(argv[i]);
+        } else if (arg == "--p-accept" || arg == "-pa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_accept = std::stof(argv[i]);
+        } else if (arg == "--p-split" || arg == "-ps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_split = std::stof(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -778,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf("  -pa N, --p-accept N   speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
+    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
diff --git a/common/common.h b/common/common.h
index 9ad625633..dd6b002eb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -44,6 +44,7 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed                           = -1;    // RNG seed
+
     int32_t n_threads                       = get_num_physical_cores();
     int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict                       = -1;    // new tokens to predict
@@ -54,6 +55,8 @@ struct gpt_params {
     int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel                      = 1;     // number of parallel sequences to decode
     int32_t n_sequences                     = 1;     // number of sequences to decode
+    float   p_accept                        = 0.5f;  // speculative decoding accept probability
+    float   p_split                         = 0.1f;  // speculative decoding split probability
     int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
@@ -66,7 +69,8 @@ struct gpt_params {
     float   yarn_beta_fast                  = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow                  = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx                   = 0;     // YaRN original context length
-    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+                                                                              //       pinging @cebtenzzre
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -90,7 +94,7 @@ struct gpt_params {
     int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                     //                                       (which is more convenient to use for plotting)
                                     //
-    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
 
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 798684f66..3a8e27811 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -37,9 +37,11 @@ int main(int argc, char ** argv) {
     // max number of parallel drafting sequences (i.e. tree branches)
     const int n_seq_dft = params.n_parallel;
 
-    // TODO: make this configurable
-    const float p_accept = 0.80f;
-    const float p_split  = 0.10f;
+    // probability threshold for accepting a token from the draft model
+    const float p_accept = params.p_accept;
+
+    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
+    const float p_split  = params.p_split;
 
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("speculative", "log"));