Merge branch 'master' into gg/llama-perf

ggml-ci
Georgi Gerganov 2024-09-10 11:38:03 +03:00
commit 6cce78c2ed
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
69 changed files with 4810 additions and 3270 deletions


@ -925,6 +925,7 @@ OBJ_LLAMA = \
OBJ_COMMON = \
common/common.o \
common/arg.o \
common/console.o \
common/ngram-cache.o \
common/sampling.o \
@ -1157,6 +1158,11 @@ common/common.o: \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/arg.o: \
common/arg.cpp \
common/arg.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common/sampling.o: \
common/sampling.cpp \
common/sampling.h \
@ -1448,7 +1454,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
./llama-gen-docs
libllava.a: examples/llava/llava.cpp \
examples/llava/llava.h \


@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- *add hot topics here*
- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
----
@ -163,6 +163,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*


@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
base64.hpp
common.h
common.cpp
arg.h
arg.cpp
sampling.h
sampling.cpp
console.h

common/arg.cpp Normal file

File diff suppressed because it is too large

common/arg.h Normal file

@ -0,0 +1,77 @@
#pragma once
#include "common.h"
#include <set>
#include <string>
#include <vector>
//
// CLI argument parsing
//
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
llama_arg & set_env(const char * env);
llama_arg & set_sparam();
bool in_example(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};
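// A hypothetical option definition built on this struct (a sketch; the real
// definitions live in common/arg.cpp, whose diff is suppressed above). A
// captureless lambda converts to the plain function pointer the constructor expects:
//
//     llama_arg opt(
//         {"-n", "--n-predict"}, "N",
//         "number of tokens to predict",
//         [](gpt_params & params, int value) { params.n_predict = value; }
//     );
//     opt.set_env("LLAMA_ARG_N_PREDICT");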
struct gpt_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
gpt_params & params;
std::vector<llama_arg> options;
void(*print_usage)(int, char **) = nullptr;
gpt_params_context(gpt_params & params) : params(params) {}
};
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
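// Minimal usage sketch, mirroring the call sites updated throughout this commit:
//
//     int main(int argc, char ** argv) {
//         gpt_params params;
//         if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
//             return 1;
//         }
//         // ... run the example using params ...
//     }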

File diff suppressed because it is too large


@ -4,20 +4,11 @@
#include "llama.h"
#include "sampling.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <cmath>
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <set>
#include <unordered_map>
#include <tuple>
#include <functional>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
// CPU utils
//
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
//
// CLI argument parsing
// Common params
//
enum llama_example {
@ -78,28 +78,72 @@ enum llama_example {
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_COUNT,
};
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,
DIMRE_METHOD_MEAN,
};
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
// sampler parameters
struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
};
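// Usage sketch (an illustration; assumes the gpt_sampler_init() companion in
// common/sampling.h consumes these parameters):
//
//     gpt_sampler_params sparams;
//     sparams.top_k    = 20;
//     sparams.temp     = 0.7f;
//     sparams.samplers = { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TEMPERATURE };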
struct gpt_params {
enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -143,23 +187,23 @@ struct gpt_params {
struct gpt_sampler_params sparams;
std::string model = ""; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_token = ""; // HF token
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string rpc_servers = ""; // comma separated list of RPC servers
std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@ -189,7 +233,6 @@ struct gpt_params {
bool kl_divergence = false; // compute KL divergence
std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
@ -204,7 +247,7 @@ struct gpt_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // no perf (TODO: add llama_arg)
bool no_perf = false; // disable performance metrics
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool logits_all = false; // return logits for all tokens in the batch
@ -212,7 +255,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
bool warmup = true; // warmup run
@ -222,7 +264,7 @@ struct gpt_params {
std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
// embedding
@ -238,15 +280,15 @@ struct gpt_params {
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true;
std::vector<std::string> api_keys;
std::string ssl_file_key = "";
std::string ssl_file_cert = "";
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT
bool endpoint_slots = true;
bool endpoint_metrics = false;
@ -301,92 +343,6 @@ struct gpt_params {
bool batched_bench_output_jsonl = false;
};
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
}
llama_arg & set_env(const char * env) {
help = help + "\n(env: " + env + ")";
this->env = env;
return *this;
}
bool in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
bool get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
char * value = std::getenv(env);
if (value) {
output = value;
return true;
}
return false;
}
bool has_value_from_env() const {
return env != nullptr && std::getenv(env);
}
std::string to_string();
};
// initialize list of options (arguments) that can be used by the current example
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
// optionally, we can provide "print_usage" to print example usage
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);


@ -2,6 +2,9 @@
#include "common.h"
#include <cmath>
#include <unordered_map>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
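// For intuition, a minimal sketch of such a fixed-capacity buffer (illustration
// only; the actual ring_buffer<T> declared below differs in detail):
//
//     template<typename T> struct fixed_ring {
//         explicit fixed_ring(size_t cap) : capacity(cap), data(cap) {}
//         void push_back(const T & v) {
//             if (size == capacity) {
//                 data[first] = v;                  // full: overwrite the oldest element
//                 first = (first + 1) % capacity;   // and advance the logical front
//             } else {
//                 data[(first + size) % capacity] = v;
//                 size++;
//             }
//         }
//         size_t capacity = 0, size = 0, first = 0;
//         std::vector<T> data;
//     };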
template<typename T>
@ -420,7 +423,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
std::unordered_map<char, gpt_sampler_type> sampler_name_map {
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },


@ -2,62 +2,11 @@
#include "llama.h"
#include "common.h"
#include <string>
#include <vector>
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// sampling parameters
struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
};
// gpt_sampler extends llama_sampler with additional functionality:
//
// - grammar support


@ -302,6 +302,8 @@ class Model:
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
)
)
or not new_name.endswith(".weight")


@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to
### Android
To read documentation for how to build on Android, [click here](./android.md)
### Arm CPU optimized mulmat kernels
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
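For example, a CMake configuration combining the flags above might look like this (a sketch; adjust the `-march` feature list to your CPU):

```bash
cmake -B build -DGGML_LLAMAFILE=OFF -DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve
cmake --build build --config Release
```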


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
return 1;
}


@ -140,8 +140,6 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
llama_sampler_accept(smpl, new_token_id)
// is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}
@ -172,8 +172,6 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
llama_sampler_accept(smpl, new_token_id);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}


@ -12,12 +12,9 @@
#include <cstdio>
#include <ctime>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
@ -144,8 +145,7 @@ int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include <fstream>
@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
gpt_params params;
auto options = gpt_params_parser_init(params, ex);
auto ctx_arg = gpt_params_parser_init(params, ex);
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
for (auto & opt : options) {
for (auto & opt : ctx_arg.options) {
file << "| `";
// args
for (const auto & arg : opt.args) {


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_decode(ctx, bat);
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
llama_sampler_accept(smpl, token);
if (token == eos_token) {
break;
@ -154,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}


@ -1,6 +1,7 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "sampling.h"
#include "llama.h"
#include <cassert>
@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
return 1;
}
@ -306,11 +306,6 @@ int main(int argc, char ** argv) {
LOG_TEE("\n\n");
LOG_TEE("\n##### Infill mode #####\n\n");
if (params.infill) {
printf("\n************\n");
printf("no need to specify '--infill', always running infill\n");
printf("************\n\n");
}
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {


@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
// sample the most likely token
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
llama_sampler_accept(sampler, new_token_id);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return nullptr;


@ -152,8 +152,6 @@ actor LlamaContext {
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
llama_sampler_accept(sampling, new_token_id)
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true


@ -1,11 +1,12 @@
#include "ggml.h"
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "base64.hpp"
#include "ggml.h"
#include <cstdio>
#include <cstdlib>
@ -278,8 +279,7 @@ int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}


@ -1,9 +1,11 @@
#include "ggml.h"
#include "arg.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <cstdlib>
@ -253,8 +255,7 @@ int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
return 1;
}


@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include <cstdio>
@ -36,8 +38,7 @@ struct ngram_container {
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@ -1,7 +1,8 @@
#include "ggml.h"
#include "llama.h"
#include "arg.h"
#include "common.h"
#include "ngram-cache.h"
#include "ggml.h"
#include "llama.h"
#include <cstdint>
#include <fstream>
@ -13,8 +14,7 @@
int main(int argc, char ** argv){
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
@ -40,4 +40,6 @@ int main(int argc, char ** argv){
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
return 0;
}


@ -1,8 +1,9 @@
#include "ggml.h"
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "log.h"
#include "ngram-cache.h"
#include "llama.h"
#include "ggml.h"
#include <cmath>
#include <cstdint>
@ -15,8 +16,7 @@
int main(int argc, char ** argv){
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}


@ -1,7 +1,9 @@
#include "arg.h"
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
#include "llama.h"
#include <cstdint>
#include <cstdio>
@ -12,8 +14,7 @@
int main(int argc, char ** argv){
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}


@ -1,6 +1,7 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "sampling.h"
#include "llama.h"
#include <cassert>
@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}


@ -1,7 +1,9 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include <cmath>
@ -100,8 +102,7 @@ int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
params.n_keep = 32;
params.i_pos = -1;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1;
}
@ -220,8 +220,6 @@ int main(int argc, char ** argv) {
{
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
llama_sampler_accept(smpl, new_token_id);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
LOG_TEE("\n");


@ -1,18 +1,19 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include <array>
#include <atomic>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <mutex>
#include <random>
#include <sstream>
#include <thread>
#include <mutex>
#include <atomic>
#include <vector>
#include <array>
#include <fstream>
#include <sstream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
params.n_ctx = 512;
params.logits_all = true;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
return 1;
}


@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
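For example, producing one of the interleaved variants follows the same pattern as any other type (a sketch, assuming the `llama-quantize` tool built by this repo):

```bash
./llama-quantize models/7B/ggml-model-f16.gguf models/7B/ggml-model-Q4_0_4_4.gguf Q4_0_4_4
```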
*(outdated)*
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
return 1;
}


@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following
```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
rpcb<-->|TCP|srva
rpcb<-->|TCP|srvb
rpcb<-.->|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
ggml[llama-cli]<-->rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```
@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DGGML_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
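For example, with CUDA as the local backend (a sketch; substitute the flags for your backend):

```bash
cmake -B build-rpc-cuda -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build build-rpc-cuda --config Release
```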
Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```
This way you can offload model layers to both local and remote devices.


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
params.prompt = "The quick brown fox";
params.sparams.seed = 1234;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
@ -74,8 +74,6 @@ int main(int argc, char ** argv) {
auto next_token = llama_sampler_sample(smpl, ctx, -1);
auto next_token_str = llama_token_to_piece(ctx, next_token);
llama_sampler_accept(smpl, next_token);
printf("%s", next_token_str.c_str());
result0 += next_token_str;
@ -132,8 +130,6 @@ int main(int argc, char ** argv) {
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
auto next_token_str = llama_token_to_piece(ctx2, next_token);
llama_sampler_accept(smpl2, next_token);
printf("%s", next_token_str.c_str());
result1 += next_token_str;
@ -222,8 +218,6 @@ int main(int argc, char ** argv) {
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
auto next_token_str = llama_token_to_piece(ctx3, next_token);
llama_sampler_accept(smpl3, next_token);
printf("%s", next_token_str.c_str());
result2 += next_token_str;


@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
| `--version` | show version and build info |
| `-v, --verbose` | print verbose information |
| `--verbosity N` | set specific verbosity level (default: 0) |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `--no-display-prompt` | don't print prompt at generation (default: false) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
| `--in-file FNAME` | an input file (repeat to specify multiple files) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--no-escape` | do not process escape sequences |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--penalize-nl` | penalize newline tokens (default: false) |
@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
| `-ns, --sequences N` | number of sequences to decode (default: 1) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
| `--timeout N` | server read/write timeout in seconds (default: 600) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
| `--log-format {text, json}` | log output format: json or text (default: json) |
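A typical invocation combining a few of the flags above (a sketch; the model path is a placeholder):

```bash
./llama-server -m models/7B/ggml-model-f16.gguf -c 4096 -ngl 99 -fa
```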


@ -1,6 +1,8 @@
#include "utils.hpp"
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
@ -613,7 +615,7 @@ struct server_context {
gpt_params params;
llama_batch batch;
llama_batch batch = {};
bool clean_kv_cache = true;
bool add_bos_token = true;
@ -2423,8 +2425,7 @@ int main(int argc, char ** argv) {
// own arguments required by this example
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
return 1;
}


@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}
@ -118,8 +118,6 @@ int main(int argc, char ** argv) {
{
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
llama_sampler_accept(smpl, new_token_id);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG_TEE("\n");


@ -1,11 +1,13 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <set>
#include <random>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@ -27,8 +29,7 @@ struct seq_draft {
int main(int argc, char ** argv) {
gpt_params params;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
if (!gpt_params_parse(argc, argv, params, options)) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1;
}


@ -681,8 +681,8 @@ extern "C" {
struct ggml_hash_set {
size_t size;
ggml_bitset_t * used;
struct ggml_tensor ** keys;
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
};
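// For example, the invariant documented above means iterating the members of a
// set looks like this (a sketch):
//
//     for (size_t i = 0; i < hs.size; ++i) {
//         if (ggml_bitset_get(hs.used, i)) {
//             struct ggml_tensor * t = hs.keys[i]; // t is present in the set
//         }
//     }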
// computation graph
@ -1272,7 +1272,7 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
@ -1282,19 +1282,19 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
size_t offset); // in bytes
GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
@ -1302,7 +1302,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@ -1310,7 +1310,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
size_t offset); // in bytes
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(


@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
default:
return true;
}


@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
PROJECT_NAME = "llama.cpp"
PROJECT_NAME = "ggml"
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
@ -44,7 +44,7 @@ PROJECT_NUMBER =
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
PROJECT_BRIEF = "llama inference engine"
PROJECT_BRIEF = "Tensor library for machine learning"
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55


@ -27,6 +27,7 @@
#include "ggml-cuda/rope.cuh"
#include "ggml-cuda/scale.cuh"
#include "ggml-cuda/softmax.cuh"
#include "ggml-cuda/sum.cuh"
#include "ggml-cuda/sumrows.cuh"
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_dup(ctx, dst);
break;
case GGML_OP_ADD:
case GGML_OP_ADD1: // TODO: more efficient implementation
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_NEG:
ggml_cuda_op_neg(ctx, dst);
break;
case GGML_UNARY_OP_GELU:
ggml_cuda_op_gelu(ctx, dst);
break;
@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst);
break;
case GGML_OP_SUM:
ggml_cuda_op_sum(ctx, dst);
break;
case GGML_OP_SUM_ROWS:
ggml_cuda_op_sum_rows(ctx, dst);
break;
@ -2544,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
@ -2748,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
@ -2877,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
@ -2887,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:


@ -1,6 +1,6 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include "sum.cuh"
#include <cmath>
#include <cstdint>
@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
}


@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
} \
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[1];
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];
@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
} \
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[1];
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];

ggml/src/ggml-cuda/sum.cu Normal file

@ -0,0 +1,43 @@
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
// For this reason CUB must be included BEFORE anything else.
#include <cub/cub.cuh>
using namespace cub;
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
#include "sumrows.cuh"
#include "sum.cuh"
#include <cstdint>
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
size_t tmp_size = 0;
DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
#else
// Use (inefficient) sum_rows implementation as a fallback.
// For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
sum_rows_f32_cuda(x, dst, ne, 1, stream);
GGML_UNUSED(pool);
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
}
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
const int64_t ne = ggml_nelements(src0);
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
}

ggml/src/ggml-cuda/sum.cuh Normal file

@ -0,0 +1,5 @@
#include "common.cuh"
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@ -1,5 +1,15 @@
#include "unary.cuh"
static __global__ void neg_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = -x[i];
}
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
dst[i] = cosf(x[i]);
}
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;

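Each elementwise op here follows the same recipe: a kernel that exits early when its global index i reaches k, plus a launcher that rounds the block count up with (k + BLOCK - 1) / BLOCK so the grid covers all k elements. For k = 1000 and a block size of 256 that is ceil(1000/256) = 4 blocks, i.e. 1024 threads, of which the last 24 return immediately. A sketch of the pattern for a hypothetical extra op, mirroring the code above:

#define CUDA_RECIP_BLOCK_SIZE 256

// hypothetical elementwise reciprocal, following the unary.cu pattern
static __global__ void recip_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) { // grid is rounded up, so trailing threads do nothing
        return;
    }
    dst[i] = 1.0f/x[i];
}

static void recip_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_RECIP_BLOCK_SIZE - 1) / CUDA_RECIP_BLOCK_SIZE; // ceil-div
    recip_f32<<<num_blocks, CUDA_RECIP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}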
View file

@ -1,5 +1,6 @@
#include "common.cuh"
#define CUDA_NEG_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
@ -12,6 +13,8 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return ctx->support_simdgroup_reduction;
case GGML_OP_NORM:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
return true;
case GGML_OP_IM2COL:
return op->src[0]->type == GGML_TYPE_F16;
case GGML_OP_POOL_1D:
case GGML_OP_POOL_2D:
return false;
@ -3038,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (status != MTLCommandBufferStatusCompleted) {
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
if (status == MTLCommandBufferStatusError) {
NSString * error_code = [command_buffer error].localizedDescription;
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
}
return GGML_STATUS_FAILED;

View file

@ -4003,13 +4003,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
if (ggml_sve_cnt_b == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);
const int vector_length = ggml_sve_cnt_b*8;
// VLA implementation: dispatch on the runtime SVE vector length
switch (vector_length) {
case 128:
{
// predicate for activating higher lanes for 4 float32 elements
const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
for (; ib + 1 < nb; ib += 2) {
const block_q4_0 * restrict x0 = &x[ib + 0];
const block_q4_0 * restrict x1 = &x[ib + 1];
@ -4021,8 +4026,54 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
// 4-bit -> 8-bit
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
// sub 8
const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
// load y
const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
// dot product
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
} break;
case 256:
{
// predicate for activating higher lanes for 16 int8 elements
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
// predicate for activating lower lanes for 16 int8 elements
const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
for (; ib + 1 < nb; ib += 2) {
const block_q4_0 * restrict x0 = &x[ib + 0];
const block_q4_0 * restrict x1 = &x[ib + 1];
const block_q8_0 * restrict y0 = &y[ib + 0];
const block_q8_0 * restrict y1 = &y[ib + 1];
// load x
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
// 4-bit -> 8-bit
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
// sub 8
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
@ -4033,12 +4084,60 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
// dot product
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
} break;
case 512:
{
// predicate for activating higher lanes for 32 int8 elements
const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
// predicate for activating higher lanes for 16 int8 elements
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
// predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
const svbool_t pl16 = svnot_b_z(ph32, ph16);
for (; ib + 1 < nb; ib += 2) {
const block_q4_0 * restrict x0 = &x[ib + 0];
const block_q4_0 * restrict x1 = &x[ib + 1];
const block_q8_0 * restrict y0 = &y[ib + 0];
const block_q8_0 * restrict y1 = &y[ib + 1];
// load x
const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
// 4-bit -> 8-bit
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
// sub 8
const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
// load y
const svint8_t qy0 = svld1_s8(ph32, y0->qs);
const svint8_t qy1 = svld1_s8(ph32, y1->qs);
// dot product
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
} break;
default:
assert(false && "Unsupported vector length");
break;
}
#elif defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -5488,10 +5587,50 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
float sumf = 0;
#if defined(__ARM_FEATURE_SVE)
if (ggml_sve_cnt_b == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f);
const int vector_length = ggml_sve_cnt_b*8;
// VLA implementation for SVE
switch (vector_length) {
case 128:
{
// predicate for activating lanes for 16 int8 elements
const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
// predicate for activating lanes for 4 float32 elements
const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
for (; ib + 1 < nb; ib += 2) {
const block_q8_0 * restrict x0 = &x[ib + 0];
const block_q8_0 * restrict x1 = &x[ib + 1];
const block_q8_0 * restrict y0 = &y[ib + 0];
const block_q8_0 * restrict y1 = &y[ib + 1];
// load x
const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
// load y
const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
} break;
case 256:
{
//printf("sve256");
for (; ib + 1 < nb; ib += 2) {
const block_q8_0 * restrict x0 = &x[ib + 0];
const block_q8_0 * restrict x1 = &x[ib + 1];
@ -5506,11 +5645,66 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
}
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
} break;
case 512:
{
// predicate for activating high 256 bit
const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
// predicate for activating low 256 bit
const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
// predicate for activating high lanes for 8 float32 elements
const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
// predicate for activating low lanes for 8 float32 elements
const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
svfloat32_t sumv00 = svdup_n_f32(0.0f);
for (; ib + 1 < nb; ib += 2) {
const block_q8_0 * restrict x0 = &x[ib + 0];
const block_q8_0 * restrict x1 = &x[ib + 1];
const block_q8_0 * restrict y0 = &y[ib + 0];
const block_q8_0 * restrict y1 = &y[ib + 1];
// load 32 int8_t from x0 into the first half of a 64-element vector and
// 32 int8_t from x1 into the second half, then add the two loads (inactive
// lanes read as zero) to merge them into a single vector
// load x
const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
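// note: pl32 activates lanes 32..63 and svld1_s8 reads base[lane] per active lane,
// so with sizeof(block_q8_0) == 34 the load at x0->qs + 2 picks up bytes 34..65,
// i.e. exactly x1->qs[0..31], placing x1's quants in the upper half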
qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
// load y
const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
// scale creation
const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
// duplicate deq1 in first half of vector and deq2 in second half of vector
const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
}
sumf = svaddv_f32(svptrue_b32(), sumv00);
break;
}
default:
assert(false && "Unsupported vector length");
break;
}
#elif defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);

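The q4_0 paths above all unpack nibbles with one predicated AND plus one predicated shift: svld1rq replicates the 16 packed bytes across the vector, the AND with 0x0F keeps the low nibbles in the first group of lanes, the logical shift right by 4 produces the high nibbles in the second group, and the subtract of 8 recenters q4_0's unsigned codes to [-8, 7]. A scalar model of what one unpacked 32-lane vector holds in the 256-bit case, assuming the standard q4_0 layout of 16 packed bytes per 32 weights (illustrative, not from the commit):

#include <cstdint>

// Scalar equivalent of the predicated SVE unpack for one q4_0 block.
void unpack_q4_0_block(const uint8_t packed[16], int8_t out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = (int8_t)((packed[i] & 0x0F) - 8); // low nibbles  -> lanes 0..15
        out[i + 16] = (int8_t)((packed[i] >>   4) - 8); // high nibbles -> lanes 16..31
    }
}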
View file

@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
}
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
return nullptr;
result->buffer = nullptr;
}
if (result->buffer) {
// require that the tensor data does not go beyond the buffer end
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
}
result->op = (ggml_op) tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ NULL,

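The two new asserts form a standard overflow-safe containment check: the first rejects wraparound in tensor->data + tensor_size, the second requires [data, data + size) to lie inside [base, base + buffer_size). The same predicate as a standalone helper (hypothetical, not part of this commit):

#include <cstdint>

// true iff [ptr, ptr + size) fits in [base, base + buf_size) without
// unsigned wraparound in ptr + size
static bool range_in_buffer(uint64_t ptr, uint64_t size, uint64_t base, uint64_t buf_size) {
    if (ptr + size < ptr) { // overflow, mirrors the first GGML_ASSERT
        return false;
    }
    return ptr >= base && ptr + size <= base + buf_size;
}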
View file

@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
SYCL_CHECK(
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
look_ahead_size, *qptr)));
if (!ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
return nullptr;
}
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
if (!dev_ptr) {
fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
return nullptr;
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
*/
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
size, *stream)));
if (!buf) {
char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
throw std::runtime_error(err_buf);
}
// set padding to 0 to avoid possible NaN values
if (size > original_size) {
/*

View file

@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
if (fence) {
ctx->q->queue.submit({}, fence);
}
return;
}
VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun);
}
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}, dryrun);
}
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}, dryrun);
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
}
}
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
// Returns true if the node has enqueued work into the queue, false otherwise
// If submit is true, all operations queued so far are submitted to Vulkan, overlapping command-list creation with GPU execution.
static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
if (ggml_is_empty(node) || extra == nullptr) {
return;
return false;
}
VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
return;
return false;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(node)) {
case GGML_UNARY_OP_SILU:
@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_UNARY_OP_TANH:
break;
default:
return;
return false;
}
break;
case GGML_OP_REPEAT:
@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ABORT("fatal error");
return;
return false;
}
vk_context compute_ctx;
@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
case GGML_OP_SIN:
ggml_vk_sin(ctx, compute_ctx, src0, node);
ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_COS:
ggml_vk_cos(ctx, compute_ctx, src0, node);
ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_CLAMP:
@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
break;
default:
return;
return false;
}
break;
case GGML_OP_DIAG_MASK_INF:
@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break;
default:
return;
return false;
}
if (dryrun) {
return;
return false;
}
ctx->tensor_ctxs[node_idx] = compute_ctx;
@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
last_node = true;
#endif
if (last_node) {
if (submit || last_node) {
ggml_vk_ctx_end(compute_ctx);
compute_ctx->exit_tensor_idx = node_idx;
ctx->compute_ctx.reset();
// TODO probably it'd be better to pass an exit_node flag to ggml_vk_compute_forward
if (last_node) {
compute_ctx->exit_tensor_idx = node_idx_begin;
}
else {
compute_ctx->exit_tensor_idx = -1;
}
ctx->compute_ctx.reset();
bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
}
else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
}
}
}
return true;
}
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
ggml_tensor_extra_gpu * extra = nullptr;
switch (tensor->op) {
@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_0(tensor);
#endif
vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
#ifdef GGML_VULKAN_PERF
std::chrono::steady_clock::time_point start;
#endif // GGML_VULKAN_PERF
// always wait for the GPU work to be done for the last submit
if (tensor_idx == subctx->exit_tensor_idx) {
use_fence = true;
}
// Only run if ctx hasn't been submitted yet
if (!subctx->seqs.empty()) {
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_0(tensor);
use_fence = true;
#endif
// Do staging buffer copies
for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
#ifdef GGML_VULKAN_PERF
start = std::chrono::steady_clock::now();
#endif // GGML_VULKAN_PERF
ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
ggml_vk_submit(subctx, ctx->fence);
if (use_fence) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
ctx->device->device.resetFences({ ctx->fence });
}
#ifdef GGML_VULKAN_CHECK_RESULTS
ggml_vk_check_results_1(tensor);
#endif
}
if (tensor_idx == subctx->exit_tensor_idx) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
#ifdef GGML_VULKAN_PERF
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
ctx->device->perf_logger->log_timing(tensor, duration.count());
#endif // GGML_VULKAN_PERF
ctx->device->device.resetFences({ ctx->fence });
// Do staging buffer copies
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
}
ggml_vk_preallocate_buffers(ctx);
ggml_pipeline_allocate_descriptor_sets(ctx->device);
@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
// Reserve tensor context space for all nodes
ctx->tensor_ctxs.resize(cgraph->n_nodes);
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch
// submit work every submit_count nodes to overlap CPU command-buffer generation with GPU execution
constexpr int submit_count = 100;
int submitted_nodes = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
if (first_node_in_batch) {
submit_node_idx = i;
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
bool submit = (submitted_nodes >= submit_count) || (i == last_node);
if (ggml_vk_is_empty(node)) {
continue;
}
bool ok = ggml_vk_compute_forward(ctx, node, i);
if (!ok) {
if (node->op == GGML_OP_UNARY) {
std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
} else {
std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
}
}
#ifdef GGML_VULKAN_CHECK_RESULTS
else {
ggml_vk_check_results_1(node);
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
if (enqueued) {
++submitted_nodes;
#ifndef GGML_VULKAN_CHECK_RESULTS
if (first_node_in_batch) {
first_node_in_batch = false;
}
#endif
GGML_ASSERT(ok);
}
if (submit) {
first_node_in_batch = true;
submitted_nodes = 0;
}
}
#ifdef GGML_VULKAN_PERF
@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
return false;
}
} break;
case GGML_OP_CONT:
case GGML_OP_CPY:
case GGML_OP_DUP:
{
@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ARGSORT:

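The reworked compute loop batches command-buffer recording: nodes are recorded until submit_count of them have been enqueued (or the last node is reached), then the batch is submitted so the GPU executes it while the CPU records the next one, and only the final submit waits on the fence. A runnable toy model of that overlap schedule (the counter and the printf stand in for real recording and submission; nothing here is Vulkan API):

#include <cstdio>

int main() {
    const int n_nodes = 250, submit_count = 100;
    int batched = 0;
    for (int i = 0; i < n_nodes; ++i) {
        ++batched; // "record" node i into the current command buffer
        const bool last = (i + 1 == n_nodes);
        if (batched >= submit_count || last) {
            // only the final submission waits on the fence
            std::printf("submit %d nodes, wait=%d\n", batched, last ? 1 : 0);
            batched = 0;
        }
    }
    return 0;
}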
View file

@ -3847,7 +3847,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed, ctx->mem_size);
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
@ -5267,6 +5267,7 @@ struct ggml_tensor * ggml_concat(
bool is_node = false;
if (a->grad || b->grad) {
GGML_ABORT("fatal error"); // TODO: implement
is_node = true;
}
@ -5388,6 +5389,7 @@ struct ggml_tensor * ggml_leaky_relu(
bool is_node = false;
if (!inplace && (a->grad)) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}
@ -5826,6 +5828,7 @@ static struct ggml_tensor * ggml_set_impl(
// make a view of the destination
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
GGML_ASSERT(offset < (size_t)(1 << 30));
int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
ggml_set_op_params(result, params, sizeof(params));
@ -6783,14 +6786,12 @@ struct ggml_tensor * ggml_rope_back(
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
GGML_ASSERT(c == NULL && "freq factors not implemented yet");
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
bool is_node = false;
if (a->grad) {
is_node = false; // TODO: implement backward
GGML_ASSERT(false && "backwards pass not implemented");
is_node = false;
}
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
@ -6808,6 +6809,7 @@ struct ggml_tensor * ggml_rope_back(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
@ -7361,6 +7363,11 @@ struct ggml_tensor * ggml_argsort(
enum ggml_sort_order order) {
bool is_node = false;
if (a->grad) {
GGML_ABORT("fatal error"); // TODO: not implemented
is_node = true;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
ggml_set_op_params_i32(result, 0, (int32_t) order);
@ -8322,8 +8329,7 @@ static void ggml_compute_forward_dup_same_cont(
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type);
const size_t nb00 = src0->nb[0];
const size_t nb0 = dst->nb[0];
const size_t nb0 = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
@ -8337,8 +8343,8 @@ static void ggml_compute_forward_dup_same_cont(
if (ie0 < ie1) {
memcpy(
((char *) dst->data + ie0*nb0),
((char *) src0->data + ie0*nb00),
(ie1 - ie0) * ggml_type_size(src0->type));
((char *) src0->data + ie0*nb0),
(ie1 - ie0) * nb0);
}
}
@ -8355,11 +8361,6 @@ static void ggml_compute_forward_dup_f16(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@ -8624,11 +8625,6 @@ static void ggml_compute_forward_dup_bf16(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@ -8980,11 +8976,6 @@ static void ggml_compute_forward_dup_f32(
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
// parallelize by rows
const int nr = ne01;
// number of rows per thread
@ -9294,13 +9285,13 @@ static void ggml_compute_forward_dup_bytes(
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(src0->type == dst->type);
GGML_TENSOR_UNARY_OP_LOCALS;
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
ggml_compute_forward_dup_same_cont(params, dst);
return;
}
GGML_TENSOR_UNARY_OP_LOCALS;
const size_t type_size = ggml_type_size(src0->type);
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
@ -10969,9 +10960,6 @@ static void ggml_compute_forward_sum_f32(
return;
}
assert(ggml_is_scalar(dst));
assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));
@ -18372,14 +18360,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
if (src0->grad || src1->grad) {
GGML_ASSERT(src0->type == tensor->type);
GGML_ASSERT(tensor->grad->type == tensor->type);
GGML_ASSERT(tensor->grad->type == src1->grad->type);
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
tensor_grad_view = ggml_view_4d(ctx,
tensor->grad,
src1->grad->ne[0],
src1->grad->ne[1],
src1->grad->ne[2],
src1->grad->ne[3],
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
nb1, nb2, nb3, offset);
}
@ -19146,7 +19130,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
}
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
if (src->visited_hash_set.keys[i]) {
// copy all hashset keys (tensors) that are in use
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
}
}

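The hash-set copy fix consults the used bitset instead of testing keys[i] for null, so stale slots in the open-addressing table are skipped. ggml's bitset packs one bit per index into 32-bit words (per ggml-impl.h, to my reading), so index i maps to word i >> 5 and bit i & 31; a self-contained sketch of that test:

#include <cstddef>
#include <cstdint>

// packed-bitset membership test: index i lives in word i >> 5, bit i & 31
static bool bitset_get(const uint32_t * bitset, size_t i) {
    return (bitset[i >> 5] & (1u << (i & 31))) != 0;
}
// e.g. i = 70 -> word 2 (70 >> 5 == 2), bit 6 (70 & 31 == 6)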
View file

@ -1127,15 +1127,16 @@ extern "C" {
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// Shorthand for:
/// @details Sample and accept a token from the idx-th output of the last evaluation
//
// Shorthand for:
// const auto * logits = llama_get_logits_ith(ctx, idx);
// llama_token_data_array cur_p = { ... init from logits ... };
// llama_sampler_apply(smpl, &cur_p);
// return cur_p.data[cur_p.selected].id;
//
// At this point, this is mostly a convenience function.
//
// auto token = cur_p.data[cur_p.selected].id;
// llama_sampler_accept(smpl, token);
// return token;
// Returns the sampled token
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
// TODO: extend in the future

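In caller code the revised contract means one call both samples and records the token; a hedged fragment showing the call pattern (assumes an already-built context, sampler chain, and batch; setup and error handling omitted):

// smpl: llama_sampler chain, ctx: llama_context, batch: filled llama_batch
if (llama_decode(ctx, batch) == 0) {
    // idx = -1 -> sample from the logits of the last output in the batch
    llama_token next = llama_sampler_sample(smpl, ctx, -1);
    // per the comment above, next has already been accepted by the sampler
}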
View file

@ -5,7 +5,7 @@
# Usage:
#
# $ cd /path/to/llama.cpp
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
# $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... -C 3
#
set -e
@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
echo "Syncing ggml changes since commit $lc"
to_skip=""
if [ "$1" == "-skip" ]; then
to_skip=$2
fi
# number of context lines for git patches
ctx="8"
while [ "$1" != "" ]; do
case $1 in
-skip )
shift
to_skip=$1
;;
-C )
shift
ctx=$1
;;
esac
shift
done
cd $SRC_GGML
@ -52,7 +66,7 @@ while read c; do
fi
fi
git format-patch -k $c~1..$c --stdout -- \
git format-patch -U${ctx} -k $c~1..$c --stdout -- \
CMakeLists.txt \
src/CMakeLists.txt \
cmake/FindSIMD.cmake \
@ -191,7 +205,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
> ggml-src.patch.tmp
mv ggml-src.patch.tmp ggml-src.patch
git am ggml-src.patch
git am -C${ctx} ggml-src.patch
rm -v $SRC_LLAMA/ggml-src.patch
fi

View file

@ -1 +1 @@
28b7633d733bbeef0026570fbc61c79c5e9aa5ae
10e83a412717c20d57ba19f025248e18e43addf3

View file

@ -101,6 +101,10 @@ struct ring_buffer {
}
void push_back(const T & value) {
if (capacity == 0) {
throw std::runtime_error("ring buffer: capacity is zero");
}
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;

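push_back on a full buffer advances first, overwriting the oldest element; the new guard turns what would otherwise be a division by zero in (first + 1) % capacity into an explicit exception. A minimal self-contained ring buffer with the same index arithmetic (illustrative, not the llama.cpp implementation):

#include <cstddef>
#include <stdexcept>
#include <vector>

template <typename T>
struct ring {
    std::vector<T> data;
    size_t first = 0, sz = 0, capacity;

    explicit ring(size_t cap) : data(cap), capacity(cap) {}

    void push_back(const T & value) {
        if (capacity == 0) {
            throw std::runtime_error("ring buffer: capacity is zero");
        }
        if (sz == capacity) {
            first = (first + 1) % capacity; // full: logically drop the oldest
        } else {
            sz++;
        }
        data[(first + sz - 1) % capacity] = value; // write the newest slot
    }
};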
File diff suppressed because it is too large

View file

@ -23,16 +23,6 @@ struct llama_sampler_chain {
mutable int32_t n_sample;
};
using llama_token_cnt = std::unordered_map<llama_token, int>;
// TODO: tmp exposed until test-sampling is fixed
void llama_sampler_penalties_impl(
llama_token_data_array * cur_p,
const llama_token_cnt & token_count,
float penalty_repeat,
float penalty_freq,
float penalty_present);
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,

View file

@ -9260,7 +9260,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
// FIXME: zero-out NANs?
states = ggml_mul(ctx, states, state_mask);
// copy states which won't be changed further (between n_seqs and n_rs)
// copy states which won't be changed further (between n_seqs and n_kv)
ggml_build_forward_expand(graph,
ggml_cpy(ctx,
ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
@ -16079,7 +16079,7 @@ static int llama_decode_internal(
}
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
@ -16378,7 +16378,7 @@ static int llama_encode_internal(
}
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
@ -17532,6 +17532,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
quantize &= name.find("time_mix_first.weight") == std::string::npos;
quantize &= name.find("time_mix_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;

View file

@ -1,19 +1,43 @@
#include "arg.h"
#include "common.h"
#include <string>
#include <vector>
#include <sstream>
#include <unordered_set>
#undef NDEBUG
#include <cassert>
#include "common.h"
int main(void) {
gpt_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
try {
gpt_params_parser_init(params, (enum llama_example)ex);
auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
std::unordered_set<std::string> seen_args;
std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : ctx_arg.options) {
// check for args duplications
for (const auto & arg : opt.args) {
if (seen_args.find(arg) == seen_args.end()) {
seen_args.insert(arg);
} else {
fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
exit(1);
}
}
// check for env var duplications
if (opt.env) {
if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
seen_env_vars.insert(opt.env);
} else {
fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s\n", opt.env);
exit(1);
}
}
}
} catch (std::exception & e) {
printf("%s\n", e.what());
assert(false);
@ -29,40 +53,51 @@ int main(void) {
};
std::vector<std::string> argv;
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
printf("test-arg-parser: test invalid usage\n\n");
// missing value
argv = {"binary_name", "-m"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// wrong value (int)
argv = {"binary_name", "-ngl", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// wrong value (enum)
argv = {"binary_name", "-sm", "hello"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
argv = {"binary_name", "--draft", "123"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
printf("test-arg-parser: test valid usage\n\n");
argv = {"binary_name", "-m", "model_file.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "model_file.gguf");
argv = {"binary_name", "-t", "1234"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.cpuparams.n_threads == 1234);
argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.verbosity == 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "abc.gguf");
assert(params.n_predict == 6789);
assert(params.n_batch == 9090);
// --draft cannot be used outside llama-speculative
argv = {"binary_name", "--draft", "123"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
assert(params.n_draft == 123);
// skip this part on windows, because setenv is not supported
#ifdef _WIN32
printf("test-arg-parser: skip on windows build\n");
@ -71,12 +106,12 @@ int main(void) {
setenv("LLAMA_ARG_THREADS", "blah", true);
argv = {"binary_name"};
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "blah.gguf");
assert(params.cpuparams.n_threads == 1010);
@ -86,7 +121,7 @@ int main(void) {
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name", "-m", "overwritten.gguf"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "overwritten.gguf");
assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32

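The duplicate checks above use find-then-insert; the same test can lean on the pair returned by insert, whose .second member is false when the key was already present. A compact runnable equivalent (sketch, not the committed test):

#include <cstdio>
#include <string>
#include <unordered_set>

int main() {
    std::unordered_set<std::string> seen;
    const char * args[] = { "-m", "--model", "-m" };
    for (const char * a : args) {
        if (!seen.insert(a).second) { // insert() reports duplicates directly
            std::fprintf(stderr, "duplicate argument: %s\n", a);
            return 1;
        }
    }
    return 0;
}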
File diff suppressed because it is too large

View file

@ -148,15 +148,17 @@ static void test_penalties(
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_cnt token_count;
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
for (size_t i = 0; i < last_tokens.size(); i++) {
token_count[last_tokens[i]]++;
llama_sampler_accept(sampler, last_tokens[i]);
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
APPLY(sampler, &cur_p);
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
@ -243,7 +245,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
}
}
printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
}
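For reference, the penalties this test exercises adjust each logit from the token's occurrence count in the recent window: a positive logit is divided by repeat_penalty and a negative one multiplied by it, then count * alpha_frequency and, if the token occurred at all, alpha_presence are subtracted. A scalar sketch of that update as I read the sampler's behavior (not the committed code):

// apply repeat/frequency/presence penalties to one logit, given how many
// times the token appeared in the penalty window
static float penalize(float logit, int count, float repeat, float freq, float present) {
    if (count > 0) {
        logit = logit > 0.0f ? logit / repeat : logit * repeat; // repeat penalty
        logit -= (float) count * freq;                          // frequency penalty
        logit -= present;                                       // presence penalty
    }
    return logit;
}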