Merge branch 'ggerganov:master' into avx_optimizations

commit d635c75b85
70 changed files with 4901 additions and 3299 deletions

Makefile (7 changes)
@@ -925,6 +925,7 @@ OBJ_LLAMA = \
 
 OBJ_COMMON = \
     common/common.o \
+    common/arg.o \
     common/console.o \
     common/ngram-cache.o \
     common/sampling.o \

@@ -1157,6 +1158,11 @@ common/common.o: \
     include/llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 
+common/arg.o: \
+    common/arg.cpp \
+    common/arg.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
 common/sampling.o: \
     common/sampling.cpp \
     common/sampling.h \

@@ -1448,7 +1454,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
     $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-    ./llama-gen-docs
 
 libllava.a: examples/llava/llava.cpp \
     examples/llava/llava.h \
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- *add hot topics here*
+- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----
 

@@ -163,6 +163,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 
@@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
     base64.hpp
     common.h
     common.cpp
+    arg.h
+    arg.cpp
     sampling.h
     sampling.cpp
     console.h
common/arg.cpp (new file, 1987 lines)
File diff suppressed because it is too large.
common/arg.h (new file, 77 lines)

@@ -0,0 +1,77 @@
+#pragma once
+
+#include "common.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+//
+// CLI argument parsing
+//
+
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env = nullptr;
+    std::string help;
+    bool is_sparam = false; // is current arg a sampling param?
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    llama_arg & set_env(const char * env);
+    llama_arg & set_sparam();
+    bool in_example(enum llama_example ex);
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
+};
+
+struct gpt_params_context {
+    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    gpt_params & params;
+    std::vector<llama_arg> options;
+    void(*print_usage)(int, char **) = nullptr;
+    gpt_params_context(gpt_params & params) : params(params) {}
+};
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+// function to be used by test-arg-parser
+gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
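For orientation, here is a minimal sketch (not part of this commit) of how a caller adopts the parsing entry point declared above. `gpt_params` and `LLAMA_EXAMPLE_COMMON` come from `common.h`; `print_usage` is the caller's own callback. The call shape mirrors the example changes later in this diff.

```cpp
// Minimal usage sketch for the new common/arg.h API (illustrative only).
#include "arg.h"
#include "common.h"

#include <cstdio>

static void print_usage(int /*argc*/, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // on an invalid value, usage for the offending argument is printed and false is returned
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    // params is now populated from the CLI (and from any registered environment variables)
    return 0;
}
```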
common/common.cpp (1894 changes)
File diff suppressed because it is too large.
common/common.h (215 changes)

@@ -4,20 +4,11 @@
 #include "llama.h"
 
-#include "sampling.h"
-
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <set>
-#include <unordered_map>
-#include <tuple>
-#include <functional>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-// CLI argument parsing
+// Common params
 //
 
 enum llama_example {
@@ -78,28 +78,71 @@ enum llama_example {
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
     LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE = 0,
+    GPT_SAMPLER_TYPE_TOP_K = 1,
+    GPT_SAMPLER_TYPE_TOP_P = 2,
+    GPT_SAMPLER_TYPE_MIN_P = 3,
+    GPT_SAMPLER_TYPE_TFS_Z = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-struct cpu_params {
-    int n_threads = -1;
-    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-    bool mask_valid = false; // Default: any CPU
-    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-    bool strict_cpu = false; // Use strict CPU placement
-    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
 };
 
 struct gpt_params {
-    enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
-
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -143,23 +186,23 @@ struct gpt_params {
 
     struct gpt_sampler_params sparams;
 
-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
-    std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_token = ""; // HF token
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
-    std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::string logdir = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -189,7 +232,6 @@ struct gpt_params {
 
     bool kl_divergence = false; // compute KL divergence
 
-    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output

@@ -211,7 +253,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run

@@ -221,7 +262,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

@@ -237,15 +278,15 @@ struct gpt_params {
     int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
     bool endpoint_slots = true;
     bool endpoint_metrics = false;
@@ -300,92 +341,6 @@ struct gpt_params {
     bool batched_bench_output_jsonl = false;
 };
 
-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env = nullptr;
-    std::string help;
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
-        this->examples = std::move(examples);
-        return *this;
-    }
-
-    llama_arg & set_env(const char * env) {
-        help = help + "\n(env: " + env + ")";
-        this->env = env;
-        return *this;
-    }
-
-    bool in_example(enum llama_example ex) {
-        return examples.find(ex) != examples.end();
-    }
-
-    bool get_value_from_env(std::string & output) const {
-        if (env == nullptr) return false;
-        char * value = std::getenv(env);
-        if (value) {
-            output = value;
-            return true;
-        }
-        return false;
-    }
-
-    bool has_value_from_env() const {
-        return env != nullptr && std::getenv(env);
-    }
-
-    std::string to_string();
-};
-
-// initialize list of options (arguments) that can be used by the current example
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
-// optionally, we can provide "print_usage" to print example usage
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-
-// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
-void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
-
 std::string gpt_params_get_system_info(const gpt_params & params);
 
 bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
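Since the sampling defaults now live in `common.h` as `gpt_params::sparams`, here is a rough sketch (not from this commit) of how they can be overridden before a sampler is constructed. `gpt_sampler_init` is assumed to be the constructor declared in `common/sampling.h`, as the example changes further down suggest; only fields that appear in the diff above are touched.

```cpp
// Illustrative sketch: overriding gpt_sampler_params defaults from common.h.
#include "common.h"
#include "sampling.h" // assumed to declare gpt_sampler_init()

static struct gpt_sampler * make_sampler(llama_model * model, gpt_params & params) {
    gpt_sampler_params & sparams = params.sparams;

    sparams.seed  = 1234;  // LLAMA_DEFAULT_SEED by default
    sparams.top_k = 50;    // <= 0 would mean "use vocab size"
    sparams.temp  = 0.7f;  // <= 0.0 samples greedily

    // restrict the sampler chain to top-k followed by temperature
    sparams.samplers = { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TEMPERATURE };

    // the caller owns the returned sampler and frees it with the matching teardown call
    return gpt_sampler_init(model, sparams);
}
```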
@@ -2,6 +2,9 @@
 
 #include "common.h"
 
+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>

@@ -307,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {

@@ -420,7 +427,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }
 
 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
@@ -2,61 +2,11 @@
 
 #include "llama.h"
 
+#include "common.h"
+
 #include <string>
 #include <vector>
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE = 0,
-    GPT_SAMPLER_TYPE_TOP_K = 1,
-    GPT_SAMPLER_TYPE_TOP_P = 2,
-    GPT_SAMPLER_TYPE_MIN_P = 3,
-    GPT_SAMPLER_TYPE_TFS_Z = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev = 64; // number of previous tokens to remember
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float min_p = 0.05f; // 0.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typ_p = 1.00f; // typical_p, 1.0 = disabled
-    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float dynatemp_range = 0.00f; // 0.0 = disabled
-    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.00f; // 1.0 = disabled
-    float penalty_freq = 0.00f; // 0.0 = disabled
-    float penalty_present = 0.00f; // 0.0 = disabled
-    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = false; // consider newlines as a repeatable token
-    bool ignore_eos = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support

@@ -110,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens
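To show how the newly exposed seed accessor fits into a generation step, here is an illustrative sketch (not part of this commit). It only uses declarations visible in the hunks above; the seed logging mirrors the updated `main` and `infill` examples below. Separately, the example hunks that follow drop their explicit `llama_sampler_accept` calls right after `llama_sampler_sample`, which suggests the low-level sample call now handles acceptance itself.

```cpp
// Illustrative sketch: querying the seed and sampling with the gpt_sampler declared above.
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <cstdio>

static void generate_n(struct gpt_sampler * smpl, llama_context * ctx, int n_predict) {
    // log the effective seed once, as the updated examples do
    fprintf(stderr, "sampling seed: %u\n", gpt_sampler_get_seed(smpl));

    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last evaluated token (idx = -1)
        const llama_token id = gpt_sampler_sample(smpl, ctx, /*idx =*/ -1);
        (void) id; // decode/accept/print as the calling example requires
    }
}
```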
@@ -302,6 +302,8 @@ class Model:
                        gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                        gguf.MODEL_TENSOR.TIME_MIX_W1,
                        gguf.MODEL_TENSOR.TIME_MIX_W2,
+                       gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                       gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                     )
                 )
                 or not new_name.endswith(".weight")
@@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to
 ### Android
 
 To read documentation for how to build on Android, [click here](./android.md)
+
+### Arm CPU optimized mulmat kernels
+
+Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }
 
@@ -140,8 +140,6 @@ while n_cur <= n_len {
 
         let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
-        llama_sampler_accept(smpl, new_token_id)
-
         // is it an end of stream? -> mark the stream as finished
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             i_batch[i] = -1
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 

@@ -172,8 +172,6 @@ int main(int argc, char ** argv) {
 
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"

@@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
 
@@ -12,12 +12,9 @@
 
 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
 
 #define DEBUG_POS 5
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 

@@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"

@@ -144,8 +145,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"

@@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 
 #include <fstream>

@@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
     gpt_params params;
-    auto options = gpt_params_parser_init(params, ex);
+    auto ctx_arg = gpt_params_parser_init(params, ex);
 
     file << "| Argument | Explanation |\n";
     file << "| -------- | ----------- |\n";
-    for (auto & opt : options) {
+    for (auto & opt : ctx_arg.options) {
         file << "| `";
         // args
         for (const auto & arg : opt.args) {
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
         llama_decode(ctx, bat);
 
         llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
-        llama_sampler_accept(smpl, token);
 
         if (token == eos_token) {
             break;

@@ -154,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }
 
@@ -1,6 +1,7 @@
+#include "arg.h"
 #include "common.h"
 
 #include "console.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cassert>

@@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }
 

@@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -301,16 +299,14 @@ int main(int argc, char ** argv) {
             LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
+    smpl = gpt_sampler_init(model, sparams);
+
+    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
     LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 
     LOG_TEE("\n##### Infill mode #####\n\n");
-    if (params.infill) {
-        printf("\n************\n");
-        printf("no need to specify '--infill', always running infill\n");
-        printf("************\n\n");
-    }
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {

@@ -345,8 +341,6 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    smpl = gpt_sampler_init(model, sparams);
-
     while (n_remain != 0 || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     // sample the most likely token
     const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
-    llama_sampler_accept(sampler, new_token_id);
-
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
         return nullptr;
@@ -152,8 +152,6 @@ actor LlamaContext {
 
         new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
 
-        llama_sampler_accept(sampling, new_token_id)
-
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
             is_done = true
@@ -1,11 +1,12 @@
-#include "ggml.h"
+#include "arg.h"
+#include "base64.hpp"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-#include "base64.hpp"
+#include "ggml.h"
 
 #include <cstdio>
 #include <cstdlib>

@@ -278,8 +279,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
         return 1;
     }
 
@@ -1,9 +1,11 @@
-#include "ggml.h"
+#include "arg.h"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
+#include "ggml.h"
 
 #include <cstdio>
 #include <cstdlib>

@@ -253,8 +255,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
         return 1;
     }
 
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cstdio>

@@ -36,8 +38,7 @@ struct ngram_container {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -1,7 +1,8 @@
-#include "ggml.h"
-#include "llama.h"
+#include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"
 
 #include <cstdint>
 #include <fstream>

@@ -13,8 +14,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 

@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
     llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    return 0;
 }
@@ -1,8 +1,9 @@
-#include "ggml.h"
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
 
 #include <cmath>
 #include <cstdint>

@@ -15,8 +16,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "ggml.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "sampling.h"
+#include "llama.h"
 
 #include <cstdint>
 #include <cstdio>

@@ -12,8 +14,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -1,6 +1,7 @@
+#include "arg.h"
 #include "common.h"
 
 #include "console.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cassert>

@@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
-
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
 

@@ -192,8 +191,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -471,8 +468,10 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
+    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
     LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
-    LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+    LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
 
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
     // group-attention state
@@ -1,7 +1,9 @@
 // A basic application simulating a server with multiple clients.
 // The clients submit requests to the server and they are processed in parallel.
 
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cmath>

@@ -100,8 +102,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
     params.n_keep = 32;
     params.i_pos = -1;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
         return 1;
     }
 

@@ -220,8 +220,6 @@ int main(int argc, char ** argv) {
         {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                 LOG_TEE("\n");
@@ -1,18 +1,19 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
+#include <array>
+#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
 #include <sstream>
 #include <thread>
-#include <mutex>
-#include <atomic>
 #include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data

@@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
     params.n_ctx = 512;
     params.logits_all = true;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
         return 1;
     }
 

@@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
+The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
+
 *(outdated)*
 
 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
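For convenience, a minimal sketch of producing one of these interleaved variants with `llama-quantize` (the input/output paths are hypothetical; the type name is as documented above):

```bash
# Requantize an F16 GGUF into the block-interleaved Q4_0_4_4 layout (same quantized size as Q4_0).
./llama-quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-Q4_0_4_4.gguf Q4_0_4_4
```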
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
@@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }
 
@@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following
 
 ```mermaid
 flowchart TD
-    rpcb---|TCP|srva
-    rpcb---|TCP|srvb
-    rpcb-.-|TCP|srvn
+    rpcb<-->|TCP|srva
+    rpcb<-->|TCP|srvb
+    rpcb<-.->|TCP|srvn
     subgraph hostn[Host N]
-    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
+    srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
     end
     subgraph hostb[Host B]
-    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
+    srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
     end
     subgraph hosta[Host A]
-    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
+    srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
     end
     subgraph host[Main Host]
-    ggml[llama.cpp]---rpcb[RPC backend]
+    local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
+    ggml[llama-cli]<-->rpcb[RPC backend]
     end
     style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
 ```
@@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
 
 
-On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
-
-```bash
-mkdir build-rpc
-cd build-rpc
-cmake .. -DGGML_RPC=ON
-cmake --build . --config Release
-```
-
-Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
+On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
+Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
 
 ```bash
 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
 
+This way you can offload model layers to both local and remote devices.
+
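As a hedged illustration of the new wording above (the backend flag and build directory are assumptions; use the flag matching your hardware), building the main host with both a local CUDA backend and the RPC backend could look like:

```bash
# Main host: enable a local backend (CUDA here) together with the RPC backend.
cmake -B build-rpc-cuda -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build build-rpc-cuda --config Release
```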
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
@@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
     params.prompt = "The quick brown fox";
     params.sparams.seed = 1234;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -74,8 +74,6 @@ int main(int argc, char ** argv) {
         auto next_token     = llama_sampler_sample(smpl, ctx, -1);
         auto next_token_str = llama_token_to_piece(ctx, next_token);
 
-        llama_sampler_accept(smpl, next_token);
-
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
 
@@ -132,8 +130,6 @@ int main(int argc, char ** argv) {
         auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
         auto next_token_str = llama_token_to_piece(ctx2, next_token);
 
-        llama_sampler_accept(smpl2, next_token);
-
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
 
@@ -222,8 +218,6 @@ int main(int argc, char ** argv) {
         auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
         auto next_token_str = llama_token_to_piece(ctx3, next_token);
 
-        llama_sampler_accept(smpl3, next_token);
-
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;
 
@@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
 | `--version` | show version and build info |
 | `-v, --verbose` | print verbose information |
 | `--verbosity N` | set specific verbosity level (default: 0) |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
-| `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
 | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
 | `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
+| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
 | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
 | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
 | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
+| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
-| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
 | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
 | `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
 | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
 | `--no-escape` | do not process escape sequences |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |

@@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
-| `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
-| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |

@@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
 | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |

@@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
-| `--timeout N` | server read/write timeout in seconds (default: 600) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--log-format {text, json}` | log output format: json or text (default: json) |
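Many of the options above also list an `(env: ...)` variable; a minimal sketch of two equivalent invocations (model path and values are placeholders, and the server binary is assumed to be available as `llama-server`):

```bash
# Flags ...
./llama-server -m models/7B/ggml-model-f16.gguf -c 4096 -ngl 99 -t 8

# ... or the documented environment variables
LLAMA_ARG_MODEL=models/7B/ggml-model-f16.gguf \
LLAMA_ARG_CTX_SIZE=4096 \
LLAMA_ARG_N_GPU_LAYERS=99 \
LLAMA_ARG_THREADS=8 \
./llama-server
```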
@ -1,6 +1,8 @@
|
||||||
#include "utils.hpp"
|
#include "utils.hpp"
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
@ -613,7 +615,7 @@ struct server_context {
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
llama_batch batch;
|
llama_batch batch = {};
|
||||||
|
|
||||||
bool clean_kv_cache = true;
|
bool clean_kv_cache = true;
|
||||||
bool add_bos_token = true;
|
bool add_bos_token = true;
|
||||||
|
@ -1264,6 +1266,7 @@ struct server_context {
|
||||||
{"n_predict", slot.n_predict}, // Server configured n_predict
|
{"n_predict", slot.n_predict}, // Server configured n_predict
|
||||||
{"model", params.model_alias},
|
{"model", params.model_alias},
|
||||||
{"seed", slot.sparams.seed},
|
{"seed", slot.sparams.seed},
|
||||||
|
{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
|
||||||
{"temperature", slot.sparams.temp},
|
{"temperature", slot.sparams.temp},
|
||||||
{"dynatemp_range", slot.sparams.dynatemp_range},
|
{"dynatemp_range", slot.sparams.dynatemp_range},
|
||||||
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
||||||
|
@ -2423,8 +2426,7 @@ int main(int argc, char ** argv) {
|
||||||
// own arguments required by this example
|
// own arguments required by this example
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 

@@ -118,8 +118,6 @@ int main(int argc, char ** argv) {
         {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 LOG_TEE("\n");
@@ -1,11 +1,13 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <set>
+#include <random>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

@@ -27,8 +29,7 @@ struct seq_draft {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
     }
 
20
flake.lock
generated
20
flake.lock
generated
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1725024810,
|
"lastModified": 1725234343,
|
||||||
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
|
"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
|
"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1724819573,
|
"lastModified": 1725634671,
|
||||||
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
|
"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
|
"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -36,14 +36,14 @@
|
||||||
},
|
},
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1722555339,
|
"lastModified": 1725233747,
|
||||||
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
|
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
|
||||||
"type": "tarball",
|
"type": "tarball",
|
||||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"type": "tarball",
|
"type": "tarball",
|
||||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": {
|
"root": {
|
||||||
|
|
|
@@ -681,8 +681,8 @@ extern "C" {
 
     struct ggml_hash_set {
         size_t size;
-        ggml_bitset_t * used;
-        struct ggml_tensor ** keys;
+        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
     };
 
     // computation graph

@@ -1272,7 +1272,7 @@ extern "C" {
             size_t nb1,
             size_t nb2,
             size_t nb3,
-            size_t offset);
+            size_t offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_inplace(

@@ -1282,19 +1282,19 @@ extern "C" {
             size_t nb1,
             size_t nb2,
             size_t nb3,
-            size_t offset);
+            size_t offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            size_t offset);
+            size_t offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            size_t offset);
+            size_t offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set_2d(

@@ -1302,7 +1302,7 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             size_t nb1,
-            size_t offset);
+            size_t offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_2d_inplace(

@@ -1310,7 +1310,7 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             size_t nb1,
-            size_t offset);
+            size_t offset); // in bytes
 
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
@@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
         default:
             return true;
     }
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME = "llama.cpp"
+PROJECT_NAME = "ggml"
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version

@@ -44,7 +44,7 @@ PROJECT_NUMBER =
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF = "llama inference engine"
+PROJECT_BRIEF = "Tensor library for machine learning"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
@ -27,6 +27,7 @@
|
||||||
#include "ggml-cuda/rope.cuh"
|
#include "ggml-cuda/rope.cuh"
|
||||||
#include "ggml-cuda/scale.cuh"
|
#include "ggml-cuda/scale.cuh"
|
||||||
#include "ggml-cuda/softmax.cuh"
|
#include "ggml-cuda/softmax.cuh"
|
||||||
|
#include "ggml-cuda/sum.cuh"
|
||||||
#include "ggml-cuda/sumrows.cuh"
|
#include "ggml-cuda/sumrows.cuh"
|
||||||
#include "ggml-cuda/tsembd.cuh"
|
#include "ggml-cuda/tsembd.cuh"
|
||||||
#include "ggml-cuda/unary.cuh"
|
#include "ggml-cuda/unary.cuh"
|
||||||
|
@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
ggml_cuda_dup(ctx, dst);
|
ggml_cuda_dup(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1: // TODO: more efficient implementation
|
||||||
ggml_cuda_op_add(ctx, dst);
|
ggml_cuda_op_add(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
|
@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
break;
|
break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(dst)) {
|
switch (ggml_get_unary_op(dst)) {
|
||||||
|
case GGML_UNARY_OP_NEG:
|
||||||
|
ggml_cuda_op_neg(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
ggml_cuda_op_gelu(ctx, dst);
|
ggml_cuda_op_gelu(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
ggml_cuda_op_pool2d(ctx, dst);
|
ggml_cuda_op_pool2d(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_SUM:
|
||||||
|
ggml_cuda_op_sum(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
ggml_cuda_op_sum_rows(ctx, dst);
|
ggml_cuda_op_sum_rows(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2544,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||||
|
@ -2748,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
|
case GGML_UNARY_OP_NEG:
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
case GGML_UNARY_OP_RELU:
|
case GGML_UNARY_OP_RELU:
|
||||||
|
@ -2877,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_TRANSPOSE:
|
case GGML_OP_TRANSPOSE:
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1:
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
|
@ -2887,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_SIN:
|
case GGML_OP_SIN:
|
||||||
case GGML_OP_COS:
|
case GGML_OP_COS:
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
|
return true;
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
|
return op->src[0]->type != GGML_TYPE_BF16;
|
||||||
case GGML_OP_DIAG_MASK_INF:
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
return ggml_is_contiguous(op->src[0]);
|
return ggml_is_contiguous(op->src[0]);
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F16;
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
|
case GGML_OP_SUM:
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
#include "cross-entropy-loss.cuh"
|
#include "cross-entropy-loss.cuh"
|
||||||
#include "sumrows.cuh"
|
#include "sum.cuh"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||||
|
|
||||||
// Combine results from individual blocks:
|
// Combine results from individual blocks:
|
||||||
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
|
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
|
||||||
}
|
}
|
||||||
|
|
|
@@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
     } \
 
 static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 

@@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     } \
 
 static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 
ggml/src/ggml-cuda/sum.cu (new file, 43 lines)
@@ -0,0 +1,43 @@
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
+// For this reason CUB must be included BEFORE anything else.
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}

ggml/src/ggml-cuda/sum.cuh (new file, 5 lines)
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -1,5 +1,15 @@
|
||||||
#include "unary.cuh"
|
#include "unary.cuh"
|
||||||
|
|
||||||
|
static __global__ void neg_f32(const float * x, float * dst, const int k) {
|
||||||
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i >= k) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst[i] = -x[i];
|
||||||
|
}
|
||||||
|
|
||||||
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
||||||
const float GELU_COEF_A = 0.044715f;
|
const float GELU_COEF_A = 0.044715f;
|
||||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||||
|
@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
|
||||||
dst[i] = cosf(x[i]);
|
dst[i] = cosf(x[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||||
|
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
|
||||||
|
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
|
}
|
||||||
|
|
||||||
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||||
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
|
@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
|
||||||
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
|
const float * src0_d = (const float *)src0->data;
|
||||||
|
float * dst_d = (float *)dst->data;
|
||||||
|
cudaStream_t stream = ctx.stream();
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
const float * src0_d = (const float *)src0->data;
|
const float * src0_d = (const float *)src0->data;
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
|
||||||
|
#define CUDA_NEG_BLOCK_SIZE 256
|
||||||
#define CUDA_GELU_BLOCK_SIZE 256
|
#define CUDA_GELU_BLOCK_SIZE 256
|
||||||
#define CUDA_SILU_BLOCK_SIZE 256
|
#define CUDA_SILU_BLOCK_SIZE 256
|
||||||
#define CUDA_TANH_BLOCK_SIZE 256
|
#define CUDA_TANH_BLOCK_SIZE 256
|
||||||
|
@ -12,6 +13,8 @@
|
||||||
#define CUDA_SIN_BLOCK_SIZE 256
|
#define CUDA_SIN_BLOCK_SIZE 256
|
||||||
#define CUDA_COS_BLOCK_SIZE 256
|
#define CUDA_COS_BLOCK_SIZE 256
|
||||||
|
|
||||||
|
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
|
@ -17,8 +17,8 @@
|
||||||
#define GGML_METAL_LOG_WARN(...)
|
#define GGML_METAL_LOG_WARN(...)
|
||||||
#define GGML_METAL_LOG_ERROR(...)
|
#define GGML_METAL_LOG_ERROR(...)
|
||||||
#else
|
#else
|
||||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
|
||||||
return ctx->support_simdgroup_reduction;
|
return ctx->support_simdgroup_reduction;
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
case GGML_OP_IM2COL:
|
|
||||||
return true;
|
return true;
|
||||||
|
case GGML_OP_IM2COL:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F16;
|
||||||
case GGML_OP_POOL_1D:
|
case GGML_OP_POOL_1D:
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
return false;
|
return false;
|
||||||
|
@ -3038,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
if (status != MTLCommandBufferStatusCompleted) {
|
if (status != MTLCommandBufferStatusCompleted) {
|
||||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||||
if (status == MTLCommandBufferStatusError) {
|
if (status == MTLCommandBufferStatusError) {
|
||||||
NSString * error_code = [command_buffer error].localizedDescription;
|
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
|
||||||
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return GGML_STATUS_FAILED;
|
return GGML_STATUS_FAILED;
|
||||||
|
|
|
@ -4009,42 +4009,141 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_SVE)
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
if (ggml_sve_cnt_b == QK8_0) {
|
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
|
||||||
|
|
||||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
const int vector_length = ggml_sve_cnt_b*8;
|
||||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
|
||||||
|
|
||||||
for (; ib + 1 < nb; ib += 2) {
|
// VLA Implementation using switch case
|
||||||
const block_q4_0 * restrict x0 = &x[ib + 0];
|
switch (vector_length) {
|
||||||
const block_q4_0 * restrict x1 = &x[ib + 1];
|
case 128:
|
||||||
const block_q8_0 * restrict y0 = &y[ib + 0];
|
{
|
||||||
const block_q8_0 * restrict y1 = &y[ib + 1];
|
// predicate for activating higher lanes for 4 float32 elements
|
||||||
|
const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
|
||||||
|
|
||||||
// load x
|
for (; ib + 1 < nb; ib += 2) {
|
||||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||||
|
|
||||||
// 4-bit -> 8-bit
|
// load x
|
||||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
// sub 8
|
// 4-bit -> 8-bit
|
||||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
|
||||||
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
|
||||||
|
const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
|
||||||
|
const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
|
||||||
|
|
||||||
// load y
|
// sub 8
|
||||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
|
||||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
|
||||||
|
const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
|
||||||
|
const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
|
||||||
|
|
||||||
// dot product
|
// load y
|
||||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
|
||||||
}
|
const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
|
||||||
|
|
||||||
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
} break;
|
||||||
|
case 256:
|
||||||
|
{
|
||||||
|
// predicate for activating higher lanes for 16 int8 elements
|
||||||
|
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
|
||||||
|
// predicate for activating lower lanes for 16 int8 elements
|
||||||
|
const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
|
||||||
|
|
||||||
|
for (; ib + 1 < nb; ib += 2) {
|
||||||
|
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||||
|
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||||
|
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
|
||||||
|
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||||
|
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
} break;
|
||||||
|
case 512:
|
||||||
|
{
|
||||||
|
// predicate for activating higher lanes for 32 int8 elements
|
||||||
|
const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
|
||||||
|
|
||||||
|
// predicate for activating higher lanes for 16 int8 elements
|
||||||
|
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
|
||||||
|
// predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
|
||||||
|
const svbool_t pl16 = svnot_b_z(ph32, ph16);
|
||||||
|
|
||||||
|
for (; ib + 1 < nb; ib += 2) {
|
||||||
|
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||||
|
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
|
||||||
|
const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
|
||||||
|
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
|
||||||
|
const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(ph32, y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(ph32, y1->qs);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
assert(false && "Unsupported vector length");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__ARM_NEON)
|
#elif defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@@ -5494,29 +5593,124 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        svfloat32_t sumv0 = svdup_n_f32(0.0f);
-        svfloat32_t sumv1 = svdup_n_f32(0.0f);
-
-        for (; ib + 1 < nb; ib += 2) {
-            const block_q8_0 * restrict x0 = &x[ib + 0];
-            const block_q8_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
-
-            // load x
-            const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
-            const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
-
-            // load y
-            const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
-            const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
-
-            sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-            sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-        }
-
-        sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+    svfloat32_t sumv0 = svdup_n_f32(0.0f);
+    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+
+    const int vector_length = ggml_sve_cnt_b*8;
+
+    //VLA Implemenation for SVE
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating lanes for 16 Int8 elements
+                const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
+                const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
+                    const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
+                    const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
+                    const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
+
+                    // load y
+                    const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
+                    const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
+                    const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
+                    const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
+
+                    sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
+                                    svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
+                                    svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                //printf("sve256");
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+                    const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating high 256 bit
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+                // predicate for activating low 256 bit
+                const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
+
+                // predicate for activating high lanes for 8 float32 elements
+                const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
+                // predicate for activating low lanes for 8 float32 elements
+                const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
+
+                svfloat32_t sumv00 = svdup_n_f32(0.0f);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
+                    // and add them to make one 64 element vector
+                    // load x
+                    const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
+                    svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
+
+                    qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
+
+                    // load y
+                    const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
+                    svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
+
+                    qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
+
+                    // scale creation
+                    const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
+                    const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
+
+                    // duplicate deq1 in first half of vector and deq2 in second half of vector
+                    const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
+
+                    const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
+
+                    sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), sumv00);
+                break;
+            }
+        default:
+            assert(false && "Unsupported vector length");
+            break;
     }
 #elif defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
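
The two hunks above replace the fixed-width SVE path with a dispatch on the vector length reported by the hardware at run time (ggml_sve_cnt_b*8). As a minimal sketch of that idea only — not code from this patch, and with hypothetical kernel_128/256/512 stand-ins for the per-width paths — the dispatch can be expressed like this:

    // assumes an SVE-enabled AArch64 toolchain; svcntb() reports the vector length in bytes
    #include <arm_sve.h>
    #include <cassert>

    static void dispatch_on_sve_vl(void (*kernel_128)(void), void (*kernel_256)(void), void (*kernel_512)(void)) {
        const int vector_length = (int) svcntb() * 8;  // bits per SVE vector on this CPU
        switch (vector_length) {
            case 128: kernel_128(); break;
            case 256: kernel_256(); break;
            case 512: kernel_512(); break;
            default:  assert(false && "Unsupported vector length"); break;
        }
    }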
@@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
     }
     result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
     if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
-        return nullptr;
+        result->buffer = nullptr;
     }
 
-    // require that the tensor data does not go beyond the buffer end
-    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
-    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
-    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
-    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
-    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    if (result->buffer) {
+        // require that the tensor data does not go beyond the buffer end
+        uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+        uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+        GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+        GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    }
 
     result->op = (ggml_op) tensor->op;
     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
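
The bounds check introduced above only runs when the tensor actually has a buffer. As a standalone sketch (a hypothetical helper, not part of the patch), the same predicate can be written as:

    // data must lie fully inside [base, base + size) and the addition must not wrap around
    #include <cstdint>

    static bool tensor_within_buffer(uint64_t data, uint64_t nbytes, uint64_t base, uint64_t size) {
        if (data + nbytes < data) {
            return false; // overflow
        }
        return data >= base && data + nbytes <= base + size;
    }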
@@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
     GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
 
-    static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_size,
         /*.mem_buffer =*/ NULL,
@@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
         SYCL_CHECK(
             CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                 look_ahead_size, *qptr)));
+        if (!ptr) {
+            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+            return nullptr;
+        }
+
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;

@@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
+    if (!dev_ptr) {
+        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+        return nullptr;
+    }
     ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }

@@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
         */
         SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
                                         size, *stream)));
+        if (!buf) {
+            char err_buf[1024];
+            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+            throw std::runtime_error(err_buf);
+        }
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
             /*
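
All three SYCL hunks above follow the same pattern: a null return from sycl::malloc_device is reported instead of being dereferenced later. A reduced, standalone sketch of that pattern (not the backend's actual helper) looks like this:

    #include <sycl/sycl.hpp>
    #include <cstdio>

    static void * checked_device_alloc(sycl::queue & q, size_t nbytes) {
        void * ptr = sycl::malloc_device(nbytes, q);   // USM device allocation
        if (!ptr) {
            fprintf(stderr, "%s: can't malloc %zu bytes on device\n", __func__, nbytes);
            return nullptr; // caller decides whether to retry, fall back, or abort
        }
        return ptr;
    }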
@@ -5124,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
+            // TODO: add support for the new F32 operations
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");

@@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
     }, dryrun);
 }
 
-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {

@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");

@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:

@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:

@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;

@@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SIN:
-        ggml_vk_sin(ctx, compute_ctx, src0, node);
+        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_COS:
-        ggml_vk_cos(ctx, compute_ctx, src0, node);
+        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_CLAMP:

@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:

@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
 
-        ggml_vk_submit(subctx, ctx->fence);
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);

@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
-        }
-
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
-            }
-        }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
-        }
-#endif
-        GGML_ASSERT(ok);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
+
+    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (first_node_in_batch) {
+            submit_node_idx = i;
+        }
+
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
+            }
+#endif
+        }
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
+        }
+    }
 
 #ifdef GGML_VULKAN_PERF

@@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     return false;
                 }
             } break;
+        case GGML_OP_CONT:
         case GGML_OP_CPY:
         case GGML_OP_DUP:
             {

@@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_PAD:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ARGSORT:
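
The graph-compute hunk above records nodes into a batch and flushes every submit_count enqueued nodes, so CPU-side command-buffer recording overlaps with GPU execution of earlier batches. As a rough, standalone sketch of that control flow only — with hypothetical record/submit callbacks standing in for ggml_vk_build_graph and ggml_vk_compute_forward — the pattern is:

    #include <functional>
    #include <vector>

    static void run_batched(const std::vector<std::function<bool()>> & record_node,
                            const std::function<void(bool /*wait*/)> & submit_batch,
                            int submit_count = 100) {
        int submitted_nodes = 0;
        const int last_node = (int) record_node.size() - 1;
        for (int i = 0; i <= last_node; ++i) {
            const bool enqueued = record_node[i]();        // may be a no-op for empty nodes
            if (enqueued) {
                ++submitted_nodes;
            }
            const bool submit = (submitted_nodes >= submit_count) || (i == last_node);
            if (submit) {
                submit_batch(i == last_node);              // only the final batch must be waited on
                submitted_nodes = 0;
            }
        }
    }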
@@ -3847,7 +3847,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
 
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
         GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + size_needed, ctx->mem_size);
+                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
         assert(false);
         return NULL;
     }

@@ -5267,6 +5267,7 @@ struct ggml_tensor * ggml_concat(
     bool is_node = false;
 
     if (a->grad || b->grad) {
+        GGML_ABORT("fatal error"); // TODO: implement
         is_node = true;
     }
 

@@ -5388,6 +5389,7 @@ struct ggml_tensor * ggml_leaky_relu(
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
+        GGML_ABORT("fatal error"); // TODO: not implemented
        is_node = true;
     }
 

@@ -5826,6 +5828,7 @@ static struct ggml_tensor * ggml_set_impl(
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    GGML_ASSERT(offset < (size_t)(1 << 30));
     int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 

@@ -6783,14 +6786,12 @@ struct ggml_tensor * ggml_rope_back(
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
-    GGML_ASSERT(c == NULL && "freq factors not implemented yet");
-
-    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
     bool is_node = false;
 
     if (a->grad) {
-        is_node = false; // TODO: implement backward
+        GGML_ASSERT(false && "backwards pass not implemented");
+        is_node = false;
     }
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

@@ -6808,6 +6809,7 @@ struct ggml_tensor * ggml_rope_back(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }

@@ -7361,6 +7363,11 @@ struct ggml_tensor * ggml_argsort(
         enum ggml_sort_order  order) {
     bool is_node = false;
 
+    if (a->grad) {
+        GGML_ABORT("fatal error"); // TODO: not implemented
+        is_node = true;
+    }
+
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -8322,8 +8329,7 @@ static void ggml_compute_forward_dup_same_cont(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == dst->type);
 
-    const size_t nb00 = src0->nb[0];
-    const size_t nb0 = dst->nb[0];
+    const size_t nb0 = ggml_type_size(src0->type);
 
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads

@@ -8337,8 +8343,8 @@ static void ggml_compute_forward_dup_same_cont(
     if (ie0 < ie1) {
         memcpy(
             ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * ggml_type_size(src0->type));
+            ((char *) src0->data + ie0*nb0),
+            (ie1 - ie0) * nb0);
     }
 }
 

@@ -8355,11 +8361,6 @@ static void ggml_compute_forward_dup_f16(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -8624,11 +8625,6 @@ static void ggml_compute_forward_dup_bf16(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -8980,11 +8976,6 @@ static void ggml_compute_forward_dup_f32(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -9294,13 +9285,13 @@ static void ggml_compute_forward_dup_bytes(
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(src0->type == dst->type);
 
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
         ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
     const size_t type_size = ggml_type_size(src0->type);
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads

@@ -10969,9 +10960,6 @@ static void ggml_compute_forward_sum_f32(
         return;
     }
 
-    assert(ggml_is_scalar(dst));
-
-
     assert(ggml_is_scalar(dst));
     assert(src0->nb[0] == sizeof(float));
@ -18372,14 +18360,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
if (src0->grad || src1->grad) {
|
if (src0->grad || src1->grad) {
|
||||||
GGML_ASSERT(src0->type == tensor->type);
|
GGML_ASSERT(src0->type == tensor->type);
|
||||||
GGML_ASSERT(tensor->grad->type == tensor->type);
|
GGML_ASSERT(tensor->grad->type == tensor->type);
|
||||||
GGML_ASSERT(tensor->grad->type == src1->grad->type);
|
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
|
||||||
|
|
||||||
tensor_grad_view = ggml_view_4d(ctx,
|
tensor_grad_view = ggml_view_4d(ctx,
|
||||||
tensor->grad,
|
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||||
src1->grad->ne[0],
|
|
||||||
src1->grad->ne[1],
|
|
||||||
src1->grad->ne[2],
|
|
||||||
src1->grad->ne[3],
|
|
||||||
nb1, nb2, nb3, offset);
|
nb1, nb2, nb3, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18448,9 +18432,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
|
|
||||||
memcpy(&offset, tensor->op_params, sizeof(offset));
|
memcpy(&offset, tensor->op_params, sizeof(offset));
|
||||||
|
|
||||||
size_t nb1 = tensor->nb[1];
|
size_t nb1 = tensor->nb[1];
|
||||||
size_t nb2 = tensor->nb[2];
|
size_t nb2 = tensor->nb[2];
|
||||||
size_t nb3 = tensor->nb[3];
|
size_t nb3 = tensor->nb[3];
|
||||||
|
|
||||||
if (src0->type != src0->grad->type) {
|
if (src0->type != src0->grad->type) {
|
||||||
// gradient is typically F32, but src0 could be other type
|
// gradient is typically F32, but src0 could be other type
|
||||||
|
@ -19146,7 +19130,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
|
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
|
||||||
if (src->visited_hash_set.keys[i]) {
|
// copy all hashset keys (tensors) that are in use
|
||||||
|
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
|
||||||
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -1127,15 +1127,20 @@ extern "C" {
                              int32_t   n_logit_bias,
               const llama_logit_bias * logit_bias);
 
-    // Shorthand for:
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
+    // Shorthand for:
     //    const auto * logits = llama_get_logits_ith(ctx, idx);
     //    llama_token_data_array cur_p = { ... init from logits ... };
     //    llama_sampler_apply(smpl, &cur_p);
-    //    return cur_p.data[cur_p.selected].id;
-    //
-    // At this point, this is mostly a convenience function.
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
     //
+    // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
     // TODO: extend in the future
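
The updated comment above spells out what llama_sampler_sample() is shorthand for. Written out as a sketch under the same assumptions (includes llama.h and <vector>; n_vocab is assumed to come from the model, e.g. via llama_n_vocab()), the expanded form looks roughly like this:

    static llama_token sample_ith(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx, int32_t n_vocab) {
        const float * logits = llama_get_logits_ith(ctx, idx);

        // build the candidate array from the logits of the idx-th output
        std::vector<llama_token_data> cur;
        cur.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            cur.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };

        llama_sampler_apply(smpl, &cur_p);
        const llama_token token = cur_p.data[cur_p.selected].id;
        llama_sampler_accept(smpl, token);
        return token;
    }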
@@ -5,7 +5,7 @@
 # Usage:
 #
 #   $ cd /path/to/llama.cpp
-#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
+#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... -C 3
 #
 
 set -e

@@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
 echo "Syncing ggml changes since commit $lc"
 
 to_skip=""
-if [ "$1" == "-skip" ]; then
-    to_skip=$2
-fi
+
+# context for git patches in number of lines
+ctx="8"
+
+while [ "$1" != "" ]; do
+    case $1 in
+        -skip )
+            shift
+            to_skip=$1
+            ;;
+        -C )
+            shift
+            ctx=$1
+            ;;
+    esac
+    shift
+done
 
 cd $SRC_GGML
 

@@ -52,7 +66,7 @@ while read c; do
         fi
     fi
 
-    git format-patch -k $c~1..$c --stdout -- \
+    git format-patch -U${ctx} -k $c~1..$c --stdout -- \
         CMakeLists.txt \
         src/CMakeLists.txt \
         cmake/FindSIMD.cmake \

@@ -191,7 +205,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
 
-    git am ggml-src.patch
+    git am -C${ctx} ggml-src.patch
 
     rm -v $SRC_LLAMA/ggml-src.patch
 fi
@@ -1 +1 @@
-28b7633d733bbeef0026570fbc61c79c5e9aa5ae
+10e83a412717c20d57ba19f025248e18e43addf3
@@ -101,6 +101,10 @@ struct ring_buffer {
     }
 
     void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
         if (sz == capacity) {
             // advance the start when buffer is full
            first = (first + 1) % capacity;
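
For context around the guard added above, a minimal fixed-capacity ring buffer push_back with that zero-capacity check in place looks like this (a simplified sketch, not the full llama.cpp ring_buffer):

    #include <stdexcept>
    #include <vector>

    template <typename T>
    struct mini_ring {
        size_t capacity = 0, sz = 0, first = 0, pos = 0;
        std::vector<T> data;

        explicit mini_ring(size_t cap) : capacity(cap), data(cap) {}

        void push_back(const T & value) {
            if (capacity == 0) {
                throw std::runtime_error("ring buffer: capacity is zero");
            }
            if (sz == capacity) {
                // advance the start when the buffer is full (oldest element is overwritten)
                first = (first + 1) % capacity;
            } else {
                sz++;
            }
            data[pos] = value;
            pos = (pos + 1) % capacity;
        }
    };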
File diff suppressed because it is too large
@@ -23,16 +23,6 @@ struct llama_sampler_chain {
     mutable int32_t n_sample;
 };
 
-using llama_token_cnt = std::unordered_map<llama_token, int>;
-
-// TODO: tmp exposed until test-sampling is fixed
-void llama_sampler_penalties_impl(
-       llama_token_data_array * cur_p,
-        const llama_token_cnt & token_count,
-                        float   penalty_repeat,
-                        float   penalty_freq,
-                        float   penalty_present);
-
 struct llama_sampler * llama_sampler_init_grammar_impl(
         const struct llama_vocab & vocab,
                       const char * grammar_str,
@@ -6399,6 +6399,11 @@ static void llm_load_vocab(
                         )
                    ) {
                     vocab.special_eot_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                     break;
                 }
             }

@@ -6412,6 +6417,11 @@ static void llm_load_vocab(
             const auto & t = vocab.token_to_id.find("<|eom_id|>");
             if (t != vocab.token_to_id.end()) {
                 vocab.special_eom_id = t->second;
+                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t->first.c_str());
+                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
             }
         }
     }

@@ -9248,7 +9258,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
     // FIXME: zero-out NANs?
     states = ggml_mul(ctx, states, state_mask);
 
-    // copy states which won't be changed further (between n_seqs and n_rs)
+    // copy states which won't be changed further (between n_seqs and n_kv)
     ggml_build_forward_expand(graph,
         ggml_cpy(ctx,
             ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),

@@ -16067,7 +16077,7 @@ static int llama_decode_internal(
     }
 
     for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0) {
+        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
             LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
             return -1;
         }
     }

@@ -16366,7 +16376,7 @@ static int llama_encode_internal(
     }
 
     for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0) {
+        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
             LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
             return -1;
         }
     }

@@ -17520,6 +17530,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -1,19 +1,43 @@
+#include "arg.h"
+#include "common.h"
+
 #include <string>
 #include <vector>
 #include <sstream>
+#include <unordered_set>
 
 #undef NDEBUG
 #include <cassert>
 
-#include "common.h"
-
 int main(void) {
     gpt_params params;
 
     printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
-            gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
+            std::unordered_set<std::string> seen_args;
+            std::unordered_set<std::string> seen_env_vars;
+            for (const auto & opt : ctx_arg.options) {
+                // check for args duplications
+                for (const auto & arg : opt.args) {
+                    if (seen_args.find(arg) == seen_args.end()) {
+                        seen_args.insert(arg);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
+                        exit(1);
+                    }
+                }
+                // check for env var duplications
+                if (opt.env) {
+                    if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
+                        seen_env_vars.insert(opt.env);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
+                        exit(1);
+                    }
+                }
+            }
         } catch (std::exception & e) {
             printf("%s\n", e.what());
             assert(false);
|
@@ -29,40 +53,51 @@ int main(void) {
     };
 
     std::vector<std::string> argv;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
 
     printf("test-arg-parser: test invalid usage\n\n");
 
+    // missing value
     argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
+    // wrong value (int)
     argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
+    // wrong value (enum)
     argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+    argv = {"binary_name", "--draft", "123"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
 
 
     printf("test-arg-parser: test valid usage\n\n");
 
     argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.cpuparams.n_threads == 1234);
 
     argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.verbosity == 1);
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
+    // --draft cannot be used outside llama-speculative
+    argv = {"binary_name", "--draft", "123"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(params.n_draft == 123);
+
 // skip this part on windows, because setenv is not supported
 #ifdef _WIN32
     printf("test-arg-parser: skip on windows build\n");

@@ -71,12 +106,12 @@ int main(void) {
 
     setenv("LLAMA_ARG_THREADS", "blah", true);
     argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 

@@ -86,7 +121,7 @@ int main(void) {
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
File diff suppressed because it is too large
@@ -148,15 +148,17 @@ static void test_penalties(
         cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_cnt token_count;
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+
+    auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
-        token_count[last_tokens[i]]++;
+        llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
     APPLY(llama_sampler_init_softmax(), &cur_p);
     DUMP(&cur_p);
-    llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
+    APPLY(sampler, &cur_p);
     APPLY(llama_sampler_init_softmax(), &cur_p);
     DUMP(&cur_p);
 

@@ -243,7 +245,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
         }
     }
 
-    printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }