Merge branch 'ggerganov:master' into vulkan-initialize-value

2024-11-18 01:04:12 +13:00 · 2024-11-18 01:04:12 +13:00 · da1aab0d4a
commit da1aab0d4a
parent 245f5d49e9 ce2e59ba10
45 changed files with 4933 additions and 7511 deletions
--- a/.gitignore
+++ b/.gitignore
@ -134,3 +134,7 @@ poetry.toml
 # Test models for lora adapters
 /lora-tests
 # Local scripts
 /run-vim.sh
 /run-chat.sh
--- a/69
+++ b/69
@ -48,7 +48,6 @@ TEST_TARGETS = \
 	tests/test-backend-ops \
 	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grad0 \
 	tests/test-grammar-integration \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
@ -636,10 +635,6 @@ else ifndef CUDA_POWER_ARCH
 	MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef GGML_CUDA_FORCE_DMMV
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # GGML_CUDA_FORCE_DMMV
 ifdef GGML_CUDA_FORCE_MMQ
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
@ -648,20 +643,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
 	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
 endif # GGML_CUDA_FORCE_CUBLAS
 ifdef GGML_CUDA_DMMV_X
 	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # GGML_CUDA_DMMV_X
 ifdef GGML_CUDA_MMV_Y
 	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
 else ifdef GGML_CUDA_DMMV_Y
 	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # GGML_CUDA_MMV_Y
 ifdef GGML_CUDA_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_F16
@ -670,12 +651,6 @@ ifdef GGML_CUDA_DMMV_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_DMMV_F16
 ifdef GGML_CUDA_KQUANTS_ITER
 	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
 else
 	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
@ -784,10 +759,6 @@ ifdef GGML_HIPBLAS
 		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	GGML_CUDA_DMMV_X       ?= 32
 	GGML_CUDA_MMV_Y        ?= 1
 	GGML_CUDA_KQUANTS_ITER ?= 2
 	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
 ifdef GGML_HIP_UMA
@ -801,13 +772,6 @@ endif # GGML_HIP_UMA
 	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 	HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
 	HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
 	HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
 	HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
 ifdef GGML_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # GGML_CUDA_FORCE_DMMV
 ifdef GGML_CUDA_FORCE_MMQ
 	HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@ -870,10 +834,6 @@ ifdef GGML_MUSA
 	MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
 ifdef GGML_CUDA_FORCE_DMMV
 	MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # GGML_CUDA_FORCE_DMMV
 ifdef GGML_CUDA_FORCE_MMQ
 	MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
@ -882,18 +842,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
 	MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
 endif # GGML_CUDA_FORCE_CUBLAS
 ifdef GGML_CUDA_DMMV_X
 	MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
 else
 	MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # GGML_CUDA_DMMV_X
 ifdef GGML_CUDA_MMV_Y
 	MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
 else
 	MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # GGML_CUDA_MMV_Y
 ifdef GGML_CUDA_F16
 	MUSAFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_F16
@ -902,12 +850,6 @@ ifdef GGML_CUDA_DMMV_F16
 	MUSAFLAGS += -DGGML_CUDA_F16
 endif # GGML_CUDA_DMMV_F16
 ifdef GGML_CUDA_KQUANTS_ITER
 	MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
 else
 	MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
 	MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
 else
@ -964,6 +906,7 @@ endif # GGML_METAL
 ifdef GGML_METAL
 ggml/src/ggml-metal/ggml-metal.o: \
 	ggml/src/ggml-metal/ggml-metal.m \
 	ggml/src/ggml-metal/ggml-metal-impl.h \
 	ggml/include/ggml-metal.h \
 	ggml/include/ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@
@ -971,9 +914,11 @@ ggml/src/ggml-metal/ggml-metal.o: \
 ifdef GGML_METAL_EMBED_LIBRARY
 ggml/src/ggml-metal-embed.o: \
 	ggml/src/ggml-metal/ggml-metal.metal \
 	ggml/src/ggml-metal/ggml-metal-impl.h \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
-	@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal
+	@sed -e '/__embed_ggml-common.h__/r      ggml/src/ggml-common.h'                -e '/__embed_ggml-common.h__/d'      < ggml/src/ggml-metal/ggml-metal.metal           > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
 	@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
 	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
 	@echo ".section __DATA, __ggml_metallib"                       >  $(TEMP_ASSEMBLY)/ggml-metal-embed.s
 	@echo ".globl _ggml_metallib_start"                            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@ -997,6 +942,7 @@ OBJ_GGML = \
 	$(DIR_GGML)/src/ggml-alloc.o \
 	$(DIR_GGML)/src/ggml-backend.o \
 	$(DIR_GGML)/src/ggml-backend-reg.o \
 	$(DIR_GGML)/src/ggml-opt.o \
 	$(DIR_GGML)/src/ggml-quants.o \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
@ -1499,11 +1445,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 tests/test-grad0: tests/test-grad0.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 tests/test-opt: tests/test-opt.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/README.md
+++ b/README.md
@ -459,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
-## Other documentations
+## Other documentation
 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)
-**Development documentations**
+**Development documentation**
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -1939,17 +1939,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.simple_io = true;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
    add_opt(common_arg(
        {"-ld", "--logdir"}, "LOGDIR",
        "path under which to save YAML logs (no logging if unset)",
        [](common_params & params, const std::string & value) {
            params.logdir = value;
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
        }
    ));
    add_opt(common_arg(
        {"--positive-file"}, "FNAME",
        string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1890,213 +1890,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
    return result;
 }
 //
 // YAML utils
 //
 // Writes `prop_name` and the contents of `data` to `stream` as one YAML line.
 //   - empty vector : "<prop_name>:\n"
 //   - otherwise    : "<prop_name>: [v0, v1, ...]\n", each value printed with "%e"
 void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
    if (!data.empty()) {
        fprintf(stream, "%s: [", prop_name);

        // the separator is empty for the first element and ", " for every later one
        const char * sep = "";
        for (const float value : data) {
            fprintf(stream, "%s%e", sep, value);
            sep = ", ";
        }

        fprintf(stream, "]\n");
    } else {
        fprintf(stream, "%s:\n", prop_name);
    }
 }
 // Writes `prop_name` and the contents of `data` to `stream` as one YAML line.
 //   - empty vector : "<prop_name>:\n"
 //   - otherwise    : "<prop_name>: [v0, v1, ...]\n", each value printed with "%d"
 void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
    if (!data.empty()) {
        fprintf(stream, "%s: [", prop_name);

        // the separator is empty for the first element and ", " for every later one
        const char * sep = "";
        for (const int value : data) {
            fprintf(stream, "%s%d", sep, value);
            sep = ", ";
        }

        fprintf(stream, "]\n");
    } else {
        fprintf(stream, "%s:\n", prop_name);
    }
 }
 // Writes the string `data` to `stream` as the YAML property `prop_name`.
 //
 // Output form, chosen in this order:
 //   - NULL or empty string          : "<prop_name>:\n"
 //   - leading or trailing whitespace: one double-quoted line with newlines and
 //                                     quotes escaped
 //   - no newline in the text        : "<prop_name>: <data>\n"
 //   - otherwise                     : a literal block ("<prop_name>: |") with one
 //                                     2-space-indented output line per input line
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
    std::string data_str(data == NULL ? "" : data);
    if (data_str.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    // std::isspace has undefined behavior for negative values, which a plain char
    // holding a non-ASCII byte can be, so go through unsigned char first
    const bool leading_space  = std::isspace(static_cast<unsigned char>(data_str.front())) != 0;
    const bool trailing_space = std::isspace(static_cast<unsigned char>(data_str.back()))  != 0;

    if (leading_space || trailing_space) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    if (data_str.find('\n') == std::string::npos) {
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    fprintf(stream, "%s: |\n", prop_name);

    size_t pos_start = 0;
    size_t pos_found = 0;
    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
        pos_start = pos_found + 1;
    }

    // Text ending in '\n' took the quoted branch above, so a non-empty final line
    // always remains after the last newline; it must be written as well.
    fprintf(stream, "  %s\n", data_str.c_str() + pos_start);
 }
 void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
    ggml_cpu_init(); // some ARM features are detected at runtime
    const auto & sparams = params.sparams;
    fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
    fprintf(stream, "cpu_has_avx_vnni: %s\n",    ggml_cpu_has_avx_vnni()    ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
    fprintf(stream, "cpu_has_sve: %s\n",         ggml_cpu_has_sve()         ? "true" : "false");
    fprintf(stream, "cpu_has_f16c: %s\n",        ggml_cpu_has_f16c()        ? "true" : "false");
    fprintf(stream, "cpu_has_fp16_va: %s\n",     ggml_cpu_has_fp16_va()     ? "true" : "false");
    fprintf(stream, "cpu_has_riscv_v: %s\n",     ggml_cpu_has_riscv_v()     ? "true" : "false");
    fprintf(stream, "cpu_has_wasm_simd: %s\n",   ggml_cpu_has_wasm_simd()   ? "true" : "false");
    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
 #ifdef NDEBUG
    fprintf(stream, "debug: false\n");
 #else
    fprintf(stream, "debug: true\n");
 #endif // NDEBUG
    fprintf(stream, "model_desc: %s\n", model_desc);
    fprintf(stream, "n_vocab: %d  # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
 #ifdef __OPTIMIZE__
    fprintf(stream, "optimize: true\n");
 #else
    fprintf(stream, "optimize: false\n");
 #endif // __OPTIMIZE__
    fprintf(stream, "time: %s\n", timestamp.c_str());
    fprintf(stream, "\n");
    fprintf(stream, "###############\n");
    fprintf(stream, "# User Inputs #\n");
    fprintf(stream, "###############\n");
    fprintf(stream, "\n");
    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
    fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
    fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
    fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
    fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
    fprintf(stream, "logit_bias:\n");
    for (const auto & logit_bias : sparams.logit_bias) {
        fprintf(stream, "  %d: %f", logit_bias.token, logit_bias.bias);
    }
    fprintf(stream, "lora:\n");
    for (auto & la : params.lora_adapters) {
        if (la.scale == 1.0f) {
            fprintf(stream, "  - %s\n", la.path.c_str());
        }
    }
    fprintf(stream, "lora_scaled:\n");
    for (auto & la : params.lora_adapters) {
        if (la.scale != 1.0f) {
            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
        }
    }
    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
    fprintf(stream, "reverse_prompt:\n");
    for (std::string ap : params.antiprompt) {
        size_t pos = 0;
        while ((pos = ap.find('\n', pos)) != std::string::npos) {
            ap.replace(pos, 1, "\\n");
            pos += 1;
        }
        fprintf(stream, "  - %s\n", ap.c_str());
    }
    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
 }
--- a/common/common.h
+++ b/common/common.h
@ -209,7 +209,6 @@ struct common_params {
    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
@ -584,15 +583,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 //
 // YAML utils
 //
 void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
 void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
 void yaml_dump_non_result_info(
    FILE * stream, const common_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
--- a/docs/build.md
+++ b/docs/build.md
@ -186,13 +186,9 @@ The following compilation options are also available to tweak performance:
 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
 | GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
 | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
 | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
 | GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
 | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization types (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                 |
@ -268,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
 | Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
 |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
 | GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
 | GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
 ### Vulkan
@ -282,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,
 #### w64devkit
-Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
+Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).
-Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
+Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
 Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
 ```sh
@ -302,6 +291,29 @@ EOF
 ```
 Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
 #### Git Bash MINGW64
 Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings.
 Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`.
 Download and install [`CMake`](https://cmake.org/download/) with the default settings
 Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.
 Go into your `llama.cpp` directory, right-click, select `Open Git Bash Here`, and then run the following commands:
 ```
 cmake -B build -DGGML_VULKAN=ON
 cmake --build build --config Release
 ```
 Now you can load the model in conversation mode using `Vulkan`:
 ```
 build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
 ```
 #### MSYS2
 Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
  ```sh
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -43,50 +43,6 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static void write_logfile(
    const llama_context * ctx, const common_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
 ) {
    if (params.logdir.empty()) {
        return;
    }
    const std::string timestamp = string_get_sortable_timestamp();
    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");
    if (logfile == NULL) {
        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }
    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Generation Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
    llama_perf_dump_yaml(logfile, ctx);
    fclose(logfile);
 }
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
    if (signo == SIGINT) {
@ -96,7 +52,6 @@ static void sigint_handler(int signo) {
            console::cleanup();
            LOG("\n");
            common_perf_print(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            // make sure all logs are flushed
            LOG("Interrupted by user\n");
@ -625,7 +580,6 @@ int main(int argc, char ** argv) {
    LOG("\n");
    common_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) {
    return f.tellg() == 0;
 }
 static void write_logfile(
    const llama_context * ctx, const common_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
 ) {
    if (params.logdir.empty()) {
        return;
    }
    const std::string timestamp = string_get_sortable_timestamp();
    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
        LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
        return;
    }
    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");
    if (logfile == NULL) {
        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }
    fprintf(logfile, "binary: main\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Generation Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
    llama_perf_dump_yaml(logfile, ctx);
    fclose(logfile);
 }
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
    if (signo == SIGINT) {
@ -115,7 +72,6 @@ static void sigint_handler(int signo) {
            console::cleanup();
            LOG("\n");
            common_perf_print(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            // make sure all logs are flushed
            LOG("Interrupted by user\n");
@ -926,7 +882,6 @@ int main(int argc, char ** argv) {
    LOG("\n\n");
    common_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
    common_sampler_free(smpl);
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -34,55 +34,6 @@ struct results_log_softmax {
    float  prob;
 };
 // Write the settings and results of a perplexity run to a timestamped YAML
 // file located in params.logdir.
 //
 //   ctx, model - passed through to the YAML/perf dump helpers and llama_model_desc
 //   params     - run parameters; params.logdir selects the output directory
 //   results    - tokens, logits, probabilities and the final perplexity value
 //
 // Does nothing when params.logdir is empty. HellaSwag runs are not supported
 // and only emit a warning. Failures to create the directory or to open the
 // file are logged and the function returns without writing.
 static void write_logfile(
    const llama_context * ctx, const common_params & params, const llama_model * model,
    const struct results_perplexity & results
 ) {
    // YAML logging is opt-in via params.logdir
    if (params.logdir.empty()) {
        return;
    }
    if (params.hellaswag) {
        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
        return;
    }
    const std::string timestamp = string_get_sortable_timestamp();
    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");
    if (logfile == NULL) {
        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }
    // identify the producing binary: this is the perplexity tool, not `main`
    // (the sibling examples each write their own binary name here)
    fprintf(logfile, "binary: perplexity\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
    // banner separating the settings from the results
    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Perplexity Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");
    yaml_dump_vector_float(logfile, "logits", results.logits);
    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
    yaml_dump_vector_float(logfile, "probs", results.probs);
    llama_perf_dump_yaml(logfile, ctx);
    fclose(logfile);
 }
 static std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
@ -2072,8 +2023,6 @@ int main(int argc, char ** argv) {
    LOG("\n");
    llama_perf_context_print(ctx);
    write_logfile(ctx, params, model, results);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -85,7 +85,6 @@ The project is under active development, and we are [looking for feedback and co
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
 | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
 | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
 | `--log-disable` | Log disable |
 | `--log-file FNAME` | Log to file |
 | `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -128,14 +128,9 @@ option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
 option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
 set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
 set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
 set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
                                            "ggml: iters./thread per block for Q2_K/Q6_K")
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                            "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
@ -228,6 +223,7 @@ set(GGML_PUBLIC_HEADERS
    include/ggml-cann.h
    include/ggml-cuda.h
    include/ggml-kompute.h
    include/ggml-opt.h
    include/ggml-metal.h
    include/ggml-rpc.h
    include/ggml-sycl.h
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -86,7 +86,7 @@ extern "C" {
    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-    // "offset" refers to the offset of the tensor data for setting/getting data
+    // "offset" refers to the offset in tensor->data for setting/getting data
    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
@ -242,14 +242,20 @@ extern "C" {
        ggml_backend_sched_reserve(sched, reserve_graph);
        // compute
-        graph = build_graph(sched);
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
-        ggml_backend_sched_graph_compute(sched, graph);
+        for (int i = 0; i < 10; ++i) {
            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
        }
        // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
-        ggml_backend_sched_alloc_graph(sched, graph);
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
-        ggml_backend_tensor_set(input_tensor, ...);
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
-        ggml_backend_sched_graph_compute(sched, graph);
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
        // allocate them statically via ggml_backend_alloc_ctx_tensors
    }
    */
@ -264,7 +270,7 @@ extern "C" {
    //
    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
@ -289,7 +295,9 @@ extern "C" {
    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
    // The correct way to use this API is to discard the deallocated tensors and create new ones.
    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
    // Set a callback to be called for each resulting node during graph compute
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@ -0,0 +1,216 @@
 // This file contains functionality for training models using GGML.
 // It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
 // At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
 //
 // Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
 #pragma once
 #include "ggml.h"
 #include "ggml-backend.h"
 #include <stdint.h>
 #ifdef  __cplusplus
 extern "C" {
 #endif
    struct ggml_opt_dataset;
    struct ggml_opt_context;
    struct ggml_opt_result;
    typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
    typedef struct ggml_opt_context * ggml_opt_context_t;
    typedef struct ggml_opt_result  * ggml_opt_result_t;
    // ====== Loss ======
    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
    enum ggml_opt_loss_type {
        GGML_OPT_LOSS_TYPE_MEAN,               // for custom losses: reduces the outputs for all datapoints to a single value via the mean
        GGML_OPT_LOSS_TYPE_SUM,                // for custom losses: reduces the outputs for all datapoints to a single value via the sum
        GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,      // built-in loss
        GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, // built-in loss
    };
    // ====== Dataset ======
    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
            int64_t ne_datapoint, // number of elements per datapoint
            int64_t ne_label,     // number of elements per label
            int64_t ndata,        // total number of datapoints/labels
            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
    // get underlying tensors that store the data
    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label,     ndata]
    // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
    GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
    // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
    GGML_API void ggml_opt_dataset_get_batch(
            ggml_opt_dataset_t   dataset,
            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
            int64_t              ibatch);
    // ====== Model / Context ======
    enum ggml_opt_build_type {
        GGML_OPT_BUILD_TYPE_FORWARD,
        GGML_OPT_BUILD_TYPE_GRAD,
        GGML_OPT_BUILD_TYPE_OPT,
    };
    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
    struct ggml_opt_optimizer_params {
        // AdamW optimizer parameters
        struct {
            float alpha; // learning rate
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float wd;    // weight decay for AdamW, use 0.0f to disable
        } adamw;
    };
    // callback to calculate optimizer parameters prior to a backward pass
    // userdata can be used to pass arbitrary data
    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
    // returns the default optimizer params (constant)
    // userdata is not used
    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
    // parameters for initializing a new optimization context
    struct ggml_opt_params {
        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
        // the forward graph is defined by inputs and outputs
        // those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts
        struct ggml_tensor * inputs;
        struct ggml_tensor * outputs;
        enum ggml_opt_loss_type  loss_type;  // the quantity minimized by the optimizer
        enum ggml_opt_build_type build_type;
        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
    };
    // get parameters for an optimization context with defaults set where possible
    // parameters for which no sensible defaults exist are supplied as arguments to this function
    GGML_API ggml_opt_params ggml_opt_default_params(
            ggml_backend_sched_t      backend_sched,
            struct ggml_context     * ctx_compute,
            struct ggml_tensor      * inputs,
            struct ggml_tensor      * outputs,
            enum ggml_opt_loss_type   loss_type);
    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
    // set gradients to zero, initialize loss, and optionally reset the optimizer
    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
    // get underlying tensors that store data
    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
    GGML_API struct ggml_tensor * ggml_opt_loss(    ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
    // ====== Optimization Result ======
    // create a new result object; takes no arguments (`(void)` makes this a proper prototype when compiled as C)
    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
    // get data from result, uncertainties are optional and can be ignored by passing NULL
    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                  // writes 1 value, number of datapoints
    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double  * loss,     double * unc); // writes 1 value
    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                   // writes ndata values
    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double  * accuracy, double * unc); // writes 1 value
    // ====== Computation ======
    // do forward pass, increment result if not NULL
    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
    // do forward pass, increment result if not NULL, do backward pass
    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
    // ############################################################################
    // ## The high-level functions start here. They do not depend on any private ##
    // ## functions or structs and can be copied to and adapted for user code.   ##
    // ############################################################################
    // ====== Intended Usage ======
    //
    // 1. Select the appropriate loss for your problem.
    // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
    //    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
    // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
    //    The first context should contain the model parameters and inputs and be allocated statically in user code.
    //    The second context should contain all other tensors and will be (re)allocated automatically.
    //    Due to this automated allocation the data of the second context is not defined when accessed in user code.
    //    Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
    // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
    // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
    typedef void (*ggml_opt_epoch_callback)(
            bool               train,       // true after training evaluation, false after validation evaluation
            ggml_opt_context_t opt_ctx,
            ggml_opt_dataset_t dataset,
            ggml_opt_result_t  result,      // result associated with the dataset subsection
            int64_t            ibatch,      // number of batches that have been evaluated so far
            int64_t            ibatch_max,  // total number of batches in this dataset subsection
            int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started
    // do training on front of dataset, do evaluation only on back of dataset
    GGML_API void ggml_opt_epoch(
            ggml_opt_context_t      opt_ctx,
            ggml_opt_dataset_t      dataset,
            ggml_opt_result_t       result_train,   // result to increment during training, ignored if NULL
            ggml_opt_result_t       result_eval,    // result to increment during evaluation, ignored if NULL
            int64_t                 idata_split,    // data index at which to split training and evaluation
            ggml_opt_epoch_callback callback_train,
            ggml_opt_epoch_callback callback_eval);
    // callback that prints a progress bar on stderr
    GGML_API void ggml_opt_epoch_callback_progress_bar(
            bool               train,
            ggml_opt_context_t opt_ctx,
            ggml_opt_dataset_t dataset,
            ggml_opt_result_t  result,
            int64_t            ibatch,
            int64_t            ibatch_max,
            int64_t            t_start_us);
    // fit model defined by inputs and outputs to dataset
    // note: struct types are spelled with the `struct` keyword so that this header also compiles as C
    GGML_API void ggml_opt_fit(
            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
            struct ggml_context           * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
            struct ggml_tensor            * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
            enum ggml_opt_loss_type         loss_type,      // loss to minimize
            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
            int64_t                         nepoch,         // how many times the dataset should be iterated over
            int64_t                         nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
            bool                            silent);        // whether or not info prints to stderr should be suppressed
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -602,7 +602,6 @@ extern "C" {
        int32_t flags;
        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];
        // source tensor and offset for views
@ -615,7 +614,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        // char padding[4];
+        char padding[8];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -1985,28 +1984,20 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * grad,
-            float                 alpha,
+            struct ggml_tensor  * m,
-            float                 beta1,
+            struct ggml_tensor  * v,
-            float                 beta2,
+            struct ggml_tensor  * adamw_params); // parameters such as the learning rate
            float                 eps,
            float                 wd); // weight decay
    //
    // automatic differentiation
    //
    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
+    GGML_API void ggml_build_backward_expand(
-
+        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
-    GGML_API void ggml_build_opt_adamw(
+        struct ggml_context * ctx_compute, // context for gradient computation
-            struct ggml_context * ctx,
+        struct ggml_cgraph  * cgraph,
-            struct ggml_cgraph  * gf,
+        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
            struct ggml_cgraph  * gb,
            float                 alpha,
            float                 beta1,
            float                 beta2,
            float                 eps,
            float                 wd); // weight decay
    // graph allocation in a context
    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
@ -2026,7 +2017,9 @@ extern "C" {
    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor  (const struct ggml_cgraph * cgraph, const char * name);
    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@ -2037,198 +2030,15 @@ extern "C" {
    // dump the graph into a file using the dot format
    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
+    // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
    // gb_tmp will contain original backward graph with rewritten backward process nodes,
    // but without the second forward pass nodes.
    GGML_API void ggml_build_backward_gradient_checkpointing(
            struct ggml_context   * ctx,
            struct ggml_cgraph    * gf,
            struct ggml_cgraph    * gb,
            struct ggml_cgraph    * gb_tmp,
            struct ggml_tensor  * * checkpoints,
            int                     n_checkpoints);
    //
    // optimization
    //
    // optimization methods
    enum ggml_opt_type {
        GGML_OPT_TYPE_ADAM,
        GGML_OPT_TYPE_LBFGS,
    };
    // linesearch methods
    enum ggml_linesearch {
        GGML_LINESEARCH_DEFAULT = 1,
        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
    };
    // optimization return values
    enum ggml_opt_result {
        GGML_OPT_RESULT_OK = 0,
        GGML_OPT_RESULT_DID_NOT_CONVERGE,
        GGML_OPT_RESULT_NO_CONTEXT,
        GGML_OPT_RESULT_INVALID_WOLFE,
        GGML_OPT_RESULT_FAIL,
        GGML_OPT_RESULT_CANCEL,
        GGML_LINESEARCH_FAIL = -128,
        GGML_LINESEARCH_MINIMUM_STEP,
        GGML_LINESEARCH_MAXIMUM_STEP,
        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
        GGML_LINESEARCH_INVALID_PARAMETERS,
    };
    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
    // optimization parameters
    //
    //   see ggml.c (ggml_opt_default_params) for default values
    //
    struct ggml_opt_params {
        enum ggml_opt_type type;
        size_t graph_size;
        int n_threads;
        // delta-based convergence test
        //
        //   if past == 0 - disabled
        //   if past > 0:
        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
        //
        int past;
        float delta;
        // maximum number of iterations without improvement
        //
        //   if 0 - disabled
        //   if > 0:
        //     assume convergence if no cost improvement in this number of iterations
        //
        int max_no_improvement;
        bool print_forward_graph;
        bool print_backward_graph;
        int n_gradient_accumulation;
        // ADAM parameters
        struct {
            int n_iter;
            float sched; // schedule multiplier (fixed, decay or warmup)
            float decay; // weight decay for AdamW, use 0.0f to disable
            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
            float alpha; // learning rate
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float eps_f; // epsilon for convergence test
            float eps_g; // epsilon for convergence test
            float gclip; // gradient clipping
        } adam;
        // LBFGS parameters
        struct {
            int m; // number of corrections to approximate the inv. Hessian
            int n_iter;
            int max_linesearch;
            float eps;      // convergence tolerance
            float ftol;     // line search tolerance
            float wolfe;
            float min_step;
            float max_step;
            enum ggml_linesearch linesearch;
        } lbfgs;
    };
    struct ggml_opt_context {
        struct ggml_context * ctx;
        struct ggml_opt_params params;
        int iter;
        int64_t nx; // number of parameter elements
        bool just_initialized;
        float loss_before;
        float loss_after;
        struct {
            struct ggml_tensor * g;  // current gradient
            struct ggml_tensor * m;  // first moment
            struct ggml_tensor * v;  // second moment
            struct ggml_tensor * pf; // past function values
            float fx_best;
            float fx_prev;
            int n_no_improvement;
        } adam;
        struct {
            struct ggml_tensor * x;    // current parameters
            struct ggml_tensor * xp;   // previous parameters
            struct ggml_tensor * g;    // current gradient
            struct ggml_tensor * gp;   // previous gradient
            struct ggml_tensor * d;    // search direction
            struct ggml_tensor * pf;   // past function values
            struct ggml_tensor * lmal; // the L-BFGS memory alpha
            struct ggml_tensor * lmys; // the L-BFGS memory ys
            struct ggml_tensor * lms;  // the L-BFGS memory s
            struct ggml_tensor * lmy;  // the L-BFGS memory y
            float fx_best;
            float step;
            int j;
            int k;
            int end;
            int n_no_improvement;
        } lbfgs;
    };
    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
    // optimize the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt(
            struct ggml_context * ctx,
            struct ggml_opt_params params,
            struct ggml_tensor * f);
    // initialize optimizer context
    GGML_API void ggml_opt_init(
            struct ggml_context     * ctx,
            struct ggml_opt_context * opt,
            struct ggml_opt_params    params,
            int64_t                   nx);
    // continue optimizing the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt_resume(
            struct ggml_context * ctx,
            struct ggml_opt_context * opt,
            struct ggml_tensor * f);
    // continue optimizing the function defined by the tensor f
    GGML_API enum ggml_opt_result ggml_opt_resume_g(
            struct ggml_context * ctx,
            struct ggml_opt_context * opt,
            struct ggml_tensor * f,
            struct ggml_cgraph * gf,
            struct ggml_cgraph * gb,
            ggml_opt_callback callback,
            void * callback_data);
    //
    // quantization
    //
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -207,9 +207,11 @@ add_library(ggml-base
            ../include/ggml-alloc.h
            ../include/ggml-backend.h
            ../include/ggml-cpp.h
            ../include/ggml-opt.h
            ggml.c
            ggml-alloc.c
            ggml-backend.cpp
            ggml-opt.cpp
            ggml-threading.cpp
            ggml-threading.h
            ggml-quants.c
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@ -466,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    return ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 // Stores the placement of `node` in its hash entry: which buffer it belongs to and at
 // which offset, and flags the entry as allocated.
 static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
    struct hash_node * entry = ggml_gallocr_hash_get(galloc, node);

    entry->allocated = true;
    entry->offset    = offset;
    entry->buffer_id = buffer_id;
 }
 // A tensor counts as allocated when it already has a data pointer, or when its hash
 // entry in the allocator is flagged as allocated.
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    if (t->data != NULL) {
        // the hash entry is not consulted for tensors that already carry data
        return true;
    }
    return ggml_gallocr_hash_get(galloc, t)->allocated;
 }
 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
    GGML_ASSERT(buffer_id >= 0);
    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@ -816,7 +810,11 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 // Compares the size recorded in talloc (size_max) with the size `node` requires from its buffer type.
 // Nodes that already have data or are views require nothing and are treated as size 0.
 // NOTE: despite the name, the result is true when talloc->size_max >= node_size,
 // i.e. when the recorded size is large enough for the node.
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    size_t node_size = 0;
    if (!node->data && !node->view_src) {
        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
    }
    return talloc->size_max >= node_size;
 }
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@ -317,8 +317,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
            const enum ggml_type type = src0->type;
            const int64_t ne0 = op->ne[0];
            bool is_training = src0->grad || src1->grad;
            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
@ -326,7 +324,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
            bool can_use_amx =
                is_contiguous_2d(src0) &&       // src0 must be contiguous
                is_contiguous_2d(src1) &&       // src1 must be contiguous
                !is_training &&                 // inference only
                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
                has_amx_kernels &&              // with amx kernel impls
                ne0 % (TILE_N * 2) == 0;        // out_features is 32x
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@ -279,7 +279,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
    buf->iface.get_tensor(buf, tensor, data, offset, size);
 }
-GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
    if (size == 0) {
@ -689,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
 }
 static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
-    ggml_backend_buffer_t buffer = tensor->buffer;
+    ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
    if (buffer == NULL) {
        return -1;
    }
@ -722,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML
 // returns the backend that should be used for the node based on the current locations
 static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
    // TODO: use supports_op to check if the backend supports the op
    // assign pre-allocated nodes to their backend
    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
    if (cur_backend_id != -1) {
@ -742,7 +740,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
        // since the tensor is pre-allocated, it cannot be moved to another backend
-        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+        GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
    }
    // graph input
@ -886,6 +884,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        int * node_backend_id = &tensor_backend_id(node);
        if (ggml_is_view_op(node->op)) {
            continue;
        }
        // do not overwrite user assignments
        if (*node_backend_id == -1) {
            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
@ -1538,12 +1539,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
    ggml_backend_sched_split_graph(sched, measure_graph);
    ggml_backend_sched_synchronize(sched);
    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
        return false;
    }
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_synchronize(sched);
    return true;
 }
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -2369,7 +2369,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
    // figure out which node we're on
    uint current_cpu;
    int getcpu_ret = 0;
-#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
 #else
    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@ -12220,7 +12220,12 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const struct ggml_tensor * src0_grad    = dst->src[1];
    const struct ggml_tensor * src0_grad_m  = dst->src[2];
    const struct ggml_tensor * src0_grad_v  = dst->src[3];
    const struct ggml_tensor * adamw_params = dst->src[4];
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
    const int ith = params->ith;
    const int nth = params->nth;
@ -12237,16 +12242,14 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);
-    /* const float   gnorm = 1.0f; */
+    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
-    int64_t       iter;   memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
+    const float alpha  = adamw_params_ptr[0];
-    const float   alpha = ggml_get_op_params_f32(dst, 2);
+    const float beta1  = adamw_params_ptr[1];
-    const float   beta1 = ggml_get_op_params_f32(dst, 3);
+    const float beta2  = adamw_params_ptr[2];
-    const float   beta2 = ggml_get_op_params_f32(dst, 4);
+    const float eps    = adamw_params_ptr[3];
-    const float   eps   = ggml_get_op_params_f32(dst, 5);
+    const float wd     = adamw_params_ptr[4];
-    const float   wd    = ggml_get_op_params_f32(dst, 6);
+    const float beta1h = adamw_params_ptr[5];
-
+    const float beta2h = adamw_params_ptr[6];
    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
    for (int ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir/(ne02*ne01);
@ -12270,17 +12273,9 @@ static void ggml_compute_forward_opt_step_adamw_f32(
            // The weight decay is applied independently of the Adam momenta m and v.
            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
            // See: https://arxiv.org/pdf/1711.05101v3.pdf
-            w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
+            w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
        }
    }
    ggml_barrier(params->threadpool);
    if (ith != 0) {
        return;
    }
    iter++;
    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
 }
 static void ggml_compute_forward_opt_step_adamw(
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@ -6,15 +6,18 @@ if (CUDAToolkit_FOUND)
    message(STATUS "CUDA Toolkit found")
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
+        # native == GPUs available at build time
-        # 60 == FP16 CUDA intrinsics
+        # 52     == Maxwell, lowest CUDA 12 standard
-        # 61 == integer CUDA intrinsics
+        # 60     == P100, FP16 CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        # 61     == Pascal, __dp4a instruction (per-byte integer dot product)
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
+        # 70     == V100, FP16 tensor cores
        # 75     == Turing, int8 tensor cores
        if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6")
            set(CMAKE_CUDA_ARCHITECTURES "native")
        elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
        else()
            set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@ -51,21 +54,12 @@ if (CUDAToolkit_FOUND)
    target_link_libraries(ggml-cuda PRIVATE ggml-base)
    target_include_directories(ggml-cuda PRIVATE . ..)
    # TODO: change the definitions to this target only
    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
    if (GGML_CUDA_GRAPHS)
        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
    endif()
    if (GGML_CUDA_FORCE_DMMV)
        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
    endif()
    if (GGML_CUDA_FORCE_MMQ)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()
@ -78,10 +72,6 @@ if (CUDAToolkit_FOUND)
        add_compile_definitions(GGML_CUDA_NO_VMM)
    endif()
    if (DEFINED GGML_CUDA_DMMV_Y)
        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
    endif()
    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
        add_compile_definitions(GGML_CUDA_F16)
    endif()
--- a/ggml/src/ggml-cuda/dmmv.cu
+++ b/ggml/src/ggml-cuda/dmmv.cu
@ -1,699 +0,0 @@
 #include "dmmv.cuh"
 #include "dequantize.cuh"
 #include "convert.cuh"
 // K_QUANTS_PER_ITERATION may be provided from outside this file (e.g. as a compile definition).
 // When it is not defined it defaults to 2; when it is defined only the values 1 and 2 are accepted.
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 2
 #else
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 // Matrix-vector product for Q2_K data: accumulates the dot product of one row of the
 // quantized matrix vx (read as block_q2_K) with the float vector yy and stores it in dst[row].
 //   vx    - quantized matrix, ncols/QK_K blocks per row
 //   yy    - input vector
 //   dst   - output, one float per row
 //   ncols - number of columns; NOTE(review): assumed to be a multiple of QK_K, expected to be checked by the launcher - confirm
 //   nrows - number of rows; threads whose row index is outside [0, nrows) exit without touching memory
 // The row is chosen from blockIdx.x and threadIdx.y; the threadIdx.x lanes of that row each build a
 // partial sum which is combined with warp_reduce_sum.
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    // valid rows are 0...nrows-1: row == nrows must bail out as well, otherwise the thread
    // reads past the end of the matrix and writes dst[nrows]
    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
    const block_q2_K * x = (const block_q2_K *)vx + ib0;
    float tmp = 0; // partial sum for thread in warp
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
    const int step = 16/K_QUANTS_PER_ITERATION;
    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
    const int q_offset = 32*im + l0;
    const int s_offset = 8*im;
    const int y_offset = 128*im + l0;
    // aux holds the unpacked 4-bit fields of 8 scale bytes: low nibbles (d) first, high nibbles (m) after
    uint32_t aux[4];
    const uint8_t * d = (const uint8_t *)aux;
    const uint8_t * m = (const uint8_t *)(aux + 2);
    // lanes with different ix take alternating blocks of the row
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
        const float   * y = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;
        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);
        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
        aux[1] = a[1] & 0x0f0f0f0f;
        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
        // sum1 collects the quant contributions (weighted by d), sum2 the offset contributions (weighted by m)
        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
        }
        tmp += dall * sum1 - dmin * sum2;
    }
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
 }
 // Matrix-vector product for Q3_K data: accumulates the dot product of one row of the
 // quantized matrix vx (read as block_q3_K) with the float vector yy and stores it in dst[row].
 //   vx    - quantized matrix, ncols/QK_K blocks per row
 //   yy    - input vector
 //   dst   - output, one float per row
 //   ncols - number of columns; NOTE(review): assumed to be a multiple of QK_K, expected to be checked by the launcher - confirm
 //   nrows - number of rows; threads whose row index is outside [0, nrows) exit without touching memory
 // The row is chosen from blockIdx.x and threadIdx.y; the threadIdx.x lanes of that row each build a
 // partial sum which is combined with warp_reduce_sum.
 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    // valid rows are 0...nrows-1: row == nrows must bail out as well, otherwise the thread
    // reads past the end of the matrix and writes dst[nrows]
    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
    const block_q3_K * x = (const block_q3_K *)vx + ib0;
    float tmp = 0; // partial sum for thread in warp
    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
    const int step = 16/K_QUANTS_PER_ITERATION;
    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0....15 or 0...7
    const uint8_t m = 1 << (4*im);
    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
    const int q_offset =  32*im + l0;
    const int y_offset = 128*im + l0;
    // utmp receives 8 scale bytes rebuilt from the packed scales: 4 low bits plus 2 high bits each
    uint16_t utmp[4];
    const int8_t * s = (const int8_t *)utmp;
    const uint16_t s_shift = 4*im;
    // lanes with different ix take alternating blocks of the row
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * q = x[i].qs + q_offset;
        const uint8_t * h = x[i].hmask + l0;
        const uint16_t * a = (const uint16_t *)x[i].scales;
        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
        const float d = x[i].d;
        float sum = 0;
        // each quant is the 2-bit value from qs, reduced by 4 when its bit in hmask is clear;
        // each scale is stored with a bias of 32
        for (int l = 0; l < n; ++l) {
            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
        }
        tmp += d * sum;
    }
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
 }
 // Matrix-vector product for Q4_K data: accumulates the dot product of one row of the
 // quantized matrix vx (read as block_q4_K) with the float vector yy and stores it in dst[row].
 //   vx    - quantized matrix, ncols/QK_K blocks per row
 //   yy    - input vector
 //   dst   - output, one float per row
 //   ncols - number of columns; NOTE(review): assumed to be a multiple of QK_K, expected to be checked by the launcher - confirm
 //   nrows - number of rows; threads whose row index is outside [0, nrows) exit without touching memory
 // The row is chosen from blockIdx.x and threadIdx.y; the threadIdx.x lanes of that row each build a
 // partial sum which is combined with warp_reduce_sum. The quant loads differ between
 // K_QUANTS_PER_ITERATION == 2 (32-bit loads, 4 values per group) and == 1 (16-bit loads, 2 values per group).
 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    // valid rows are 0...nrows-1: row == nrows must bail out as well, otherwise the thread
    // reads past the end of the matrix and writes dst[nrows]
    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
    const block_q4_K * x = (const block_q4_K *)vx + ib0;
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
    const int il  = tid/step;                            // 0...3
    const int ir  = tid - step*il;                       // 0...7 or 0...3
    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;
    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;
    // aux is filled per block with the unpacked 6-bit scale/min bytes and read bytewise through sc
    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;
 #if K_QUANTS_PER_ITERATION == 2
    uint32_t q32[4];
    const uint8_t * q4 = (const uint8_t *)q32;
 #else
    uint16_t q16[4];
    const uint8_t * q4 = (const uint8_t *)q16;
 #endif
    float tmp = 0; // partial sum for thread in warp
    // lanes with different ix take alternating blocks of the row
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
        const float   * y1 = yy + i*QK_K + y_offset;
        const float   * y2 = y1 + 128;
        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 #if K_QUANTS_PER_ITERATION == 2
        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
        const uint32_t * q2 = q1 + 16;
        // the high nibbles are kept in place (not shifted down), which is why the matching
        // partial sums (s.y, s.w) are scaled by 1/16 below
        q32[0] = q1[0] & 0x0f0f0f0f;
        q32[1] = q1[0] & 0xf0f0f0f0;
        q32[2] = q2[0] & 0x0f0f0f0f;
        q32[3] = q2[0] & 0xf0f0f0f0;
        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 4; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
 #else
        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
        const uint16_t * q2 = q1 + 32;
        // same scheme as above with 16-bit loads: high nibbles stay in place and are scaled by 1/16 later
        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[0] & 0xf0f0;
        q16[2] = q2[0] & 0x0f0f;
        q16[3] = q2[0] & 0xf0f0;
        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < 2; ++l) {
            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
 #endif
    }
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
    if (tid == 0) {
        dst[row] = tmp;
    }
 }
 // Matrix-vector product for Q5_K data: accumulates the dot product of one row of the
 // quantized matrix vx (read as block_q5_K) with the float vector yy and stores it in dst[row].
 //   vx    - quantized matrix, ncols/QK_K blocks per row
 //   yy    - input vector
 //   dst   - output, one float per row
 //   ncols - number of columns; NOTE(review): assumed to be a multiple of QK_K - confirm against the launcher
 // Unlike the sibling k-quant kernels there is no nrows parameter and no bounds check: the row is
 // blockIdx.x, so the launch grid has to match the number of rows exactly.
 // The thread layout is hard-coded for two interleaved lanes per position (threadIdx.x/2, threadIdx.x%2)
 // rather than derived from K_QUANTS_PER_ITERATION.
 static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
    const int row = blockIdx.x;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
    const block_q5_K * x = (const block_q5_K *)vx + ib0;
    float tmp = 0; // partial sum for thread in warp
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;
    const int tid = threadIdx.x/2;  // 0...15
    const int ix  = threadIdx.x%2;
    const int il  = tid/4;     // 0...3
    const int ir  = tid - 4*il;// 0...3
    const int n   = 2;
    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;
    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;
    // bit masks selecting this thread's high bits inside the qh bytes
    const uint8_t hm1  = 1 << (2*im);
    const uint8_t hm2  = hm1 << 4;
    // aux is filled per block with the unpacked scale/min bytes and read bytewise through sc
    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;
    // q16 is filled per block with the unpacked low 4 bits of the quants and read bytewise through q4
    uint16_t q16[8];
    const uint8_t * q4 = (const uint8_t *)q16;
    // the two ix lanes take alternating blocks of the row
    for (int i = ix; i < num_blocks_per_row; i += 2) {
        const uint8_t * ql1 = x[i].qs + q_offset;
        const uint8_t * qh  = x[i].qh + l0;
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;
        const float dall = __low2half(x[i].dm);
        const float dmin = __high2half(x[i].dm);
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
        float4 sum = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        const uint16_t * q1 = (const uint16_t *)ql1;
        const uint16_t * q2 = q1 + 32;
        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[8] & 0x0f0f;
        q16[2] = (q1[0] >> 4) & 0x0f0f;
        q16[3] = (q1[8] >> 4) & 0x0f0f;
        q16[4] = q2[0] & 0x0f0f;
        q16[5] = q2[8] & 0x0f0f;
        q16[6] = (q2[0] >> 4) & 0x0f0f;
        q16[7] = (q2[8] >> 4) & 0x0f0f;
        // a set bit in qh adds 16 to the 4-bit value, giving the fifth bit of the quant
        for (int l = 0; l < n; ++l) {
            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
        }
        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
    }
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
 }
 // Matrix-vector product for Q6_K data: accumulates the dot product of one row of the
 // quantized matrix vx (read as block_q6_K) with the float vector yy and stores it in dst[row].
 //   vx    - quantized matrix, ncols/QK_K blocks per row
 //   yy    - input vector
 //   dst   - output, one float per row
 //   ncols - number of columns; NOTE(review): assumed to be a multiple of QK_K, expected to be checked by the launcher - confirm
 //   nrows - number of rows; threads whose row index is outside [0, nrows) exit without touching memory
 // The row is chosen from blockIdx.x and threadIdx.y; the threadIdx.x lanes of that row each build a
 // partial sum which is combined with warp_reduce_sum. Each 6-bit quant is assembled from 4 bits of
 // ql and 2 bits of qh and stored with a bias of 32.
 static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    // valid rows are 0...nrows-1: row == nrows must bail out as well, otherwise the thread
    // reads past the end of the matrix and writes dst[nrows]
    if (row >= nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
    const block_q6_K * x = (const block_q6_K *)vx + ib0;
    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                        // 0...15 or 0...7
 #if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
    const int is = 0;
 #else
    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
    const int is = in / 4;
 #endif
    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset = 128*im + l0;
    float tmp = 0; // partial sum for thread in warp
    // lanes with different ix take alternating blocks of the row
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
        const float   * y  = yy + i * QK_K + y_offset;
        const uint8_t * ql = x[i].ql + ql_offset;
        const uint8_t * qh = x[i].qh + qh_offset;
        const int8_t  * s  = x[i].scales + s_offset;
        const float d = x[i].d;
 #if K_QUANTS_PER_ITERATION == 1
        // one value from each of 8 groups, 16 elements apart
        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp += sum;
 #else
        // four consecutive values from each of 4 groups, 32 elements apart
        float sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp += sum;
 #endif
    }
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
    if (tid == 0) {
        dst[row] = tmp;
    }
 }
 // "Dequantize" callback for F16 data: fetches the two adjacent halfs at element index
 // ib + iqs of vx and returns them in v (x = first, y = second).
 static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
    const half * src = (const half *) vx;
    // a single half2 load brings both values into a register
    const half2 pair = *((half2 *) (src + ib + iqs));
    // the assignments convert half -> float automatically when dfloat is float
    v.x = __low2float(pair);
    v.y = __high2float(pair);
 }
 // Compile-time lookup of the dequantize function for a ggml_type.
 // Covers Q4_0, Q4_1, Q5_0, Q5_1 and Q8_0 plus F16 (handled by convert_f16);
 // every other type yields nullptr.
 static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
    return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 :
        type == GGML_TYPE_Q4_1 ? dequantize_q4_1 :
        type == GGML_TYPE_Q5_0 ? dequantize_q5_0 :
        type == GGML_TYPE_Q5_1 ? dequantize_q5_1 :
        type == GGML_TYPE_Q8_0 ? dequantize_q8_0 :
        type == GGML_TYPE_F16 ? convert_f16 :
        nullptr;
 }
 // Generic dequantize + matrix-vector multiplication kernel, instantiated per quantization type.
 //   vx    - matrix data, decoded two values at a time by the function from get_dequantize_kernel(type)
 //   y     - input vector of dfloat
 //   dst   - output, one float per row
 //   ncols - number of columns; the launchers in this file assert ncols % (2*GGML_CUDA_DMMV_X) == 0
 //   nrows - number of rows; threads with row >= nrows exit immediately
 // The row is chosen from blockIdx.x and threadIdx.y. Each threadIdx.x lane accumulates a partial sum
 // over its share of the columns; the partial sums are combined with warp_reduce_sum and lane 0
 // writes dst[row]. With GGML_CUDA_F16 the accumulation uses half2, otherwise float.
 template <ggml_type type>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    constexpr int qk = ggml_cuda_type_traits<type>::qk; // quantized weights per x block
    constexpr int qr = ggml_cuda_type_traits<type>::qr; // number of quantized weights per data value in x block
    constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type);
    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
    if (row >= nrows) {
        return;
    }
    const int tid = threadIdx.x;
    const int iter_stride = 2*GGML_CUDA_DMMV_X;
    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
    // distance in y between the two values returned by one dequantize call:
    // adjacent for qr == 1, half a block apart otherwise
    const int y_offset = qr == 1 ? 1 : qk/2;
 // partial sum for each thread
 #ifdef GGML_CUDA_F16
    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
 #else
    float tmp = 0.0f;
 #endif // GGML_CUDA_F16
    // each i iteration covers iter_stride columns, vals_per_iter of them per thread
    for (int i = 0; i < ncols; i += iter_stride) {
        const int col = i + vals_per_iter*tid;
        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
        const int iqs = (col%qk)/qr; // x quant index
        const int iybs = col - col%qk; // y block start index
 // processing >2 values per i iter is faster for fast GPUs
 #pragma unroll
        for (int j = 0; j < vals_per_iter; j += 2) {
            // process 2 vals per j iter
            // dequantize
            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
            dfloat2 v;
            dequantize_kernel(vx, ib, iqs + j/qr, v);
            // matrix multiplication
            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
 #ifdef GGML_CUDA_F16
            if ( y_offset == 1 ) {
                // load 2 dfloats into register in a single instruction
                const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
                tmp += __hmul2(v, y_reg);
            }
            else {
                tmp += __hmul2(v, {
                        y[iybs + iqs + j/qr + 0],
                        y[iybs + iqs + j/qr + y_offset]
                    });
            }
 #else
            if ( y_offset == 1 ) {
                // load 2 dfloats into register in a single instruction
                const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
                tmp += v.x * y_reg.x;
                tmp += v.y * y_reg.y;
            }
            else {
                tmp += v.x * y[iybs + iqs + j/qr + 0];
                tmp += v.y * y[iybs + iqs + j/qr + y_offset];
            }
 #endif // GGML_CUDA_F16
        }
    }
    // sum up partial sums and write back result
    tmp = warp_reduce_sum(tmp);
    if (tid == 0) {
 #ifdef GGML_CUDA_F16
        // the half2 accumulator carries two independent sums which are added only here
        dst[row] = tmp.x + tmp.y;
 #else
        dst[row] = tmp;
 #endif // GGML_CUDA_F16
    }
 }
 // Launches dequantize_mul_mat_vec<GGML_TYPE_Q4_0> on `stream`.
 // ncols must be a multiple of 2*GGML_CUDA_DMMV_X (asserted); every thread block covers
 // GGML_CUDA_MMV_Y rows with WARP_SIZE threads per row.
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
    const int rows_per_block = GGML_CUDA_MMV_Y;
    const int n_blocks       = (nrows + rows_per_block - 1) / rows_per_block; // rounded up
    // rows are laid out along grid x because the row count may exceed the grid limits of y and z
    const dim3 grid(n_blocks, 1, 1);
    const dim3 block(WARP_SIZE, rows_per_block, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q4_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 // Launches dequantize_mul_mat_vec<GGML_TYPE_Q4_1> on `stream`.
 // ncols must be a multiple of 2*GGML_CUDA_DMMV_X (asserted); every thread block covers
 // GGML_CUDA_MMV_Y rows with WARP_SIZE threads per row.
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
    const int rows_per_block = GGML_CUDA_MMV_Y;
    const int n_blocks       = (nrows + rows_per_block - 1) / rows_per_block; // rounded up
    const dim3 grid(n_blocks, 1, 1);
    const dim3 block(WARP_SIZE, rows_per_block, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q4_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q5_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q5_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_Q8_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const dim3 block_dims(32, 1, 1);
    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }
 static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<GGML_TYPE_F16>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 // Dispatches a matrix-vector product to the launcher matching src0->type.
 //   src0_dd_i  : device data of the src0 rows [row_low, row_high)
 //   src1_ddf_i : device data of src1 as F32 (src1 must be GGML_TYPE_F32)
 //   dst_dd_i   : device output, written by the selected kernel
 // With GGML_CUDA_F16 defined, src1 is first converted to half for the Q4_0/Q4_1/Q5_0/Q5_1/Q8_0/F16
 // paths; the K-quant paths always read the F32 data. Unsupported types abort.
 void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {
    // silence unused-parameter warnings (some of these are only used in one preprocessor branch)
    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = row_high - row_low;
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
    ggml_cuda_pool_alloc<half> y_half_pool(ctx.pool());
    half * y_dfloat = nullptr; // dfloat == half
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_F16: {
            // these paths consume a dfloat vector, so convert the F32 input into a pool buffer
            y_dfloat = y_half_pool.alloc(ncols);
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            to_fp16_cuda(src1_ddf_i, y_dfloat, ncols, stream);
        } break;
        default:
            // remaining types read src1_ddf_i directly, no conversion needed
            break;
    }
 #else
    const dfloat * y_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
 #endif // GGML_CUDA_F16
    switch (src0->type) {
        // paths that take the (possibly converted) dfloat vector
        case GGML_TYPE_Q4_0:
            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, y_dfloat, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q4_1:
            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, y_dfloat, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q5_0:
            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, y_dfloat, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q5_1:
            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, y_dfloat, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q8_0:
            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, y_dfloat, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_F16:
            convert_mul_mat_vec_f16_cuda(src0_dd_i, y_dfloat, dst_dd_i, ncols, nrows, stream);
            break;
        // K-quant paths always take the original F32 vector
        case GGML_TYPE_Q2_K:
            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q3_K:
            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q4_K:
            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q5_K:
            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ncols, nrows, stream);
            break;
        case GGML_TYPE_Q6_K:
            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ncols, nrows, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
 }
 // Returns true for the src0 types listed below: Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q2_K..Q6_K and F16.
 bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
    switch (src0_type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_F16:
            return true;
        default:
            return false;
    }
 }
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -16,11 +16,11 @@
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/dmmv.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmv.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
@ -1020,114 +1020,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
 #define MUL_MAT_SRC1_COL_STRIDE 128
 // Matrix-vector product for an F16 x (passed as void *) and an F32 y.
 // The row index comes from the y grid/block dimension and the channel from z; threads along x
 // stride over the columns and their partial sums are combined with warp_reduce_sum.
 //   x index  : row*nchannels_x*ncols_x + channel_x*ncols_x + col   (x is transposed and permuted)
 //   y index  : channel*ncols_x + col                               (y is not transposed but permuted)
 //   dst index: channel*nrows_x + row                               (dst is not transposed and not permuted)
 // where channel_x = channel / (nchannels_y / nchannels_x).
 static __global__ void mul_mat_p021_f16_f32(
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
    const half * x = (const half *) vx;
    const int row       = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / (nchannels_y / nchannels_x);
    // offsets that do not depend on the column
    const int x_base = row*nchannels_x*ncols_x + channel_x*ncols_x;
    const int y_base = channel*ncols_x;
    float acc = 0.0f;
    for (int col = threadIdx.x; col < ncols_x; col += blockDim.x) {
        const float xi = __half2float(x[x_base + col]);
        acc += xi * y[y_base + col];
    }
    // sum up partial sums and write back result
    acc = warp_reduce_sum(acc);
    if (threadIdx.x == 0) {
        dst[channel*nrows_x + row] = acc;
    }
 }
 // Matrix-vector product for a non-contiguous (nc) F16 x and an F32 y.
 // The row index comes from the y grid/block dimension and the channel from z; threads along x
 // stride over the columns and their partial sums are combined with warp_reduce_sum.
 //   x index  : (channel / channel_x_divisor)*channel_stride_x + row*row_stride_x + col
 //   y index  : channel*ncols_x + col
 //   dst index: channel*nrows_x + row
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
    const half * x = (const half *) vx;
    const int row       = blockDim.y*blockIdx.y + threadIdx.y;
    const int channel   = blockDim.z*blockIdx.z + threadIdx.z;
    const int channel_x = channel / channel_x_divisor;
    // offsets that do not depend on the column
    const int x_base = channel_x*channel_stride_x + row*row_stride_x;
    const int y_base = channel*ncols_x;
    float acc = 0.0f;
    for (int col = threadIdx.x; col < ncols_x; col += blockDim.x) {
        const float xi = __half2float(x[x_base + col]);
        acc += xi * y[y_base + col];
    }
    // sum up partial sums and write back result
    acc = warp_reduce_sum(acc);
    if (threadIdx.x == 0) {
        dst[channel*nrows_x + row] = acc;
    }
 }
 static void ggml_mul_mat_p021_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
    const dim3 block_nums(1, nrows_x, nchannels_y);
    const dim3 block_dims(WARP_SIZE, 1, 1);
    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }
 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
    const dim3 block_nums(1, nrows_x, nchannels_y);
    const dim3 block_dims(WARP_SIZE, 1, 1);
    mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 static cudaError_t ggml_cuda_cpy_tensor_2d(
    void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
@ -1654,58 +1546,6 @@ static void ggml_cuda_op_mul_mat(
    }
 }
 // Tensor-level entry point for the p021 kernel: validates that src0 (F16, in a CUDA buffer) and
 // src1 (F32) are both permuted in the expected way, then launches on the context's stream
 // using the tensors' data pointers directly.
 static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    // src0 dimensions: columns, rows, channels; src1 only contributes its channel count
    const int64_t ncols_x     = src0->ne[0];
    const int64_t nrows_x     = src0->ne[1];
    const int64_t nchannels_x = src0->ne[2];
    const int64_t nchannels_y = src1->ne[2];
    cudaStream_t stream = ctx.stream();
    void  * x_d   = src0->data;
    float * y_d   = (float *) src1->data;
    float * out_d = (float *) dst->data;
    ggml_mul_mat_p021_f16_f32_cuda(x_d, y_d, out_d, ncols_x, nrows_x, nchannels_x, nchannels_y, stream);
 }
 // Tensor-level entry point for the non-contiguous kernel: requires a non-transposed, non-permuted
 // F16 src0 in a CUDA buffer and a non-transposed F32 src1, converts src0's byte strides to
 // element strides and launches on the context's stream.
 static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    // src0 dimensions: columns, rows, channels; src1 only contributes its channel count
    const int64_t ncols_x     = src0->ne[0];
    const int64_t nrows_x     = src0->ne[1];
    const int64_t nchannels_x = src0->ne[2];
    const int64_t nchannels_y = src1->ne[2];
    // nb[] holds strides in bytes; the kernel indexes half elements
    const int64_t row_stride_x     = src0->nb[1] / sizeof(half);
    const int64_t channel_stride_x = src0->nb[2] / sizeof(half);
    cudaStream_t stream = ctx.stream();
    void  * x_d   = src0->data;
    float * y_d   = (float *) src1->data;
    float * out_d = (float *) dst->data;
    ggml_mul_mat_vec_nc_f16_f32_cuda(x_d, y_d, out_d, ncols_x, nrows_x, row_stride_x, nchannels_x, nchannels_y, channel_stride_x, stream);
 }
 static __global__ void k_compute_batched_ptrs(
        const half * src0_as_f16, const half * src1_as_f16, char * dst,
        const void ** ptrs_src, void ** ptrs_dst,
@ -1879,21 +1719,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
-    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
+    bool use_mul_mat_vec   = src0->type == GGML_TYPE_F16
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
+        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
    // if mmvq is available it's a better choice than dmmv:
 #ifndef GGML_CUDA_FORCE_DMMV
    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 #endif // GGML_CUDA_FORCE_DMMV
    bool any_gpus_with_slow_fp16   = false;
    bool any_gpus_without_fp16_mma = false;
    if (split) {
        ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@ -1907,11 +1743,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
            const int cc              = ggml_cuda_info().devices[id].cc;
            use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_available(cc);
            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
        }
    } else {
        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
        use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_available(cc);
        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
    }
    // debug helpers
@ -1922,18 +1760,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-    if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-        // FP32 precision KQ single-batch for batch size 1 without FlashAttention
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
        ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
    } else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // FP32 precision KQV single-batch for batch size 1 without FlashAttention
        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
               && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch without FlashAttention
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-    } else if (use_dequantize_mul_mat_vec) {
+    } else if (use_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
    } else if (use_mul_mat_vec_q) {
        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
    } else if (use_mul_mat_q) {
--- a/ggml/src/ggml-cuda/mmv.cu
+++ b/ggml/src/ggml-cuda/mmv.cu
@ -0,0 +1,223 @@
 #include "common.cuh"
 #include "mmv.cuh"
 // Matrix-vector product kernel: each thread block computes dst[row] for one (row, channel) pair,
 // with row = blockIdx.x and channel = blockIdx.z.
 //   x          : half matrix; the row starts at (channel/channel_ratio)*stride_channel_x + row*stride_row
 //   y          : float vector; offset by channel*stride_channel_y
 //   dst        : float output; offset by channel*stride_channel_dst
 //   ncols2     : number of column pairs (x and y are read as half2 / float2)
 //   type_acc   : float accumulates in FP32; any other type accumulates in half2 (requires FP16_AVAILABLE)
 //   block_size : stride of the per-thread column loop; the launcher in this file passes the same
 //                value as the x block dimension
 // When block_size > WARP_SIZE the dynamic shared memory must hold at least WARP_SIZE floats.
 template <typename type_acc, int block_size>
 static __global__ void mul_mat_vec(
        const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
        const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
    const int64_t row     = blockIdx.x;
    const int64_t channel = blockIdx.z;
    const int     tid     = threadIdx.x;
    x   += (channel/channel_ratio)*stride_channel_x + row*stride_row;
    y   +=  channel               *stride_channel_y;
    dst +=  channel               *stride_channel_dst;
    const half2  * x2 = (const half2  *) x;
    const float2 * y2 = (const float2 *) y;
    extern __shared__ char data_mmv[];
    float * buf_iw = (float *) data_mmv;
    if (block_size > WARP_SIZE) {
        // zero all WARP_SIZE slots so that slots of non-existent warps contribute 0 to the final reduction
        if (tid < WARP_SIZE) {
            buf_iw[tid] = 0.0f;
        }
        __syncthreads();
    }
    float sumf;
    if (std::is_same<type_acc, float>::value) {
        sumf = 0.0f;
        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
            const float2 tmpx = __half22float2(x2[col2]);
            const float2 tmpy = y2[col2];
            sumf += tmpx.x * tmpy.x;
            sumf += tmpx.y * tmpy.y;
        }
    } else {
 #ifdef FP16_AVAILABLE
        half2 sumh2 = make_half2(0.0f, 0.0f);
        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
            const float2 tmp = y2[col2];
            sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
        }
        sumf = __low2float(sumh2) + __high2float(sumh2);
 #else
        NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
    }
    // first stage: reduce within each warp
    sumf = warp_reduce_sum(sumf);
    if (block_size > WARP_SIZE) {
        // second stage: one partial sum per warp goes through shared memory
        buf_iw[tid/WARP_SIZE] = sumf;
        __syncthreads();
        // Only the first warp performs the final reduction. buf_iw has exactly WARP_SIZE slots,
        // so thread WARP_SIZE must exit as well: it would otherwise read buf_iw[WARP_SIZE]
        // out of bounds and enter the warp reduction alone in its warp.
        if (tid >= WARP_SIZE) {
            return;
        }
        sumf = buf_iw[tid];
        sumf = warp_reduce_sum(sumf);
    }
    if (tid != 0) {
        return;
    }
    dst[row] = sumf;
 }
 template <typename type_acc>
 static void launch_mul_mat_vec_cuda(
        const half * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        cudaStream_t stream) {
    GGML_ASSERT(ncols      % 2 == 0);
    GGML_ASSERT(stride_row % 2 == 0);
    GGML_ASSERT(nchannels_y % nchannels_x == 0);
    const int64_t channel_ratio = nchannels_y / nchannels_x;
    int64_t block_size_best = WARP_SIZE;
    int64_t niter_best      = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
    for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
        const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
        if (niter < niter_best) {
            niter_best      = niter;
            block_size_best = block_size;
        }
    }
    const int smem = WARP_SIZE*sizeof(float);
    const dim3 block_nums(nrows, 1, nchannels_y);
    const dim3 block_dims(block_size_best, 1, 1);
    switch (block_size_best) {
        case   32: {
            mul_mat_vec<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case   64: {
            mul_mat_vec<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case   96: {
            mul_mat_vec<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case  128: {
            mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case  160: {
            mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case  192: {
            mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case  224: {
            mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        case  256: {
            mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
                (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
        } break;
        default: {
            GGML_ABORT("fatal error");
        } break;
    }
 }
 static void mul_mat_vec_cuda(
        const half * x, const float * y, float * dst,
        const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
        enum ggml_prec prec, cudaStream_t stream) {
    switch (prec) {
        case GGML_PREC_DEFAULT: {
            launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
        } break;
        case GGML_PREC_F32: {
            launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                stride_channel_x, stride_channel_y, stride_channel_dst, stream);
        } break;
    }
 }
 // Tensor-level matrix-vector product: F16 src0 times a single-column F32 src1 into F32 dst,
 // with src0->ne[2] / src1->ne[2] channels and no fourth dimension. Works directly on the tensors'
 // data pointers and element strides and launches on the context's stream.
 // The precision is taken from dst->op_params[0] when the current device has fast FP16,
 // otherwise FP32 is forced.
 void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = src0->ne[1];
    GGML_ASSERT(src1->ne[1] == 1);
    const int device_cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    enum ggml_prec prec = GGML_PREC_F32;
    if (fast_fp16_available(device_cc)) {
        prec = ggml_prec(dst->op_params[0]);
    }
    const half  * x_d   = (const half  *) src0->data;
    const float * y_d   = (const float *) src1->data;
    float       * out_d = (float       *)  dst->data;
    const int64_t nchannels_x = src0->ne[2];
    const int64_t ne12        = src1->ne[2];
    GGML_ASSERT(dst->ne[2] == ne12);
    GGML_ASSERT(src0->ne[3] == 1);
    GGML_ASSERT(src1->ne[3] == 1);
    GGML_ASSERT( dst->ne[3] == 1);
    // nb[] holds byte strides; the kernel expects strides in elements of the respective type
    const int64_t stride_row         = src0->nb[1] / ggml_type_size(src0->type);
    const int64_t stride_channel_x   = src0->nb[2] / ggml_type_size(src0->type);
    const int64_t stride_channel_y   = src1->nb[2] / ggml_type_size(src1->type);
    const int64_t stride_channel_dst =  dst->nb[2] / ggml_type_size( dst->type);
    mul_mat_vec_cuda(x_d, y_d, out_d, ncols, nrows, stride_row, nchannels_x, ne12,
        stride_channel_x, stride_channel_y, stride_channel_dst, prec, ctx.stream());
 }
 // Matrix-vector product on pre-sliced device buffers: multiplies the F16 rows [row_low, row_high)
 // in src0_dd_i with the single F32 column in src1_ddf_i and writes to dst_dd_i on `stream`.
 // The precision is taken from dst->op_params[0] when the current device has fast FP16,
 // otherwise FP32 is forced.
 void ggml_cuda_op_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = row_high - row_low;
    GGML_ASSERT(src1_ncols == 1);
    const int device_cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    enum ggml_prec prec = GGML_PREC_F32;
    if (fast_fp16_available(device_cc)) {
        prec = ggml_prec(dst->op_params[0]);
    }
    // ggml_cuda_op provides single, contiguous matrices:
    // the row stride equals the row length, there is one channel and all channel strides are 0
    const int64_t stride_row     = ncols;
    const int64_t nchannels      = 1;
    const int64_t stride_channel = 0;
    mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ncols, nrows, stride_row,
        nchannels, nchannels, stride_channel, stride_channel, stride_channel, prec, stream);
    GGML_UNUSED(ctx);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
    GGML_UNUSED(src1_ncols);
    GGML_UNUSED(src1_padded_row_size);
 }
--- a/ggml/src/ggml-cuda/dmmv.cuh
+++ b/ggml/src/ggml-cuda/dmmv.cuh
@ -1,20 +1,12 @@
 #include "common.cuh"
-// dmmv = dequantize_mul_mat_vec
+// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
 #define MMV_MAX_ROWS 512
-// TODO: remove this?
+void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_MMV_Y
+void ggml_cuda_op_mul_mat_vec(
 #define GGML_CUDA_MMV_Y 1
 #endif
 void ggml_cuda_op_dequantize_mul_mat_vec(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);
 bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
--- a/ggml/src/ggml-cuda/opt-step-adamw.cu
+++ b/ggml/src/ggml-cuda/opt-step-adamw.cu
@ -1,11 +1,11 @@
 #include "ggml-impl.h"
 #include "opt-step-adamw.cuh"
 #include <cstdint>
 static __global__ void opt_step_adamw_f32(
-    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k,
+    float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v,
-    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
+    const float * __restrict__ pars, const int64_t k) {
    const float beta1h, const float beta2h) {
    const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
@ -13,6 +13,14 @@ static __global__ void opt_step_adamw_f32(
        return;
    }
    const float alpha  = pars[0];
    const float beta1  = pars[1];
    const float beta2  = pars[2];
    const float eps    = pars[3];
    const float wd     = pars[4];
    const float beta1h = pars[5];
    const float beta2h = pars[6];
    const float gi = g[i];
    const float gmi = g_m[i]*beta1 +    gi*(1.0f - beta1);
    const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
@ -23,17 +31,15 @@ static __global__ void opt_step_adamw_f32(
    const float mh =       gmi*beta1h;
    const float vh = sqrtf(gvi*beta2h) + eps;
-    x[i] = x[i]*(1.0f - alpha*wd) - mh/vh;
+    x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh;
 }
 static void opt_step_adamw_f32_cuda(
-    float * x, const float * g, float * g_m, float * g_v, const int64_t k,
+    float * x, const float * g, float * g_m, float * g_v, const float * pars, const int64_t k, cudaStream_t stream) {
    const float alpha, const float beta1, const float beta2, const float eps, const float wd,
    const float beta1h, const float beta2h, cudaStream_t stream) {
    const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
    const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
-    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h);
+    opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, pars, k);
 }
 void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -41,40 +47,32 @@ void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const ggml_tensor * src0_grad    = dst->src[1];
    const ggml_tensor * src0_grad_m  = dst->src[2];
    const ggml_tensor * src0_grad_v  = dst->src[3];
    const ggml_tensor * adamw_params = dst->src[4];
    GGML_ASSERT(src0->type         == GGML_TYPE_F32);
    GGML_ASSERT(src0_grad->type    == GGML_TYPE_F32);
    GGML_ASSERT(src0_grad_m->type  == GGML_TYPE_F32);
    GGML_ASSERT(src0_grad_v->type  == GGML_TYPE_F32);
    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src0_grad));
    GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
    GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
    GGML_ASSERT(ggml_is_contiguous(adamw_params));
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
    float       * src0_d         = (float       *) src0->data;
    const float * src0_grad_d    = (const float *) src0_grad->data;
    float       * src0_grad_m_d  = (float       *) src0_grad_m->data;
    float       * src0_grad_v_d  = (float       *) src0_grad_v->data;
    const float * adamw_params_d = (const float *) adamw_params->data;
    cudaStream_t stream = ctx.stream();
    const int64_t ne = ggml_nelements(src0);
-    int64_t iter;  memcpy(&iter,  &dst->op_params[0], sizeof(int64_t));
+    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, adamw_params_d, ne, stream);
    float   alpha; memcpy(&alpha, &dst->op_params[2], sizeof(float));
    float   beta1; memcpy(&beta1, &dst->op_params[3], sizeof(float));
    float   beta2; memcpy(&beta2, &dst->op_params[4], sizeof(float));
    float   eps;   memcpy(&eps,   &dst->op_params[5], sizeof(float));
    float   wd;    memcpy(&wd,    &dst->op_params[6], sizeof(float));
    const float beta1h  = alpha/(1.0f - powf(beta1, iter));
    const float beta2h  =  1.0f/(1.0f - powf(beta2, iter));
    opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream);
    iter++;
    memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
 }
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@ -75,18 +75,11 @@ target_include_directories(ggml-hip PRIVATE . ..)
 target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
 add_compile_definitions(GGML_USE_HIP)
 add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
 add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
 add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
 if (GGML_HIP_UMA)
    add_compile_definitions(GGML_HIP_UMA)
 endif()
 if (GGML_CUDA_FORCE_DMMV)
    add_compile_definitions(GGML_CUDA_FORCE_DMMV)
 endif()
 if (GGML_CUDA_FORCE_MMQ)
    add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -196,7 +196,7 @@ void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
 static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
 // returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
-static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key);
 // returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
@ -210,7 +210,7 @@ static inline size_t ggml_hash(const struct ggml_tensor * p) {
    return (size_t)(uintptr_t)p >> 4;
 }
-static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key) {
    size_t h = ggml_hash(key) % hash_set->size;
    // linear probing
@ -281,13 +281,14 @@ enum ggml_cgraph_eval_order {
 };
 struct ggml_cgraph {
-    int size;
+    int size;    // maximum number of nodes/leafs/grads/grad_accs
-    int n_nodes;
+    int n_nodes; // number of nodes currently in use
-    int n_leafs;
+    int n_leafs; // number of leafs currently in use
-    struct ggml_tensor ** nodes;
+    struct ggml_tensor ** nodes;     // tensors with data that can change if the graph is evaluated
-    struct ggml_tensor ** grads;
+    struct ggml_tensor ** grads;     // the outputs of these tensors are the gradients of the nodes
-    struct ggml_tensor ** leafs;
+    struct ggml_tensor ** grad_accs; // accumulators for node gradients
    struct ggml_tensor ** leafs;     // tensors with constant data
    struct ggml_hash_set visited_hash_set;
--- a/ggml/src/ggml-metal/CMakeLists.txt
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@ -25,9 +25,10 @@ if (GGML_METAL_USE_BF16)
    add_compile_definitions(GGML_METAL_USE_BF16)
 endif()
-# copy ggml-common.h and ggml-metal.metal to bin directory
+# copy metal files to bin directory
 configure_file(../ggml-common.h  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h     COPYONLY)
 configure_file(ggml-metal.metal  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal  COPYONLY)
 configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
 if (GGML_METAL_EMBED_LIBRARY)
    enable_language(ASM)
@ -36,24 +37,27 @@ if (GGML_METAL_EMBED_LIBRARY)
    set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
    set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
    set(METALLIB_IMPL   "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
    file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
    # merge ggml-common.h and ggml-metal.metal into a single file
    set(METALLIB_EMBED_ASM        "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
    set(METALLIB_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
    set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
    add_custom_command(
        OUTPUT ${METALLIB_EMBED_ASM}
        COMMAND echo "Embedding Metal library"
-        COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED}
+        COMMAND sed -e '/__embed_ggml-common.h__/r         ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d'         < ${METALLIB_SOURCE}           > ${METALLIB_SOURCE_EMBED_TMP}
        COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}'   -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
        COMMAND echo ".section __DATA,__ggml_metallib"          >  ${METALLIB_EMBED_ASM}
        COMMAND echo ".globl _ggml_metallib_start"              >> ${METALLIB_EMBED_ASM}
        COMMAND echo "_ggml_metallib_start:"                    >> ${METALLIB_EMBED_ASM}
        COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
        COMMAND echo ".globl _ggml_metallib_end"                >> ${METALLIB_EMBED_ASM}
        COMMAND echo "_ggml_metallib_end:"                      >> ${METALLIB_EMBED_ASM}
-        DEPENDS ggml-metal.metal ../ggml-common.h
+        DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
        COMMENT "Generate assembly for embedded Metal library"
    )
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@ -0,0 +1,249 @@
 #ifndef GGML_METAL_IMPL
 #define GGML_METAL_IMPL
 // kernel argument structs
 //
 // - element counters (e.g. ne00) typically use int32_t to reduce register usage
 //   however, be careful of int overflows when using those in the kernel implementation
 //
 // - strides (e.g. nb00) use uint64_t
 // Kernel arguments for the concat kernel.
 // The host encoder in ggml-metal.m fills one of these and passes it by value with
 // setBytes at buffer index 0. Field order and types must stay in sync with the
 // kernel that reads the struct.
 // NOTE(review): ne0x/nb0x, ne1x/nb1x and ne/nb presumably describe src0, src1 and dst
 // (element counts and strides, per the naming note at the top of this header) - confirm.
 typedef struct {
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int32_t  ne10;
    int32_t  ne11;
    int32_t  ne12;
    int32_t  ne13;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    int32_t  ne0;
    int32_t  ne1;
    int32_t  ne2;
    int32_t  ne3;
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
    // the host fills this from dst->op_params[0]
    int32_t  dim;
 } ggml_metal_kargs_concat;
 // Kernel arguments shared by the binary-op kernels (the host encoder in ggml-metal.m
 // uses it for pipelines such as ADD and SUB, including their "row" variants).
 // It is passed by value with setBytes at buffer index 0, so field order and types
 // must stay in sync with the kernels that read it.
 // NOTE(review): ne0x/nb0x, ne1x/nb1x and ne/nb presumably describe src0, src1 and dst
 // (element counts and strides) - confirm against the kernels.
 typedef struct {
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int32_t  ne10;
    int32_t  ne11;
    int32_t  ne12;
    int32_t  ne13;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    int32_t  ne0;
    int32_t  ne1;
    int32_t  ne2;
    int32_t  ne3;
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
    // NOTE(review): looks like an extra offset supplied by the host - confirm its units and use
    uint64_t offs;
 } ggml_metal_kargs_bin;
 // Kernel arguments for the repeat kernels.
 // The host encoder in ggml-metal.m passes this by value with setBytes at buffer index 0;
 // field order and types must stay in sync with the kernels that read it.
 // NOTE(review): ne0x/nb0x presumably describe the source tensor and ne/nb the
 // destination (element counts and strides) - confirm against the kernels.
 typedef struct {
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int32_t  ne0;
    int32_t  ne1;
    int32_t  ne2;
    int32_t  ne3;
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
 } ggml_metal_kargs_repeat;
 // Kernel arguments for the copy kernels (the host encoder in ggml-metal.m uses it with
 // the CPY_F32_F32 pipeline), passed by value with setBytes at buffer index 0.
 // Unlike most structs in this header, the element counters here are int64_t rather
 // than int32_t.
 // NOTE(review): ne0x/nb0x presumably describe the source tensor and ne/nb the
 // destination (element counts and strides) - confirm against the kernels.
 typedef struct {
    int64_t  ne00;
    int64_t  ne01;
    int64_t  ne02;
    int64_t  ne03;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int64_t  ne0;
    int64_t  ne1;
    int64_t  ne2;
    int64_t  ne3;
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
 } ggml_metal_kargs_cpy;
 // Kernel arguments for the rope kernels.
 // NOTE(review): no use of this struct is visible in this part of the change; the
 // field meanings below are inferred from their names - confirm against the kernel.
 typedef struct {
    // presumably the source tensor's element counts and strides
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    // presumably the destination tensor's element counts and strides
    int32_t  ne0;
    int32_t  ne1;
    int32_t  ne2;
    int32_t  ne3;
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
    // rope-specific parameters - TODO confirm exact semantics against the kernel
    int32_t  n_past;
    int32_t  n_dims;
    int32_t  n_ctx_orig;
    float    freq_base;
    float    freq_scale;
    float    ext_factor;
    float    attn_factor;
    float    beta_fast;
    float    beta_slow;
 } ggml_metal_kargs_rope;
 // Kernel arguments for the flash_attn_ext kernels.
 // NOTE(review): no use of this struct is visible in this part of the change; the
 // field meanings are inferred from their names - confirm against the kernel.
 // The ne_12_* / nb_12_* fields carry a single set of values for both K and V
 // (see the "same shape" note below).
 typedef struct {
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int32_t  ne11;
    int32_t  ne_12_2; // assume K and V are same shape
    int32_t  ne_12_3;
    uint64_t nb_12_1;
    uint64_t nb_12_2;
    uint64_t nb_12_3;
    uint64_t nb31;
    int32_t  ne1;
    int32_t  ne2;
    float    scale;
    float    max_bias;
    float    m0;
    float    m1;
    // 16-bit field between 32-bit floats; keep order and types as-is, since the
    // layout has to match whatever reads this struct
    uint16_t n_head_log2;
    float    logit_softcap;
 } ggml_metal_kargs_flash_attn_ext;
 // Kernel arguments for the matrix-matrix multiplication kernels.
 // The host encoder in ggml-metal.m fills this on its mat-mat path and passes it by
 // value with setBytes at buffer index 0; field order and types must stay in sync
 // with the kernels that read it.
 // Only a subset of the ne/nb values is carried.
 // NOTE(review): ne0x/nb0x and ne1x/nb1x presumably describe src0 and src1, and
 // ne0/ne1 the destination - confirm against the kernels.
 typedef struct {
    int32_t  ne00;
    int32_t  ne02;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int32_t  ne12;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    int32_t  ne0;
    int32_t  ne1;
    // NOTE(review): r2/r3 look like broadcast ratios for dims 2 and 3 - confirm
    int16_t  r2;
    int16_t  r3;
 } ggml_metal_kargs_mul_mm;
 // Kernel arguments for the matrix-vector multiplication kernels.
 // NOTE(review): no use of this struct is visible in this part of the change; the
 // field meanings are inferred from their names - confirm against the kernels.
 // ne0x/nb0x and ne1x/nb1x presumably describe src0 and src1, and ne0/ne1 the destination.
 typedef struct {
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    int32_t  ne10;
    int32_t  ne11;
    int32_t  ne12;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    int32_t  ne0;
    int32_t  ne1;
    // NOTE(review): r2/r3 look like broadcast ratios for dims 2 and 3 - confirm
    int16_t  r2;
    int16_t  r3;
 } ggml_metal_kargs_mul_mv;
 // Kernel arguments for the indirect ("_id") matrix-matrix multiplication kernels.
 // NOTE(review): no use of this struct is visible in this part of the change; the
 // field meanings are inferred from their names - confirm against the kernels.
 typedef struct {
    // presumably the element counts and a stride of the ids tensor - TODO confirm
    int32_t  nei0;
    int32_t  nei1;
    uint64_t nbi1;
    int32_t  ne00;
    int32_t  ne02;
    uint64_t nb01;
    uint64_t nb02;
    int32_t  ne11;
    int32_t  ne12;
    int32_t  ne13;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    int32_t  ne0;
    int32_t  ne1;
 } ggml_metal_kargs_mul_mm_id;
 // Kernel arguments for the indirect ("_id") matrix-vector multiplication kernels.
 // NOTE(review): no use of this struct is visible in this part of the change; the
 // field meanings are inferred from their names - confirm against the kernels.
 typedef struct {
    // presumably the element counts and a stride of the ids tensor - TODO confirm
    int32_t  nei0;
    int32_t  nei1;
    uint64_t nbi1;
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    int32_t  ne10;
    int32_t  ne11;
    int32_t  ne12;
    int32_t  ne13;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    int32_t  ne0;
    int32_t  ne1;
    uint64_t nb1;
 } ggml_metal_kargs_mul_mv_id;
 // Kernel arguments for the norm kernel.
 // Has the same fields, in the same order, as ggml_metal_kargs_rms_norm.
 // NOTE(review): no use of this struct is visible in this part of the change - confirm
 // field semantics against the kernel.
 typedef struct {
    int32_t  ne00;
    // presumably ne00/4, for kernels that process four elements at a time - TODO confirm
    int32_t  ne00_4;
    uint64_t nb01;
    float    eps;
 } ggml_metal_kargs_norm;
 // Kernel arguments for the rms_norm kernel.
 // Has the same fields, in the same order, as ggml_metal_kargs_norm.
 // NOTE(review): no use of this struct is visible in this part of the change - confirm
 // field semantics against the kernel.
 typedef struct {
    int32_t  ne00;
    // presumably ne00/4, for kernels that process four elements at a time - TODO confirm
    int32_t  ne00_4;
    uint64_t nb01;
    float    eps;
 } ggml_metal_kargs_rms_norm;
 #endif // GGML_METAL_IMPL
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@ -2,6 +2,7 @@
 #import "ggml-impl.h"
 #import "ggml-backend-impl.h"
 #import "ggml-metal-impl.h"
 #import <Foundation/Foundation.h>
@ -1193,35 +1194,39 @@ static void ggml_metal_encode_node(
                const int32_t dim = ((const int32_t *) dst->op_params)[0];
                ggml_metal_kargs_concat args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
                    /*.ne02 =*/ ne02,
                    /*.ne03 =*/ ne03,
                    /*.nb00 =*/ nb00,
                    /*.nb01 =*/ nb01,
                    /*.nb02 =*/ nb02,
                    /*.nb03 =*/ nb03,
                    /*.ne10 =*/ ne10,
                    /*.ne11 =*/ ne11,
                    /*.ne12 =*/ ne12,
                    /*.ne13 =*/ ne13,
                    /*.nb10 =*/ nb10,
                    /*.nb11 =*/ nb11,
                    /*.nb12 =*/ nb12,
                    /*.nb13 =*/ nb13,
                    /*.ne0  =*/ ne0,
                    /*.ne1  =*/ ne1,
                    /*.ne2  =*/ ne2,
                    /*.ne3  =*/ ne3,
                    /*.nb0  =*/ nb0,
                    /*.nb1  =*/ nb1,
                    /*.nb2  =*/ nb2,
                    /*.nb3  =*/ nb3,
                    /*.dim  =*/ dim,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
                [encoder setBytes:&dim  length:sizeof(dim)  atIndex:27];
                const int nth = MIN(1024, ne0);
@ -1239,8 +1244,6 @@ static void ggml_metal_encode_node(
                bool bcast_row = false;
                int64_t nb = ne00; // used by the "row" kernels
                id<MTLComputePipelineState> pipeline = nil;
                if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
@ -1249,7 +1252,6 @@ static void ggml_metal_encode_node(
                    // src1 is a row
                    GGML_ASSERT(ne11 == 1);
                    nb = ne00 / 4;
                    switch (dst->op) {
                        case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
@ -1269,36 +1271,39 @@ static void ggml_metal_encode_node(
                    }
                }
                ggml_metal_kargs_bin args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
                    /*.ne02 =*/ ne02,
                    /*.ne03 =*/ ne03,
                    /*.nb00 =*/ nb00,
                    /*.nb01 =*/ nb01,
                    /*.nb02 =*/ nb02,
                    /*.nb03 =*/ nb03,
                    /*.ne10 =*/ ne10,
                    /*.ne11 =*/ ne11,
                    /*.ne12 =*/ ne12,
                    /*.ne13 =*/ ne13,
                    /*.nb10 =*/ nb10,
                    /*.nb11 =*/ nb11,
                    /*.nb12 =*/ nb12,
                    /*.nb13 =*/ nb13,
                    /*.ne0  =*/ ne0,
                    /*.ne1  =*/ ne1,
                    /*.ne2  =*/ ne2,
                    /*.ne3  =*/ ne3,
                    /*.nb0  =*/ nb0,
                    /*.nb1  =*/ nb1,
                    /*.nb2  =*/ nb2,
                    /*.nb3  =*/ nb3,
                    /*.offs =*/ offs,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
                [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
                [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
                if (bcast_row) {
                    const int64_t n = ggml_nelements(dst)/4;
@ -1322,25 +1327,29 @@ static void ggml_metal_encode_node(
                    default: GGML_ABORT("fatal error");
                }
                ggml_metal_kargs_repeat args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
                    /*.ne02 =*/ ne02,
                    /*.ne03 =*/ ne03,
                    /*.nb00 =*/ nb00,
                    /*.nb01 =*/ nb01,
                    /*.nb02 =*/ nb02,
                    /*.nb03 =*/ nb03,
                    /*.ne0  =*/ ne0,
                    /*.ne1  =*/ ne1,
                    /*.ne2  =*/ ne2,
                    /*.ne3  =*/ ne3,
                    /*.nb0  =*/ nb0,
                    /*.nb1  =*/ nb1,
                    /*.nb2  =*/ nb2,
                    /*.nb3  =*/ nb3,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
                [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
                [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
                [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
@ -1369,25 +1378,29 @@ static void ggml_metal_encode_node(
                    const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
                    ggml_metal_kargs_cpy args = {
                        /*.ne00 =*/ ne00,
                        /*.ne01 =*/ ne01,
                        /*.ne02 =*/ ne02,
                        /*.ne03 =*/ ne03,
                        /*.nb00 =*/ nb00,
                        /*.nb01 =*/ nb01,
                        /*.nb02 =*/ nb02,
                        /*.nb03 =*/ nb03,
                        /*.ne0  =*/ ne0,
                        /*.ne1  =*/ ne1,
                        /*.ne2  =*/ ne2,
                        /*.ne3  =*/ ne3,
                        /*.nb0  =*/ nb0,
                        /*.nb1  =*/ nb1,
                        /*.nb2  =*/ nb2,
                        /*.nb3  =*/ nb3,
                    };
                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                    [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                    [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                    [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
                    [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
                    [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
                    [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
                    [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
                    [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
                    [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
                    [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
                    [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
                    [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
                    [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
                    [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
                    [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
                    [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
                    [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
@ -1396,35 +1409,39 @@ static void ggml_metal_encode_node(
                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
                ggml_metal_kargs_bin args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
                    /*.ne02 =*/ ne02,
                    /*.ne03 =*/ ne03,
                    /*.nb00 =*/ nb00,
                    /*.nb01 =*/ pnb1,
                    /*.nb02 =*/ pnb2,
                    /*.nb03 =*/ pnb3,
                    /*.ne10 =*/ ne10,
                    /*.ne11 =*/ ne11,
                    /*.ne12 =*/ ne12,
                    /*.ne13 =*/ ne13,
                    /*.nb10 =*/ nb10,
                    /*.nb11 =*/ nb11,
                    /*.nb12 =*/ nb12,
                    /*.nb13 =*/ nb13,
                    /*.ne0  =*/ ne0,
                    /*.ne1  =*/ ne1,
                    /*.ne2  =*/ ne2,
                    /*.ne3  =*/ ne3,
                    /*.nb0  =*/ nb0,
                    /*.nb1  =*/ pnb1,
                    /*.nb2  =*/ pnb2,
                    /*.nb3  =*/ pnb3,
                    /*.offs =*/ offs,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
                [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
                [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
                [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
                [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
                [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
                [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
                [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
                [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
                [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
                [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
                [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
@ -1640,6 +1657,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@ -1715,6 +1733,8 @@ static void ggml_metal_encode_node(
                const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
                const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
                // TODO: add ggml_metal_kargs struct
                // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                if (id_src1) {
@ -1731,6 +1751,7 @@ static void ggml_metal_encode_node(
                [encoder setBytes:&m0          length:sizeof(m0)          atIndex:8];
                [encoder setBytes:&m1          length:sizeof(m1)          atIndex:9];
                [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@ -1747,6 +1768,7 @@ static void ggml_metal_encode_node(
                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
                }
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@ -1771,6 +1793,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
@ -1841,6 +1864,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
@ -1959,24 +1983,29 @@ static void ggml_metal_encode_node(
                                default: GGML_ABORT("MUL MAT-MAT not implemented");
                            }
                            ggml_metal_kargs_mul_mm args = {
                                /*.ne00 =*/ ne00,
                                /*.ne02 =*/ ne02,
                                /*.nb01 =*/ nb01,
                                /*.nb02 =*/ nb02,
                                /*.nb03 =*/ nb03,
                                /*.ne12 =*/ ne12,
                                /*.nb10 =*/ nb10,
                                /*.nb11 =*/ nb11,
                                /*.nb12 =*/ nb12,
                                /*.nb13 =*/ nb13,
                                /*.ne0  =*/ ne0,
                                /*.ne1  =*/ ne1,
                                /*.r2   =*/ r2,
                                /*.r3   =*/ r3,
                            };
                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                            [encoder setBytes:&args    length:sizeof(args) atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
-                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
+                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
+
                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
                            [encoder setBytes:&nb03    length:sizeof(nb03) atIndex:7];
                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:9];
                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:10];
                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:11];
                            [encoder setBytes:&nb13    length:sizeof(nb13) atIndex:12];
                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:15];
                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:16];
                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                        } else {
@ -2154,28 +2183,32 @@ static void ggml_metal_encode_node(
                                    }
                            };
                            ggml_metal_kargs_mul_mv args = {
                                /*.ne00 =*/ ne00,
                                /*.ne01 =*/ ne01,
                                /*.ne02 =*/ ne02,
                                /*.nb00 =*/ nb00,
                                /*.nb01 =*/ nb01,
                                /*.nb02 =*/ nb02,
                                /*.nb03 =*/ nb03,
                                /*.ne10 =*/ ne10,
                                /*.ne11 =*/ ne11,
                                /*.ne12 =*/ ne12,
                                /*.nb10 =*/ nb10,
                                /*.nb11 =*/ nb11,
                                /*.nb12 =*/ nb12,
                                /*.nb13 =*/ nb13,
                                /*.ne0  =*/ ne0,
                                /*.ne1  =*/ ne1,
                                /*.r2   =*/ r2,
                                /*.r3   =*/ r3,
                            };
                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:13];
                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:14];
                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:15];
                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:16];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:18];
                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:19];
                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:20];
                            if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
@ -2288,27 +2321,30 @@ static void ggml_metal_encode_node(
                        default: GGML_ABORT("MUL_MAT_ID not implemented");
                    }
                    ggml_metal_kargs_mul_mm_id args = {
                        /*.nei0 =*/ ne20,
                        /*.nei1 =*/ ne21,
                        /*.nbi1 =*/ nb21,
                        /*.ne00 =*/ ne00,
                        /*.ne02 =*/ ne02,
                        /*.nb01 =*/ nb01,
                        /*.nb02 =*/ nb02,
                        /*.ne11 =*/ ne11,
                        /*.ne12 =*/ ne12,
                        /*.ne13 =*/ ne13,
                        /*.nb10 =*/ nb10,
                        /*.nb11 =*/ nb11,
                        /*.nb12 =*/ nb12,
                        /*.ne0  =*/ ne0,
                        /*.ne1  =*/ ne1,
                    };
                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
-                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:3];
+                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
-                    [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
+                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:4];
                    [encoder setBytes:&ne21    length:sizeof(ne21) atIndex:5];
                    [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
                    [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:7];
                    [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:8];
                    [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:9];
                    [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:10];
                    [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:11];
                    [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:12];
                    [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:13];
                    [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:14];
                    [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:15];
                    [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:16];
                    [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:17];
                    [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:18];
                    [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:19];
                    [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
@ -2467,30 +2503,34 @@ static void ggml_metal_encode_node(
                        GGML_ASSERT(ne00 >= nth0*nth1);
                    }
                    ggml_metal_kargs_mul_mv_id args = {
                        /*.nei0 =*/ ne20,
                        /*.nei1 =*/ ne21,
                        /*.nbi1 =*/ nb21,
                        /*.ne00 =*/ ne00,
                        /*.ne01 =*/ ne01,
                        /*.ne02 =*/ ne02,
                        /*.nb00 =*/ nb00,
                        /*.nb01 =*/ nb01,
                        /*.nb02 =*/ nb02,
                        /*.ne10 =*/ ne10,
                        /*.ne11 =*/ ne11,
                        /*.ne12 =*/ ne12,
                        /*.ne13 =*/ ne13,
                        /*.nb10 =*/ nb10,
                        /*.nb11 =*/ nb11,
                        /*.nb12 =*/ nb12,
                        /*.ne0  =*/ ne0,
                        /*.ne1  =*/ ne1,
                        /*.nb1  =*/ nb1,
                    };
                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
-                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
+                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:3];
-                    [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];
                    [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
                    [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
                    [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8];
                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9];
                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10];
                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11];
                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12];
                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13];
                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14];
                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15];
                    [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16];
                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17];
                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18];
                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19];
                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:20];
                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:21];
                    [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:22];
                    const int64_t _ne1 = 1;
                    const int tgz = dst_rows;
@ -2563,6 +2603,7 @@ static void ggml_metal_encode_node(
                    default: GGML_ABORT("not implemented");
                }
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
@ -2586,20 +2627,28 @@ static void ggml_metal_encode_node(
                float eps;
                memcpy(&eps, dst->op_params, sizeof(float));
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
                int nth = 32; // SIMD width
-                while (nth < ne00/4 && nth < 1024) {
+                while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
                    nth *= 2;
                }
-                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+                nth = MIN(nth, ne00/4);
                ggml_metal_kargs_rms_norm args = {
                    /*.ne00   =*/ ne00,
                    /*.ne00_4 =*/ ne00/4,
                    /*.nb01   =*/ nb01,
                    /*.eps    =*/ eps,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+
                [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                const int64_t nrows = ggml_nrows(src0);
@ -2624,6 +2673,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
                [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
@ -2641,22 +2691,35 @@ static void ggml_metal_encode_node(
            } break;
        case GGML_OP_NORM:
            {
                GGML_ASSERT(ne00 % 4 == 0);
                GGML_ASSERT(ggml_is_contiguous_1(src0));
                float eps;
                memcpy(&eps, dst->op_params, sizeof(float));
                const int nth = MIN(256, ne00);
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
                int nth = 32; // SIMD width
                while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
                    nth *= 2;
                }
                nth = MIN(nth, ne00/4);
                ggml_metal_kargs_norm args = {
                    /*.ne00   =*/ ne00,
                    /*.ne00_4 =*/ ne00/4,
                    /*.nb01   =*/ nb01,
                    /*.eps    =*/ eps,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+
-                [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
                const int64_t nrows = ggml_nrows(src0);
@ -2706,40 +2769,44 @@ static void ggml_metal_encode_node(
                    };
                }
                ggml_metal_kargs_rope args = {
                    /*.ne00        =*/ ne00,
                    /*.ne01        =*/ ne01,
                    /*.ne02        =*/ ne02,
                    /*.ne03        =*/ ne03,
                    /*.nb00        =*/ nb00,
                    /*.nb01        =*/ nb01,
                    /*.nb02        =*/ nb02,
                    /*.nb03        =*/ nb03,
                    /*.ne0         =*/ ne0,
                    /*.ne1         =*/ ne1,
                    /*.ne2         =*/ ne2,
                    /*.ne3         =*/ ne3,
                    /*.nb0         =*/ nb0,
                    /*.nb1         =*/ nb1,
                    /*.nb2         =*/ nb2,
                    /*.nb3         =*/ nb3,
                    /*.n_past      =*/ n_past,
                    /*.n_dims      =*/ n_dims,
                    /*.n_ctx_orig  =*/ n_ctx_orig,
                    /*.freq_base   =*/ freq_base,
                    /*.freq_scale  =*/ freq_scale,
                    /*.ext_factor  =*/ ext_factor,
                    /*.attn_factor =*/ attn_factor,
                    /*.beta_fast   =*/ beta_fast,
                    /*.beta_slow   =*/ beta_slow,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                [encoder setBytes:&args length:sizeof(args)     atIndex:0];
-                [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0     atIndex:1];
                [encoder setBuffer:id_src1 offset:offs_src1     atIndex:2];
                if (id_src2 != nil) {
-                    [encoder setBuffer:id_src2 offset:offs_src2        atIndex:2];
+                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
                } else {
-                    [encoder setBuffer:id_src0 offset:offs_src0        atIndex:2];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
                }
-                [encoder setBuffer:id_dst      offset:offs_dst         atIndex:3];
+                [encoder setBuffer:id_dst  offset:offs_dst      atIndex:4];
                [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:4];
                [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:5];
                [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:6];
                [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:7];
                [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:8];
                [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:9];
                [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:10];
                [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:11];
                [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:12];
                [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:13];
                [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:14];
                [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:15];
                [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:16];
                [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:17];
                [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:18];
                [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:19];
                [encoder setBytes:&n_past      length:sizeof(     int) atIndex:20];
                [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:21];
                [encoder setBytes:&n_ctx_orig  length:sizeof(     int) atIndex:22];
                [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
                [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
                [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
                [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
                [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
                [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
            } break;
@ -2796,6 +2863,7 @@ static void ggml_metal_encode_node(
                    default: GGML_ABORT("fatal error");
                };
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src1 offset:offs_src1       atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
@ -2836,6 +2904,7 @@ static void ggml_metal_encode_node(
                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@ -2870,6 +2939,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@ -2906,6 +2976,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARANGE_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_dst  offset:offs_dst    atIndex:0];
                [encoder setBytes:&ne0   length:sizeof(ne0)   atIndex:1];
@ -2927,6 +2998,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@ -2965,6 +3037,7 @@ static void ggml_metal_encode_node(
                    default: GGML_ABORT("fatal error");
                };
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
                [encoder setBuffer:id_dst      offset:offs_dst         atIndex:1];
@ -2983,6 +3056,7 @@ static void ggml_metal_encode_node(
                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
@ -3224,37 +3298,41 @@ static void ggml_metal_encode_node(
                    }
                }
                ggml_metal_kargs_flash_attn_ext args = {
                    /*.ne01          =*/ ne01,
                    /*.ne02          =*/ ne02,
                    /*.ne03          =*/ ne03,
                    /*.nb01          =*/ nb01,
                    /*.nb02          =*/ nb02,
                    /*.nb03          =*/ nb03,
                    /*.ne11          =*/ ne11,
                    /*.ne_12_2       =*/ ne12,
                    /*.ne_12_3       =*/ ne13,
                    /*.nb_12_1       =*/ nb11,
                    /*.nb_12_2       =*/ nb12,
                    /*.nb_12_3       =*/ nb13,
                    /*.nb31          =*/ nb31,
                    /*.ne1           =*/ ne1,
                    /*.ne2           =*/ ne2,
                    /*.scale         =*/ scale,
                    /*.max_bias      =*/ max_bias,
                    /*.m0            =*/ m0,
                    /*.m1            =*/ m1,
                    /*.n_head_log2   =*/ n_head_log2,
                    /*.logit_softcap =*/ logit_softcap,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0     offset:offs_src0           atIndex:0];
+                [encoder setBytes:&args length:sizeof(args)     atIndex:0];
-                [encoder setBuffer:id_src1     offset:offs_src1           atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0     atIndex:1];
-                [encoder setBuffer:id_src2     offset:offs_src2           atIndex:2];
+                [encoder setBuffer:id_src1 offset:offs_src1     atIndex:2];
                [encoder setBuffer:id_src2 offset:offs_src2     atIndex:3];
                if (id_src3) {
-                    [encoder setBuffer:id_src3     offset:offs_src3           atIndex:3];
+                    [encoder setBuffer:id_src3 offset:offs_src3 atIndex:4];
                } else {
-                    [encoder setBuffer:id_src0     offset:offs_src0           atIndex:3];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:4];
                }
-                [encoder setBuffer:id_dst        offset:offs_dst              atIndex:4];
+                [encoder setBuffer:id_dst offset:offs_dst       atIndex:5];
                [encoder setBytes:&ne01          length:sizeof( int64_t)      atIndex:5];
                [encoder setBytes:&ne02          length:sizeof( int64_t)      atIndex:6];
                [encoder setBytes:&ne03          length:sizeof( int64_t)      atIndex:7];
                [encoder setBytes:&nb01          length:sizeof(uint64_t)      atIndex:8];
                [encoder setBytes:&nb02          length:sizeof(uint64_t)      atIndex:9];
                [encoder setBytes:&nb03          length:sizeof(uint64_t)      atIndex:10];
                [encoder setBytes:&ne11          length:sizeof( int64_t)      atIndex:11];
                [encoder setBytes:&ne12          length:sizeof( int64_t)      atIndex:12];
                [encoder setBytes:&ne13          length:sizeof( int64_t)      atIndex:13];
                [encoder setBytes:&nb11          length:sizeof(uint64_t)      atIndex:14];
                [encoder setBytes:&nb12          length:sizeof(uint64_t)      atIndex:15];
                [encoder setBytes:&nb13          length:sizeof(uint64_t)      atIndex:16];
                [encoder setBytes:&nb31          length:sizeof(uint64_t)      atIndex:17];
                [encoder setBytes:&ne1           length:sizeof( int64_t)      atIndex:18];
                [encoder setBytes:&ne2           length:sizeof( int64_t)      atIndex:19];
                [encoder setBytes:&scale         length:sizeof(   float)      atIndex:20];
                [encoder setBytes:&max_bias      length:sizeof(   float)      atIndex:21];
                [encoder setBytes:&m0            length:sizeof(m0)            atIndex:22];
                [encoder setBytes:&m1            length:sizeof(m1)            atIndex:23];
                [encoder setBytes:&n_head_log2   length:sizeof(n_head_log2)   atIndex:24];
                [encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:25];
                if (!use_vec_kernel) {
                    // half8x8 kernel
@ -3389,25 +3467,29 @@ static void ggml_metal_encode_node(
                    default: GGML_ABORT("not implemented");
                }
                ggml_metal_kargs_cpy args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
                    /*.ne02 =*/ ne02,
                    /*.ne03 =*/ ne03,
                    /*.nb00 =*/ nb00,
                    /*.nb01 =*/ nb01,
                    /*.nb02 =*/ nb02,
                    /*.nb03 =*/ nb03,
                    /*.ne0  =*/ ne0,
                    /*.ne1  =*/ ne1,
                    /*.ne2  =*/ ne2,
                    /*.ne3  =*/ ne3,
                    /*.nb0  =*/ nb0,
                    /*.nb1  =*/ nb1,
                    /*.nb2  =*/ nb2,
                    /*.nb3  =*/ nb3,
                };
                [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
-                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
-                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
            } break;
@ -3452,6 +3534,7 @@ static void ggml_metal_encode_node(
                const int64_t n_threads = MIN((int64_t)[pipeline maxTotalThreadsPerThreadgroup], parallel_elements);
                const int64_t n_tg = (parallel_elements + n_threads - 1) / n_threads;
                // TODO: add ggml_metal_kargs struct
                [encoder setComputePipelineState:pipeline];
                [encoder setBuffer:id_src0 offset:offs_src0       atIndex:0];
                [encoder setBuffer:id_dst  offset:offs_dst        atIndex:1];
@ -3639,6 +3722,12 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
    return ctx->all_data;
 }
 // Fill `size` bytes of the tensor's data, starting `offset` bytes into it, with the byte `value`.
 // The buffer handle itself is not needed for the operation.
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    UNUSED(buffer);
    char * const dst = (char *)tensor->data + offset;
    memset(dst, value, size);
 }
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    memcpy((char *)tensor->data + offset, data, size);
@ -3671,7 +3760,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
    /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_metal_buffer_get_base,
    /* .init_tensor     = */ NULL,
-    /* .memset_tensor   = */ NULL,
+    /* .memset_tensor   = */ ggml_backend_metal_buffer_memset_tensor,
    /* .set_tensor      = */ ggml_backend_metal_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
    /* .cpy_tensor      = */ ggml_backend_metal_buffer_cpy_tensor,
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
--- a/ggml/src/ggml-musa/CMakeLists.txt
+++ b/ggml/src/ggml-musa/CMakeLists.txt
@ -58,19 +58,12 @@ if (MUSAToolkit_FOUND)
    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
    add_compile_definitions(GGML_USE_MUSA)
    add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
    add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
    if (GGML_CUDA_GRAPHS)
        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
    endif()
    if (GGML_CUDA_FORCE_DMMV)
        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
    endif()
    if (GGML_CUDA_FORCE_MMQ)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()
@ -83,10 +76,6 @@ if (MUSAToolkit_FOUND)
        add_compile_definitions(GGML_CUDA_NO_VMM)
    endif()
    if (DEFINED GGML_CUDA_DMMV_Y)
        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_DMMV_Y}) # for backwards compatibility
    endif()
    if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
        add_compile_definitions(GGML_CUDA_F16)
    endif()
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@ -0,0 +1,867 @@
 #include "ggml-opt.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <cinttypes>
 #include <map>
 #include <random>
 #include <vector>
 // In-memory dataset used for training/evaluation. Created by ggml_opt_dataset_init and
 // destroyed by ggml_opt_dataset_free. Data is handed out in units of "shards".
 struct ggml_opt_dataset {
    struct ggml_context   * ctx;    // metadata context holding the data/labels tensors
    ggml_backend_buffer_t   buf;    // buffer backing the tensors of ctx (allocated with the CPU buffer type)
    struct ggml_tensor    * data;   // F32 tensor with ne = [ne_datapoint, ndata]
    struct ggml_tensor    * labels; // F32 tensor with ne = [ne_label, ndata], or nullptr if there are no labels
    int64_t ndata;                  // total number of datapoints
    int64_t ndata_shard;            // number of datapoints per shard
    size_t  nbs_data;               // number of bytes of `data` per shard
    size_t  nbs_labels;             // number of bytes of `labels` per shard (0 without labels)
    std::vector<int64_t> permutation; // order in which shards are read; starts as the identity, see ggml_opt_dataset_shuffle
 };
 // State of one optimization session; created by ggml_opt_init, destroyed by ggml_opt_free.
 // Every member carries a default so that fields which a particular ggml_opt_init path does
 // not assign (e.g. adamw_params for forward-only or gradient-only builds) are never left
 // with an indeterminate value.
 struct ggml_opt_context {
    ggml_backend_sched_t    backend_sched        = nullptr; // scheduler used to allocate and run the graphs (not owned)
    ggml_cgraph           * allocated_graph      = nullptr; // graph for which the scheduler is currently allocated
    ggml_cgraph           * allocated_graph_copy = nullptr; // duplicate of allocated_graph that is actually computed
    struct ggml_context   * ctx_static           = nullptr; // tensors that persist across evaluations (loss, pred, momenta, ...)
    struct ggml_context   * ctx_static_cpu       = nullptr; // persistent tensors kept in a CPU buffer (optimizer parameters)
    struct ggml_context   * ctx_compute          = nullptr; // user-provided compute context (not owned)
    struct ggml_context   * ctx_copy             = nullptr; // context holding allocated_graph_copy
    ggml_backend_buffer_t   buf_static           = nullptr; // buffer backing ctx_static
    ggml_backend_buffer_t   buf_static_cpu       = nullptr; // buffer backing ctx_static_cpu
    std::mt19937            rng;                            // random number generator used for dataset shuffling
    struct ggml_tensor * inputs   = nullptr;
    struct ggml_tensor * outputs  = nullptr;
    struct ggml_tensor * labels   = nullptr; // nullptr for loss types that do not use labels
    struct ggml_tensor * loss     = nullptr;
    struct ggml_tensor * pred     = nullptr;
    struct ggml_tensor * ncorrect = nullptr; // nullptr when there are no labels
    struct ggml_cgraph * gf      = nullptr; // forward pass
    struct ggml_cgraph * gb_grad = nullptr; // forward + backward pass
    struct ggml_cgraph * gb_opt  = nullptr; // forward + backward pass + optimizer step
    int64_t iter       = 1; // optimizer iteration, starts at 1
    int32_t opt_period = 1; // number of gradient accumulation steps per optimizer step
    int32_t opt_i      = 0; // position within the current accumulation period
    bool    loss_per_datapoint = false;
    ggml_opt_get_optimizer_params get_opt_pars = nullptr; // callback returning the optimizer parameters
    void * get_opt_pars_ud = nullptr;                     // userdata passed to get_opt_pars
    struct ggml_tensor * adamw_params = nullptr;          // only created for the optimizer build type
 };
 // Accumulated results of one or more graph evaluations (see ggml_opt_eval_graph).
 struct ggml_opt_result {
    int64_t              ndata    = 0; // number of datapoints evaluated so far
    std::vector<float>   loss;         // one loss value per evaluated (physical) batch
    std::vector<int32_t> pred;         // predictions, appended batch after batch
    int64_t              ncorrect = 0; // number of correct predictions; set to -1 when it cannot be determined
    // Copied from the optimization context on the first evaluation and asserted to stay equal afterwards:
    bool loss_per_datapoint = false;
    int64_t opt_period = -1;
 };
 // ====== Dataset ======
 // Allocate a dataset for `ndata` datapoints of `ne_datapoint` F32 values each and, if
 // `ne_label > 0`, `ne_label` F32 label values per datapoint. The tensors live in a CPU buffer.
 // `ndata_shard` is the number of datapoints that are shuffled/copied together as one unit.
 // The returned dataset must be released with ggml_opt_dataset_free.
 ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
    GGML_ASSERT(ne_datapoint >  0);
    GGML_ASSERT(ne_label     >= 0);
    GGML_ASSERT(ndata        >  0);
    GGML_ASSERT(ndata_shard  >  0);

    ggml_opt_dataset_t dataset = new ggml_opt_dataset;
    dataset->ndata       = ndata;
    dataset->ndata_shard = ndata_shard;

    // The context only needs to hold the metadata of at most 2 tensors (data + labels).
    struct ggml_init_params ctx_params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
    dataset->ctx = ggml_init(ctx_params);

    dataset->data     = ggml_new_tensor_2d(dataset->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
    dataset->nbs_data = ggml_nbytes(dataset->data) * ndata_shard/ndata;

    const bool has_labels = ne_label > 0;
    dataset->labels     = has_labels ? ggml_new_tensor_2d(dataset->ctx, GGML_TYPE_F32, ne_label, ndata) : nullptr;
    dataset->nbs_labels = has_labels ? ggml_nbytes(dataset->labels) * ndata_shard/ndata : 0;

    dataset->buf = ggml_backend_alloc_ctx_tensors_from_buft(dataset->ctx, ggml_backend_cpu_buffer_type());

    // Start with the identity permutation over the shards.
    dataset->permutation.resize(ndata/ndata_shard);
    int64_t ishard = 0;
    for (int64_t & slot : dataset->permutation) {
        slot = ishard++;
    }
    return dataset;
 }
 // Release a dataset created by ggml_opt_dataset_init: its backend buffer, its ggml context
 // and the dataset object itself. Passing nullptr is a no-op, consistent with ggml_opt_free.
 void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
    if (dataset == nullptr) {
        return;
    }
    ggml_backend_buffer_free(dataset->buf);
    ggml_free(dataset->ctx);
    delete dataset;
 }
 // Accessor for the tensor that stores all datapoints of the dataset.
 struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
    struct ggml_tensor * const data = dataset->data;
    return data;
 }
 // Accessor for the tensor that stores all labels of the dataset (nullptr if it has none).
 struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset) {
    struct ggml_tensor * const labels = dataset->labels;
    return labels;
 }
 // Shuffle the shard order of the dataset using the RNG of the optimization context.
 // A negative `idata` shuffles all shards; otherwise only the shards covering the first
 // `idata` datapoints are shuffled (`idata` must be a multiple of the shard size).
 void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata) {
    GGML_ASSERT(idata <= dataset->ndata);

    std::vector<int64_t> & perm = dataset->permutation;
    auto shuffle_end = perm.end();
    if (idata >= 0) {
        GGML_ASSERT(idata % dataset->ndata_shard == 0);
        shuffle_end = perm.begin() + idata / dataset->ndata_shard;
    }
    std::shuffle(perm.begin(), shuffle_end, opt_ctx->rng);
 }
 // Copy batch number `ibatch` of the dataset into `data_batch` (and `labels_batch`, which
 // must be given if and only if the dataset has labels). The batch size is derived from the
 // byte size of `data_batch` and must be a whole number of shards. Shards are read in the
 // order given by the dataset's current permutation.
 void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor * data_batch, struct ggml_tensor * labels_batch, int64_t ibatch) {
    GGML_ASSERT(   data_batch && ggml_is_contiguous(data_batch));
    GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));

    const size_t nbs_data   = dataset->nbs_data;
    const size_t nbs_labels = dataset->nbs_labels;

    const size_t nb_data_batch = ggml_nbytes(data_batch);
    GGML_ASSERT(nb_data_batch % nbs_data == 0);
    const int64_t shards_per_batch = nb_data_batch / nbs_data;

    if (labels_batch) {
        GGML_ASSERT(ggml_nbytes(labels_batch) == shards_per_batch*nbs_labels);
    }

    const int64_t ishard_first = ibatch*shards_per_batch;
    GGML_ASSERT(ishard_first + shards_per_batch <= int64_t(dataset->permutation.size()));

    const char * src_data   = (const char *) dataset->data->data;
    const char * src_labels = labels_batch ? (const char *) dataset->labels->data : nullptr;

    for (int64_t i = 0; i < shards_per_batch; ++i) {
        const int64_t ishard = dataset->permutation[ishard_first + i];

        ggml_backend_tensor_set(data_batch, src_data + ishard*nbs_data, i*nbs_data, nbs_data);
        if (labels_batch) {
            ggml_backend_tensor_set(labels_batch, src_labels + ishard*nbs_labels, i*nbs_labels, nbs_labels);
        }
    }
 }
 // ====== Model / Context ======
 struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
    GGML_UNUSED(userdata);
    ggml_opt_optimizer_params result;
    result.adamw.alpha = 0.001f;
    result.adamw.beta1 = 0.9f;
    result.adamw.beta2 = 0.999f;
    result.adamw.eps   = 1e-8f;
    result.adamw.wd    = 0.0f;
    return result;
 }
 // Build a ggml_opt_params from the mandatory arguments, filling the rest with defaults:
 // full optimizer build, one physical batch per optimizer step, default AdamW parameters.
 struct ggml_opt_params ggml_opt_default_params(
        ggml_backend_sched_t backend_sched,
        struct ggml_context * ctx_compute,
        struct ggml_tensor * inputs,
        struct ggml_tensor * outputs,
        enum ggml_opt_loss_type loss_type) {
    struct ggml_opt_params defaults = {
        /*backend_sched   =*/ backend_sched,
        /*ctx_compute     =*/ ctx_compute,
        /*inputs          =*/ inputs,
        /*outputs         =*/ outputs,
        /*loss_type       =*/ loss_type,
        /*build_type      =*/ GGML_OPT_BUILD_TYPE_OPT,
        /*opt_period      =*/ 1,
        /*get_opt_pars    =*/ ggml_opt_get_default_optimizer_params,
        /*get_opt_pars_ud =*/ nullptr,
    };
    return defaults;
 }
 // Recursively duplicate `tensor` (metadata only) into `ctx`, reusing duplicates already
 // recorded in `tensor_map`. The duplicate shares data/buffer/extra pointers with the
 // original while its view source and sources are mapped to their own duplicates.
 // Returns nullptr for a null input.
 static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
    if (tensor == nullptr) {
        return nullptr;
    }

    const auto known = tensor_map.find(tensor);
    if (known != tensor_map.end()) {
        return known->second;
    }

    ggml_tensor * copy = ggml_dup_tensor(ctx, tensor);
    // Register the duplicate before recursing so that it is found when reached again.
    tensor_map[tensor] = copy;

    copy->op = tensor->op;
    for (int idim = 0; idim < GGML_MAX_DIMS; idim++) {
        copy->nb[idim] = tensor->nb[idim];
    }
    copy->flags = tensor->flags;
    memcpy(copy->op_params, tensor->op_params, sizeof(tensor->op_params));
    strcpy(copy->name, tensor->name);

    copy->data      = tensor->data;
    copy->buffer    = tensor->buffer;
    copy->extra     = tensor->extra;
    copy->view_offs = tensor->view_offs;

    copy->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
    for (int isrc = 0; isrc < GGML_MAX_SRC; isrc++) {
        copy->src[isrc] = map_tensor(tensor_map, ctx, tensor->src[isrc]);
    }

    return copy;
 }
 // Create a duplicate of `graph` in `ctx`: all leafs and nodes are duplicated via map_tensor
 // and the gradient / gradient accumulator pointers of the original nodes are carried over
 // to the corresponding nodes of the duplicate. Returns the new graph.
 static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * graph) {
    std::map<ggml_tensor *, ggml_tensor *> tensor_map;
    ggml_cgraph * new_graph = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    for (int i = 0; i < graph->n_leafs; i++) {
        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->leafs[i]));
    }
    for (int i = 0; i < graph->n_nodes; i++) {
        ggml_build_forward_expand(new_graph, map_tensor(tensor_map, ctx, graph->nodes[i]));
    }
    for (int i = 0; i < graph->n_nodes; ++i) {
        // The gradient arrays are indexed by each graph's own hash set slot, so the slot has
        // to be looked up separately for the source node and for its duplicate.
        const size_t igrad_src = ggml_hash_find(&graph->visited_hash_set, graph->nodes[i]);
        const size_t igrad_dst = ggml_hash_find(&new_graph->visited_hash_set, new_graph->nodes[i]);
        // Copy from the source graph INTO the duplicate (the source graph is left untouched).
        new_graph->grads[igrad_dst]     = graph->grads[igrad_src];
        new_graph->grad_accs[igrad_dst] = graph->grad_accs[igrad_src];
    }
    return new_graph;
 }
 // Make sure the scheduler is allocated for `graph`. Nothing happens if `graph` is already
 // the allocated one; otherwise the scheduler is reset, the previous graph copy is discarded
 // and a fresh duplicate of `graph` is created and allocated.
 static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
    GGML_ASSERT(graph);
    if (graph == opt_ctx->allocated_graph) {
        return;
    }

    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph

    // Replace the context of the previous graph copy with a new, empty one.
    ggml_free(opt_ctx->ctx_copy);
    ggml_init_params copy_params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,
    };
    opt_ctx->ctx_copy = ggml_init(copy_params);

    ggml_cgraph * graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
    opt_ctx->allocated_graph_copy = graph_copy;
    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, graph_copy);
    opt_ctx->allocated_graph = graph;
 }
 // Create an optimization context from `params`.
 //
 // Depending on params.build_type up to three graphs are built in params.ctx_compute:
 //   - gf:      forward pass (outputs, loss, pred and, with labels, ncorrect)
 //   - gb_grad: gf + backward pass (not built for GGML_OPT_BUILD_TYPE_FORWARD)
 //   - gb_opt:  gb_grad + one AdamW step per parameter (GGML_OPT_BUILD_TYPE_OPT only)
 // The most complete graph that was built is allocated before returning.
 // The returned context must be released with ggml_opt_free.
 //
 // NOTE(review): adamw_params is only assigned on the GGML_OPT_BUILD_TYPE_OPT path.
 ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
    ggml_opt_context_t result = new struct ggml_opt_context;
    result->backend_sched        = params.backend_sched;
    result->allocated_graph      = nullptr;
    result->allocated_graph_copy = nullptr;
    result->ctx_compute          = params.ctx_compute;
    result->ctx_copy             = nullptr;
    result->inputs               = params.inputs;
    result->outputs              = params.outputs;
    result->iter                 = 1;
    result->opt_period           = params.opt_period;
    result->opt_i                = 0;
    result->get_opt_pars         = params.get_opt_pars;
    result->get_opt_pars_ud      = params.get_opt_pars_ud;
    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
    GGML_ASSERT(result->opt_period >= 1);
    // Gradients are accumulated for gradient-only builds and for optimizer builds that
    // use more than one physical batch per optimizer step.
    const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
        (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
    ggml_set_input(result->inputs);
    ggml_set_output(result->outputs);
    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
    ggml_build_forward_expand(result->gf, result->outputs);
    // Count the trainable parameters among the forward graph nodes to size the static context.
    int n_param = 0;
    for (int i = 0; i < result->gf->n_nodes; ++i) {
        if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
            n_param++;
        }
    }
    {
        // The static context is used for:
        //   - gradients (1 tensor per param if using gradient accumulation)
        //   - optimizer momenta (2 tensors per param)
        //   - labels
        //   - loss + its gradient (up to 5 tensors)
        //   - pred
        //   - ncorrect (2 tensors).
        const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
        const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
        // Note: from here to the end of this scope `params` refers to the ggml_init_params below,
        // not to the function argument.
        struct ggml_init_params params = {
            /*.mem_size   =*/ size_meta,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
        result->ctx_static = ggml_init(params);
    }
    {
        // The static cpu context is used for:
        //   - optimizer parameters (1 for the entire context)
        const size_t size_meta = 1 * ggml_tensor_overhead();
        struct ggml_init_params params = {
            /*.mem_size   =*/ size_meta,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
        result->ctx_static_cpu = ggml_init(params);
    }
    // Build the loss on top of the outputs. Losses that are rescaled here are divided by
    // opt_period; ggml_opt_result_loss multiplies this factor back in for per-datapoint losses.
    switch (params.loss_type) {
        case GGML_OPT_LOSS_TYPE_MEAN: {
            result->labels = nullptr;
            result->loss = ggml_sum(result->ctx_static, result->outputs);
            ggml_set_name(result->loss, "loss_sum");
            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
            ggml_set_name(result->loss, "loss_mean");
            result->loss_per_datapoint = true;
            break;
        }
        case GGML_OPT_LOSS_TYPE_SUM: {
            result->labels = nullptr;
            result->loss = ggml_sum(result->ctx_static, result->outputs);
            ggml_set_name(result->loss, "loss_sum");
            result->loss_per_datapoint = false;
            break;
        }
        case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
            // The labels have the same type and shape as the outputs.
            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
            ggml_set_input(result->labels);
            ggml_set_name(result->labels, "labels");
            result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
            ggml_set_name(result->loss, "loss_cross_entropy");
            if (result->opt_period > 1) {
                result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
                ggml_set_name(result->loss, "loss_cross_entropy_scaled");
            }
            result->loss_per_datapoint = true;
            break;
        }
        case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
            ggml_set_input(result->labels);
            ggml_set_name(result->labels, "labels");
            result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
            ggml_set_name(result->loss, "loss_error");
            result->loss = ggml_sqr(result->ctx_static, result->loss);
            ggml_set_name(result->loss, "loss_squared_error");
            result->loss = ggml_sum(result->ctx_static, result->loss);
            ggml_set_name(result->loss, "loss_sum_squared_error");
            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
            ggml_set_name(result->loss, "loss_mean_squared_error");
            result->loss_per_datapoint = true;
            break;
        }
    }
    ggml_set_output(result->loss);
    ggml_set_loss(result->loss);
    ggml_build_forward_expand(result->gf, result->loss);
    // Predictions are the argmax of the outputs.
    result->pred = ggml_argmax(result->ctx_static, result->outputs);
    ggml_set_name(result->pred, "pred");
    ggml_set_output(result->pred);
    ggml_build_forward_expand(result->gf, result->pred);
    // With labels, also count how many predictions equal the argmax of the labels.
    if (result->labels) {
        result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
        ggml_set_name(result->ncorrect, "ncorrect");
        ggml_set_output(result->ncorrect);
        ggml_build_forward_expand(result->gf, result->ncorrect);
    } else {
        result->ncorrect = nullptr;
    }
    // Forward-only build: allocate the static tensors on the scheduler's backend 0 and stop here.
    if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
        result->gb_grad = nullptr;
        result->gb_opt  = nullptr;
        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
        result->buf_static_cpu = nullptr;
        ggml_opt_alloc_graph(result, result->gf);
        return result;
    }
    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
    result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
    ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
    // Gradient-only build: no optimizer step is appended.
    if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
        result->gb_opt  = nullptr;
        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
        result->buf_static_cpu = nullptr;
        ggml_opt_alloc_graph(result, result->gb_grad);
        ggml_graph_reset(result->gb_grad);
        return result;
    }
    GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
    result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
    // 7 floats, written by ggml_opt_eval_graph: alpha, beta1, beta2, eps, wd, beta1h, beta2h.
    result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
    ggml_set_input(result->adamw_params);
    ggml_set_name(result->adamw_params, "adamw_params");
    // Walk the forward-pass nodes in reverse and append an AdamW step for every parameter.
    for (int i = result->gf->n_nodes-1; i >= 0; --i) {
        struct ggml_tensor * node = result->gb_opt->nodes[i];
        struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
            // m and v are the two optimizer momenta, shaped like the parameter.
            struct ggml_tensor * m        = ggml_dup_tensor(result->ctx_static, node);
            struct ggml_tensor * v        = ggml_dup_tensor(result->ctx_static, node);
            struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
            ggml_build_forward_expand(result->gb_opt, opt_step);
        }
    }
    result->buf_static = ggml_backend_alloc_ctx_tensors(
        result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
    // The optimizer parameters live in a CPU buffer so they can be written directly from host code.
    result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
    ggml_opt_alloc_graph(result, result->gb_opt);
    ggml_graph_reset(result->gb_opt);
    return result;
 }
 // Release an optimization context created by ggml_opt_init, including its static buffers
 // and all ggml contexts it owns. Passing nullptr is a no-op.
 // The backend scheduler and the compute context were provided by the user and are not freed.
 void ggml_opt_free(ggml_opt_context_t opt_ctx) {
    if (opt_ctx == nullptr) {
        return;
    }
    ggml_backend_buffer_free(opt_ctx->buf_static);
    ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
    ggml_free(opt_ctx->ctx_static);
    ggml_free(opt_ctx->ctx_static_cpu);
    // ctx_copy is (re)created by ggml_opt_alloc_graph for the duplicated graph and was
    // previously leaked here. It may still be nullptr, which ggml_opt_alloc_graph already
    // relies on ggml_free accepting.
    ggml_free(opt_ctx->ctx_copy);
    delete opt_ctx;
 }
 // Reset graph state. With `optimizer == true` the optimizer graph is reset and the iteration
 // counter restarts at 1; otherwise only the gradient graph is reset.
 void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer) {
    if (!optimizer) {
        ggml_graph_reset(opt_ctx->gb_grad);
        return;
    }
    ggml_graph_reset(opt_ctx->gb_opt);
    opt_ctx->iter = 1;
 }
 // Accessor for the input tensor of the optimization context.
 struct ggml_tensor * ggml_opt_inputs(ggml_opt_context_t opt_ctx) {
    struct ggml_tensor * const inputs = opt_ctx->inputs;
    return inputs;
 }
 // Accessor for the output tensor of the optimization context.
 struct ggml_tensor * ggml_opt_outputs(ggml_opt_context_t opt_ctx) {
    struct ggml_tensor * const outputs = opt_ctx->outputs;
    return outputs;
 }
 // Accessor for the labels tensor (nullptr for loss types that do not use labels).
 struct ggml_tensor * ggml_opt_labels(ggml_opt_context_t opt_ctx) {
    struct ggml_tensor * const labels = opt_ctx->labels;
    return labels;
 }
 // Accessor for the loss tensor of the optimization context.
 struct ggml_tensor * ggml_opt_loss(ggml_opt_context_t opt_ctx) {
    struct ggml_tensor * const loss = opt_ctx->loss;
    return loss;
 }
 // Accessor for the predictions tensor (argmax of the outputs).
 struct ggml_tensor * ggml_opt_pred(ggml_opt_context_t opt_ctx) {
    struct ggml_tensor * const pred = opt_ctx->pred;
    return pred;
 }
 // Accessor for the tensor counting correct predictions (nullptr when there are no labels).
 struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx) {
    struct ggml_tensor * const ncorrect = opt_ctx->ncorrect;
    return ncorrect;
 }
 // Look up the gradient accumulator of `node` in the optimizer graph.
 struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node) {
    struct ggml_cgraph * const graph = opt_ctx->gb_opt;
    return ggml_graph_get_grad_acc(graph, node);
 }
 // ====== Optimization Result ======
 // Allocate an empty result object; release it with ggml_opt_result_free.
 ggml_opt_result_t ggml_opt_result_init() {
    ggml_opt_result_t fresh = new ggml_opt_result;
    return fresh;
 }
 // Destroy a result object created by ggml_opt_result_init (deleting nullptr is a no-op).
 void ggml_opt_result_free(ggml_opt_result_t result) {
    ggml_opt_result * const owned = result;
    delete owned;
 }
 // Discard everything accumulated so far: counters go back to 0 and the per-batch
 // losses and predictions are cleared.
 void ggml_opt_result_reset(ggml_opt_result_t result) {
    result->loss.clear();
    result->pred.clear();
    result->ndata    = 0;
    result->ncorrect = 0;
 }
 // Write the number of datapoints accumulated in `result` to `*ndata`.
 void ggml_opt_result_ndata(ggml_opt_result_t result, int64_t * ndata) {
    const int64_t count = result->ndata;
    *ndata = count;
 }
 // Compute the loss over all accumulated batches and, optionally, its uncertainty.
 //   loss: receives the mean batch loss if the loss is per datapoint, otherwise the sum.
 //   unc:  optional (may be nullptr); receives the uncertainty of `*loss`, or NAN if fewer
 //         than 2 batches are available.
 // Without any batches `*loss` is 0.0 and `*unc` (if given) is NAN.
 void ggml_opt_result_loss(ggml_opt_result_t result, double * loss, double * unc) {
    const int64_t nbatches = result->loss.size(); // Number of physical batches.
    if (nbatches == 0) {
        *loss = 0.0;
        // unc is optional here just like on the regular path below.
        if (unc) {
            *unc = NAN;
        }
        return;
    }
    double sum         = 0.0;
    double sum_squared = 0.0;
    for (const float loss_batch : result->loss) {
        // If the loss is per datapoint it was scaled by 1.0f/opt_period for each physical batch.
        const float loss_scaled = result->loss_per_datapoint ? loss_batch*result->opt_period : loss_batch;
        sum         += loss_scaled;
        sum_squared += loss_scaled*loss_scaled;
    }
    const double mean = sum/nbatches;
    *loss = result->loss_per_datapoint ? mean : sum;
    if (!unc) {
        return;
    }
    if (nbatches < 2) {
        *unc = NAN;
        return;
    }
    const double var_sum = sum_squared/nbatches - mean*mean; // variance without Bessel's correction, i.e. nbatches/(nbatches-1)
    *unc = result->loss_per_datapoint ? sqrt(var_sum / (nbatches - 1)) : sqrt(var_sum * nbatches/(nbatches - 1));
 }
 // Copy all accumulated predictions to `pred`, which must have room for
 // result->pred.size() values.
 void ggml_opt_result_pred(ggml_opt_result_t result, int32_t * pred) {
    std::copy(result->pred.begin(), result->pred.end(), pred);
 }
 // Compute the fraction of correct predictions and, optionally, its uncertainty.
 //   accuracy: receives ncorrect/ndata, or NAN if the number of correct predictions is unknown (< 0).
 //   unc:      optional (may be nullptr); receives the uncertainty of the accuracy, or NAN if the
 //             accuracy is unknown or fewer than 2 datapoints are available.
 void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc) {
    const bool have_ncorrect = result->ncorrect >= 0;

    *accuracy = have_ncorrect ? double(result->ncorrect) / double(result->ndata) : NAN;

    if (unc == nullptr) {
        return;
    }

    if (have_ncorrect && result->ndata >= 2) {
        const double acc = *accuracy;
        *unc = sqrt(acc * (1.0 - acc) / double(result->ndata - 1));
    } else {
        *unc = NAN;
    }
 }
 // ====== Computation ======
 // Evaluate `graph` (one of the context's gf / gb_grad / gb_opt) on the current inputs and,
 // if `result` is non-null, append loss, predictions and the number of correct predictions to it.
 // For any graph other than the forward graph the optimizer parameters are refreshed first.
 static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
    if (graph != opt_ctx->gf) {
        // Query the current optimizer parameters from the user callback and validate them.
        struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
        GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
        GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
        GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
        GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
        GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
        GGML_ASSERT(opt_pars.adamw.eps   >= 0.0f);
        GGML_ASSERT(opt_pars.adamw.wd    >= 0.0f);
        GGML_ASSERT(opt_pars.adamw.wd    <= 1.0f);
        // beta1, beta2 after applying warmup
        const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
        const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
        // Write the 7 parameters into the adamw_params tensor through its host data pointer.
        float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
        adamw_par_data[0] = opt_pars.adamw.alpha;
        adamw_par_data[1] = opt_pars.adamw.beta1;
        adamw_par_data[2] = opt_pars.adamw.beta2;
        adamw_par_data[3] = opt_pars.adamw.eps;
        adamw_par_data[4] = opt_pars.adamw.wd;
        adamw_par_data[5] = beta1h;
        adamw_par_data[6] = beta2h;
    }
    // (Re)allocate if a different graph was evaluated last, then compute the duplicated graph.
    ggml_opt_alloc_graph(opt_ctx, graph);
    ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
    // Only evaluations of the optimizer graph count as optimizer iterations.
    opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
    if (!result) {
        return;
    }
    // The first evaluation fixes the loss semantics of the result; later ones must agree with them.
    if (result->ndata == 0) {
        result->loss_per_datapoint = opt_ctx->loss_per_datapoint;
        result->opt_period         = opt_ctx->opt_period;
    } else {
        GGML_ASSERT(result->loss_per_datapoint == opt_ctx->loss_per_datapoint);
        GGML_ASSERT(result->opt_period         == opt_ctx->opt_period);
    }
    // Number of datapoints in this batch, taken from dimension 1 of the outputs.
    const int64_t ndata = opt_ctx->outputs->ne[1];
    GGML_ASSERT(result->ndata == ndata*int64_t(result->loss.size()) && "varying batch size not supported");
    result->ndata += ndata;
    GGML_ASSERT(ggml_is_scalar(opt_ctx->loss));
    GGML_ASSERT(opt_ctx->loss->type == GGML_TYPE_F32);
    float loss;
    ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
    result->loss.push_back(loss);
    GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
    std::vector<int32_t> pred(ndata);
    ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
    // Without labels the number of correct predictions is unknown; -1 marks this and is sticky
    // for all later evaluations into the same result.
    if (!opt_ctx->labels || result->ncorrect < 0) {
        result->ncorrect = -1;
        return;
    }
    GGML_ASSERT(ggml_is_scalar(opt_ctx->ncorrect));
    GGML_ASSERT(opt_ctx->ncorrect->type == GGML_TYPE_I64);
    int64_t ncorrect;
    ggml_backend_tensor_get(opt_ctx->ncorrect, &ncorrect, 0, ggml_nbytes(opt_ctx->ncorrect));
    result->ncorrect += ncorrect;
 }
 // Run only the forward graph on the current inputs; results are appended to `result` if given.
 void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
    ggml_cgraph * const forward_graph = opt_ctx->gf;
    ggml_opt_eval_graph(opt_ctx, forward_graph, result);
 }
 // Run forward + backward pass on the current inputs. With opt_period == 1 every call also
 // performs an optimizer step. Otherwise gradients are accumulated and the optimizer step
 // happens on the last call of each period, after which the gradient graph is reset.
 void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
    const int32_t period = opt_ctx->opt_period;
    if (period == 1) {
        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
        return;
    }

    const int32_t next_i   = (opt_ctx->opt_i + 1) % period;
    const bool    step_now = next_i == 0; // last physical batch of the period

    ggml_opt_eval_graph(opt_ctx, step_now ? opt_ctx->gb_opt : opt_ctx->gb_grad, result);
    if (step_now) {
        ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
    }
    opt_ctx->opt_i = next_i;
 }
 // ====== High-Level Functions ======
 // Run one epoch over `dataset`: the batches before `idata_split` are used for training
 // (forward + backward), the remaining ones for evaluation (forward only). A negative
 // `idata_split` means that all data is used for training. The optional callbacks are
 // invoked after every batch with the 1-based batch index within their respective phase.
 void ggml_opt_epoch(
        ggml_opt_context_t      opt_ctx,
        ggml_opt_dataset_t      dataset,
        ggml_opt_result_t       result_train,
        ggml_opt_result_t       result_eval,
        int64_t                 idata_split,
        ggml_opt_epoch_callback callback_train,
        ggml_opt_epoch_callback callback_eval) {
    struct ggml_tensor * batch_inputs = ggml_opt_inputs(opt_ctx);
    struct ggml_tensor * batch_labels = ggml_opt_labels(opt_ctx);
    struct ggml_tensor * all_data     = ggml_opt_dataset_data(dataset);
    GGML_ASSERT(all_data->ne[0] == batch_inputs->ne[0]);

    const int64_t ndata       = all_data->ne[1];
    const int64_t ndata_batch = batch_inputs->ne[1];
    GGML_ASSERT(all_data->ne[1] % batch_inputs->ne[1] == 0);
    const int64_t nbatches = ndata/ndata_batch;

    if (idata_split < 0) {
        idata_split = ndata;
    }
    GGML_ASSERT(idata_split % ndata_batch == 0);
    const int64_t ibatch_split = idata_split / ndata_batch;

    int64_t ibatch = 0;

    // Training phase.
    int64_t t_loop_start = ggml_time_us();
    while (ibatch < ibatch_split) {
        ggml_opt_dataset_get_batch(dataset, batch_inputs, batch_labels, ibatch);
        ggml_opt_forward_backward(opt_ctx, result_train);
        ++ibatch;
        if (callback_train) {
            callback_train(true, opt_ctx, dataset, result_train, ibatch, ibatch_split, t_loop_start);
        }
    }

    // Evaluation phase, continuing with the batches after the split.
    t_loop_start = ggml_time_us();
    while (ibatch < nbatches) {
        ggml_opt_dataset_get_batch(dataset, batch_inputs, batch_labels, ibatch);
        ggml_opt_forward(opt_ctx, result_eval);
        ++ibatch;
        if (callback_eval) {
            callback_eval(false, opt_ctx, dataset, result_eval, ibatch-ibatch_split, nbatches-ibatch_split, t_loop_start);
        }
    }
 }
 // Epoch callback that prints a one-line progress bar to stderr: progress, amount of data
 // processed, loss and accuracy with uncertainties, elapsed time and estimated time remaining.
 // The line is rewritten in place ('\r') and terminated with a newline after the last batch.
 // `ibatch` is expected to be the 1-based index of the batch that was just processed.
 void ggml_opt_epoch_callback_progress_bar(
        bool               train,
        ggml_opt_context_t opt_ctx,
        ggml_opt_dataset_t dataset,
        ggml_opt_result_t  result,
        int64_t            ibatch,
        int64_t            ibatch_max,
        int64_t            t_start_us) {
    GGML_UNUSED(dataset);

    fprintf(stderr, "%s[", train ? "train: " : "val:   ");

    // '=' for completed segments, a single '>' at the head, ' ' for the rest.
    constexpr int64_t bar_length = 25;
    for (int64_t iseg = 0; iseg < bar_length; ++iseg) {
        char symbol = ' ';
        if (ibatch_max * iseg/bar_length < ibatch) {
            symbol = '=';
        } else if (ibatch_max * (iseg - 1)/bar_length < ibatch) {
            symbol = '>';
        }
        fputc(symbol, stderr);
    }

    const int64_t batch_size = ggml_opt_inputs(opt_ctx)->ne[1];
    const int64_t idata      = ibatch*batch_size;
    const int64_t idata_max  = ibatch_max*batch_size;

    double loss;
    double loss_unc;
    ggml_opt_result_loss(result, &loss, &loss_unc);

    double accuracy;
    double accuracy_unc;
    ggml_opt_result_accuracy(result, &accuracy, &accuracy_unc);

    // Split a duration in microseconds into whole hours, minutes and seconds.
    const auto split_hms = [](int64_t t_us, int64_t & h, int64_t & m, int64_t & s) {
        const int64_t total_s = t_us / 1000000;
        h = total_s / 3600;
        m = (total_s - h*3600) / 60;
        s = total_s - h*3600 - m*60;
    };

    const int64_t t_ibatch_us = ggml_time_us() - t_start_us;
    int64_t t_ibatch_h;
    int64_t t_ibatch_m;
    int64_t t_ibatch_s;
    split_hms(t_ibatch_us, t_ibatch_h, t_ibatch_m, t_ibatch_s);

    // Linear extrapolation of the time spent so far to the remaining batches.
    const int64_t t_eta_us = t_ibatch_us * (ibatch_max - ibatch)/ibatch;
    int64_t t_eta_h;
    int64_t t_eta_m;
    int64_t t_eta_s;
    split_hms(t_eta_us, t_eta_h, t_eta_m, t_eta_s);

    fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
            idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
            t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
    if (ibatch == ibatch_max) {
        fprintf(stderr, "\n");
    }
    fflush(stderr);
 }
 // High-level training loop: fit the model defined by `inputs` -> `outputs` to `dataset`
 // for `nepoch` epochs.
 //   nbatch_logical: number of datapoints per optimizer step; must divide the dataset size and
 //                   be a multiple of the physical batch size (dimension 1 of `inputs`).
 //   val_split:      fraction of the data in [0, 1) reserved for validation.
 //   silent:         if true, nothing is printed to stderr.
 // A pointer to the current (1-based) epoch is passed to `get_opt_pars` as userdata.
 void ggml_opt_fit(
        ggml_backend_sched_t            backend_sched,
        ggml_context                  * ctx_compute,
        ggml_tensor                   * inputs,
        ggml_tensor                   * outputs,
        ggml_opt_dataset_t              dataset,
        enum ggml_opt_loss_type         loss_type,
        ggml_opt_get_optimizer_params   get_opt_pars,
        int64_t                         nepoch,
        int64_t                         nbatch_logical,
        float                           val_split,
        bool                            silent) {
    ggml_time_init();
    const int64_t t_fit_start_us = ggml_time_us();

    const int64_t ndata           = ggml_opt_dataset_data(dataset)->ne[1];
    const int64_t nbatch_physical = inputs->ne[1];
    GGML_ASSERT(ndata          % nbatch_logical  == 0);
    GGML_ASSERT(nbatch_logical % nbatch_physical == 0);

    const int64_t opt_period       = nbatch_logical / nbatch_physical;
    const int64_t nbatches_logical = ndata / nbatch_logical;

    GGML_ASSERT(val_split >= 0.0f);
    GGML_ASSERT(val_split <  1.0f);

    const int64_t ibatch_split = int64_t(((1.0f - val_split) * nbatches_logical)) * opt_period; // train <-> val split index (physical)
    const int64_t idata_split  = ibatch_split * nbatch_physical;

    // The address of `epoch` is handed to the optimizer parameter callback as userdata.
    int64_t epoch = 1;

    ggml_opt_params fit_params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
    fit_params.opt_period      = opt_period;
    fit_params.get_opt_pars    = get_opt_pars;
    fit_params.get_opt_pars_ud = &epoch;
    ggml_opt_context_t opt_ctx = ggml_opt_init(fit_params);

    // Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
    if (nbatch_logical < ndata) {
        ggml_opt_dataset_shuffle(opt_ctx, dataset, -1); // Shuffle all data (train + validation).
    }

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_val   = ggml_opt_result_init();

    ggml_opt_epoch_callback epoch_callback = nullptr;
    if (!silent) {
        epoch_callback = ggml_opt_epoch_callback_progress_bar;
    }

    while (epoch <= nepoch) {
        // Reshuffle only the training part so that the validation data stays the same.
        if (nbatch_logical < idata_split) {
            ggml_opt_dataset_shuffle(opt_ctx, dataset, idata_split);
        }

        ggml_opt_result_reset(result_train);
        ggml_opt_result_reset(result_val);

        if (!silent) {
            fprintf(stderr, "%s: epoch %04" PRId64 "/%04" PRId64 ":\n", __func__, epoch, nepoch);
        }
        ggml_opt_epoch(opt_ctx, dataset, result_train, result_val, idata_split, epoch_callback, epoch_callback);
        if (!silent) {
            fprintf(stderr, "\n");
        }

        ++epoch;
    }

    if (!silent) {
        const int64_t t_total_all_s = (ggml_time_us() - t_fit_start_us) / 1000000;
        const int64_t t_total_h     = t_total_all_s / 3600;
        const int64_t t_total_m     = (t_total_all_s - t_total_h * 3600) / 60;
        const int64_t t_total_s     = t_total_all_s - t_total_h * 3600 - t_total_m * 60;
        fprintf(stderr, "%s: training took %02" PRId64 ":%02" PRId64 ":%02" PRId64 "\n", __func__, t_total_h, t_total_m, t_total_s);
    }

    ggml_opt_free(opt_ctx);
    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_val);
 }
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
--- a/include/llama.h
+++ b/include/llama.h
@ -1244,8 +1244,6 @@ extern "C" {
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 #ifdef __cplusplus
 }
 #endif
--- a/scripts/run-with-preset.py
+++ b/scripts/run-with-preset.py
@ -1,146 +0,0 @@
 #!/usr/bin/env python3
 import logging
 import argparse
 import os
 import subprocess
 import sys
 import yaml
 logger = logging.getLogger("run-with-preset")
 # Long option names (without the leading "--") that are forwarded to
 # llama-cli and llama-perplexity when they appear in the preset.
 CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
    "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
    "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
    "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
    "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
    "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
    "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
    "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
    "simple-io", "tensor-split", "threads", "temp", "top-k", "top-p", "typical",
    "verbose-prompt"
 ]
 # Long option names forwarded to llama-bench.
 CLI_ARGS_LLAMA_BENCH = [
    "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
    "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
 ]
 # Long option names forwarded to llama-server.
 CLI_ARGS_LLAMA_SERVER = [
    "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base",
    "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
    "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
    "threads", "verbose"
 ]
 description = """Run llama.cpp binaries with presets from YAML file(s).
 To specify which binary should be run, specify the "binary" property (llama-cli, llama-perplexity, llama-bench, and llama-server are supported).
 To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
 Formatting considerations:
 - The YAML property names are the same as the CLI argument names of the corresponding binary.
 - Properties must use the long name of their corresponding llama.cpp CLI arguments.
 - Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores.
 - Flags must be defined as "<PROPERTY_NAME>: true" to be effective.
 - To define the logit_bias property, the expected format is "<TOKEN_ID>: <BIAS>" in the "logit_bias" namespace.
 - To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
 - To define a tensor split, pass a list of floats.
 """
 usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
 epilog = ("  --<ARG_NAME> specify additional CLI args to be passed to the binary (override all preset files). "
          "Unknown args will be ignored.")
 parser = argparse.ArgumentParser(
    description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
 parser.add_argument("-bin", "--binary", help="The binary to run.")
 parser.add_argument("yaml_files", nargs="*",
                    help="Arbitrary number of YAML files from which to read preset values. "
                    "If two files specify the same values the later one will be used.")
 parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
 # Arguments this script does not know are kept aside and appended verbatim to
 # the final command line further below.
 known_args, unknown_args = parser.parse_known_args()
 # Nothing to do without preset files or pass-through arguments: show the help.
 if not known_args.yaml_files and not unknown_args:
    parser.print_help()
    sys.exit(0)
 logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
 # Merge all preset files in order; later files override earlier ones.
 props = {}
 for yaml_file in known_args.yaml_files:
    with open(yaml_file, "r") as f:
        # An empty YAML document loads as None; treat it as "no properties"
        # instead of letting dict.update() raise a TypeError.
        props.update(yaml.load(f, yaml.SafeLoader) or {})
 # Normalize property names so that underscores and hyphens are interchangeable.
 props = {prop.replace("_", "-"): val for prop, val in props.items()}
 # The binary comes from the preset ("llama-cli" if absent); --binary overrides it.
 binary = props.pop("binary", "llama-cli")
 if known_args.binary:
    binary = known_args.binary
 # Prefer an executable of that name in the current directory if one exists.
 if os.path.exists(f"./{binary}"):
    binary = f"./{binary}"
 # Pick the list of forwardable arguments from the (case-insensitive) binary name suffix.
 binary_lower = binary.lower()
 if binary_lower.endswith(("llama-cli", "llama-perplexity")):
    cli_args = CLI_ARGS_LLAMA_CLI_PERPLEXITY
 elif binary_lower.endswith("llama-bench"):
    cli_args = CLI_ARGS_LLAMA_BENCH
 elif binary_lower.endswith("llama-server"):
    cli_args = CLI_ARGS_LLAMA_SERVER
 else:
    logger.error(f"Unknown binary: {binary}")
    sys.exit(1)
 # Translate the preset properties into a command line for the chosen binary.
 command_list = [binary]
 for cli_arg in cli_args:
    value = props.pop(cli_arg, None)
    # Skip properties that are absent, falsy (False, 0, empty) or equal to -1.
    # NOTE(review): -1 presumably marks "unset" in generated templates - confirm.
    if not value or value == -1:
        continue
    if cli_arg == "logit-bias":
        # One "--logit-bias <token><signed bias>" pair per mapping entry.
        for token, bias in value.items():
            command_list.append("--logit-bias")
            command_list.append(f"{token}{bias:+}")
        continue
    if cli_arg == "reverse-prompt" and not isinstance(value, str):
        # A non-string value is iterated: one "--reverse-prompt" per element.
        for rp in value:
            command_list.append("--reverse-prompt")
            command_list.append(str(rp))
        continue
    command_list.append(f"--{cli_arg}")
    if cli_arg == "tensor-split":
        # The split is passed as a single comma-separated value.
        command_list.append(",".join([str(v) for v in value]))
        continue
    # A value whose string form is "True" is a flag: the option name alone is enough.
    value = str(value)
    if value != "True":
        command_list.append(value)
 # Report preset properties that were not consumed above (only a count if there are many).
 num_unused = len(props)
 if num_unused > 10:
    logger.info(f"The preset file contained a total of {num_unused} unused properties.")
 elif num_unused > 0:
    logger.info("The preset file contained the following unused properties:")
    for prop, value in props.items():
        logger.info(f"  {prop}: {value}")
 # Pass-through arguments go last on the command line.
 command_list += unknown_args
 sp = subprocess.Popen(command_list)
 # Keep waiting across Ctrl+C: KeyboardInterrupt is swallowed here and the loop
 # only ends once the child has a return code, which becomes our exit status.
 while sp.returncode is None:
    try:
        sp.wait()
    except KeyboardInterrupt:
        pass
 sys.exit(sp.returncode)
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@ -73,17 +73,20 @@ while read c; do
        src/ggml*.h \
        src/ggml*.c \
        src/ggml*.cpp \
        src/ggml*.m \
        src/ggml*.metal \
        src/ggml*.cu \
        src/ggml-amx/* \
        src/ggml-blas/* \
        src/ggml-cann/* \
        src/ggml-cpu/* \
        src/ggml-cuda/* \
        src/ggml-hip/* \
        src/ggml-kompute/* \
        src/ggml-metal/* \
        src/ggml-musa/* \
        src/ggml-rpc/* \
        src/ggml-sycl/* \
-        src/vulkan-shaders/* \
+        src/ggml-vulkan/* \
        include/ggml*.h \
        tests/test-opt.cpp \
        tests/test-grad0.cpp \
        tests/test-quantize-fns.cpp \
        tests/test-quantize-perf.cpp \
        tests/test-backend-ops.cpp \
@ -121,21 +124,22 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # src/ggml*.c          -> ggml/src/ggml*.c
    # src/ggml*.cpp        -> ggml/src/ggml*.cpp
    # src/ggml*.h          -> ggml/src/ggml*.h
-    # src/ggml*.cu         -> ggml/src/ggml*.cu
+    # src/ggml-amx/*       -> ggml/src/ggml-amx/*
-    # src/ggml*.m          -> ggml/src/ggml*.m
+    # src/ggml-blas/*      -> ggml/src/ggml-blas/*
-    # src/ggml-amx/*       -> ggml/src/ggml-amx/
+    # src/ggml-cann/*      -> ggml/src/ggml-cann/*
-    # src/ggml-cann/*      -> ggml/src/ggml-cann/
+    # src/ggml-cpu/*       -> ggml/src/ggml-cpu/*
-    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/
+    # src/ggml-cuda/*      -> ggml/src/ggml-cuda/*
-    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/
+    # src/ggml-hip/*       -> ggml/src/ggml-hip/*
-    # src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
+    # src/ggml-kompute/*   -> ggml/src/ggml-kompute/*
    # src/ggml-metal/*     -> ggml/src/ggml-metal/*
    # src/ggml-musa/*      -> ggml/src/ggml-musa/*
    # src/ggml-rpc/*       -> ggml/src/ggml-rpc/*
    # src/ggml-sycl/*      -> ggml/src/ggml-sycl/*
    # src/ggml-vulkan/*    -> ggml/src/ggml-vulkan/*
    #
    # include/ggml*.h -> ggml/include/ggml*.h
    #
-    # tests/test-opt.cpp           -> tests/test-opt.cpp
+    # tests/test*.cpp -> tests/
    # tests/test-grad0.cpp         -> tests/test-grad0.cpp
    # tests/test-quantize-fns.cpp  -> tests/test-quantize-fns.cpp
    # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp
    # tests/test-backend-ops.cpp   -> tests/test-backend-ops.cpp
    #
    # LICENSE                -> LICENSE
    # scripts/gen-authors.sh -> scripts/gen-authors.sh
@ -147,18 +151,20 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\2.cu/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\2.m/g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-blas\//\1ggml\/src\/ggml-blas\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\//\1ggml\/src\/ggml-kompute\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-musa\//\1ggml\/src\/ggml-musa\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \
        -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
-        -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
+        -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \
        -e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
-        -e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
+        -e 's/([[:space:]]|[ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
        -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/\1examples\/common-ggml.cpp/g' \
        -e 's/([[:space:]]|[ab]\/)LICENSE/\1LICENSE/g' \
        -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \
        > ggml-src.patch.tmp
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@ -7,18 +7,22 @@ cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
 cp -rpv ../ggml/src/ggml*.c        ./ggml/src/
 cp -rpv ../ggml/src/ggml*.cpp      ./ggml/src/
 cp -rpv ../ggml/src/ggml*.h        ./ggml/src/
 cp -rpv ../ggml/src/ggml*.cu         ./ggml/src/
 cp -rpv ../ggml/src/ggml*.m          ./ggml/src/
 cp -rpv ../ggml/src/ggml-amx/*     ./ggml/src/ggml-amx/
 cp -rpv ../ggml/src/ggml-blas/*    ./ggml/src/ggml-blas/
 cp -rpv ../ggml/src/ggml-cann/*    ./ggml/src/ggml-cann/
 cp -rpv ../ggml/src/ggml-cpu/*     ./ggml/src/ggml-cpu/
 cp -rpv ../ggml/src/ggml-cuda/*    ./ggml/src/ggml-cuda/
 cp -rpv ../ggml/src/ggml-hip/*     ./ggml/src/ggml-hip/
 cp -rpv ../ggml/src/ggml-kompute/* ./ggml/src/ggml-kompute/
 cp -rpv ../ggml/src/ggml-metal/*   ./ggml/src/ggml-metal/
 cp -rpv ../ggml/src/ggml-musa/*    ./ggml/src/ggml-musa/
 cp -rpv ../ggml/src/ggml-rpc/*     ./ggml/src/ggml-rpc/
 cp -rpv ../ggml/src/ggml-sycl/*    ./ggml/src/ggml-sycl/
-cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
+cp -rpv ../ggml/src/ggml-vulkan/*  ./ggml/src/ggml-vulkan/
 cp -rpv ../ggml/include/ggml*.h ./ggml/include/
 cp -rpv ../ggml/tests/test-opt.cpp           ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp         ./tests/test-grad0.cpp
 cp -rpv ../ggml/tests/test-quantize-fns.cpp  ./tests/test-quantize-fns.cpp
 cp -rpv ../ggml/tests/test-quantize-perf.cpp ./tests/test-quantize-perf.cpp
 cp -rpv ../ggml/tests/test-backend-ops.cpp   ./tests/test-backend-ops.cpp
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -3460,21 +3460,13 @@ static bool llama_kv_cache_init(
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
-        const llama_model::buft_list_t * buft_list;
+        ggml_backend_buffer_type_t buft;
        if (offload) {
-            buft_list = model.dev_layer.at(i).buft_list;
+            auto * dev = model.dev_layer.at(i).dev;
            buft = ggml_backend_dev_buffer_type(dev);
        } else {
-            buft_list = &model.cpu_buft_list;
+            buft = ggml_backend_cpu_buffer_type();
        }
        ggml_backend_buffer_type_t buft = select_buft(*buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
                if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
                    return k;
                }
                ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
                return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
            });
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
@ -22075,28 +22067,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 // Writes the timing counters stored in `ctx` to `stream` as `key: value  # comment` lines
 // below a "Timings" banner. The per-token and per-second figures are derived from the raw
 // counters with plain floating-point divisions; zero counters are not special-cased.
 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
    // derived figures
    const double ms_per_token_gen    = 1.0e-3 * ctx->t_eval_us / ctx->n_eval;
    const double ms_per_token_prompt = 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval;
    const double tokens_per_s_gen    = 1.0e6 * ctx->n_eval / ctx->t_eval_us;
    const double tokens_per_s_prompt = 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us;

    // section banner
    fprintf(stream,
            "\n"
            "###########\n"
            "# Timings #\n"
            "###########\n"
            "\n");

    // one entry per line, in the same order as the keys are listed here
    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n", ms_per_token_gen);
    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n", ms_per_token_prompt);
    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n", tokens_per_s_gen);
    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n", tokens_per_s_prompt);
 }
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -116,9 +116,8 @@ llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
-llama_target_and_test(test-grad0.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
 llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -811,11 +811,11 @@ struct test_case {
        ggml_build_forward_expand(gf, out);
        ggml_graph_cpy(gf, gb);
-        ggml_build_backward_expand(ctx, gf, gb, false);
+        ggml_build_backward_expand(ctx, ctx, gb, false);
        if (expect.size() != 1 || expect[0] != 0.0f) {
            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
+                GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || ggml_graph_get_grad(gb, t)->op != GGML_OP_NONE);
            }
        }
@ -862,7 +862,13 @@ struct test_case {
            const char * bn = ggml_backend_name(backend);
            const int64_t ne = ggml_nelements(t);
-            std::vector<float> ga = tensor_to_float(t->grad);
+            std::vector<float> ga;
            struct ggml_tensor * grad = ggml_graph_get_grad(gb, t);
            if (grad) {
                ga = tensor_to_float(grad);
            } else {
                ga.resize(ne); // default value is 0.0f
            }
            for (int64_t i = 0; i < ne; ++i) { // gradient algebraic
                // check for nans
@ -2500,6 +2506,35 @@ struct test_sum_rows : public test_case {
    }
 };
 // GGML_OP_MEAN
 // Test case that builds a graph applying ggml_mean to a single 4-dimensional parameter tensor.
 struct test_mean : public test_case {
    const ggml_type type;            // element type of the input tensor
    const std::array<int64_t, 4> ne; // extents of the input tensor

    test_mean(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 5, 4, 3})
        : type(type), ne(ne) {}

    // String describing the parameters of this test case.
    std::string vars() override {
        return VARS_TO_STR2(type, ne);
    }

    // Creates the input tensor "a", marks it as a parameter and returns ggml_mean(a) named "out".
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * input = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_param(ctx, input);
        ggml_set_name(input, "a");

        ggml_tensor * result = ggml_mean(ctx, input);
        ggml_set_name(result, "out");
        return result;
    }

    // 0.1 multiplied by every extent in turn, i.e. scaled by the number of input elements.
    float grad_eps() override {
        float eps = 0.1f;
        for (const int64_t extent : ne) {
            eps *= extent;
        }
        return eps;
    }
 };
 // GGML_OP_UPSCALE
 struct test_upscale : public test_case {
    const ggml_type type;
@ -2834,24 +2869,14 @@ struct test_cross_entropy_loss : public test_case {
 struct test_opt_step_adamw : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const float alpha;
    const float beta1;
    const float beta2;
    const float eps;
    const float wd;
    std::string vars() override {
-        return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd);
+        return VARS_TO_STR2(type, ne);
    }
    test_opt_step_adamw(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 5, 4, 3},
+            std::array<int64_t, 4> ne = {10, 5, 4, 3})
-            float alpha = 1e-3f,
+        : type(type), ne(ne) {}
            float beta1 = 0.9f,
            float beta2 = 0.999f,
            float eps = 1e-8f,
            float wd = 0.0f)
        : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
@ -2861,7 +2886,16 @@ struct test_opt_step_adamw : public test_case {
        ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
        ggml_set_name(grad, "grad");
-        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, alpha, beta1, beta2, eps, wd);
+        ggml_tensor * grad_m = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
        ggml_set_name(grad_m, "grad_m");
        ggml_tensor * grad_v = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
        ggml_set_name(grad_v, "grad_v");
        ggml_tensor * adamw_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 7);
        ggml_set_name(adamw_params, "adamw_params");
        ggml_tensor * out = ggml_opt_step_adamw(ctx, a, grad, grad_m, grad_v, adamw_params);
        ggml_set_name(out, "out");
        return out;
@ -2869,7 +2903,7 @@ struct test_opt_step_adamw : public test_case {
    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values.
+            init_tensor_uniform(t, 0.0f, 1.0f); // grad_v and adamw_params need non-negative values.
        }
    }
@ -3735,6 +3769,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_sum());
    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_mean());
    test_cases.emplace_back(new test_upscale());
    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
    test_cases.emplace_back(new test_upscale_ext());
@ -3766,9 +3801,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    }
    test_cases.emplace_back(new test_cross_entropy_loss());
-    for (float wd : {0.0f, 1e-2f}) {
+    test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
        test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd));
    }
    // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
@ -3938,6 +3971,8 @@ int main(int argc, char ** argv) {
        ggml_backend_free(backend);
    }
    ggml_quantize_free();
    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_dev_count());
    if (n_ok != ggml_backend_dev_count()) {
@ -3945,8 +3980,6 @@ int main(int argc, char ** argv) {
        return 1;
    }
    ggml_quantize_free();
    printf("\033[1;32mOK\033[0m\n");
    return 0;
 }
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -7,7 +7,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <functional>
 #include <inttypes.h>
 #include <math.h>
 #include <memory>
 #include <stdio.h>