Merge branch 'master' into add_gpt2_support

2023-12-26 07:23:04 -05:00 · 2023-12-26 07:23:04 -05:00 · c4d1c26710
commit c4d1c26710
parent fdf599d7cf de8e496437
24 changed files with 1018 additions and 421 deletions
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@ -6,179 +6,4 @@ assignees: ''

 ---

-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Expected Behavior
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
-
-# Current Behavior
-
-Please provide a detailed written description of what `llama.cpp` did, instead.
-
-# Environment and Context
-
-Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
-
-* Physical (or virtual) hardware you are using, e.g. for Linux:
-
-`$ lscpu`
-
-* Operating System, e.g. for Linux:
-
-`$ uname -a`
-
-* SDK version, e.g. for Linux:
-
-```
-$ python3 --version
-$ make --version
-$ g++ --version
-```
-
-# Failure Information (for bugs)
-
-Please help provide information about the failure / bug.
-
-# Steps to Reproduce
-
-Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
-
-1. step 1
-2. step 2
-3. step 3
-4. etc.
-
-# Failure Logs
-
-Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
-
-Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
-
-Example environment info:
-```
-llama.cpp$ git log | head -1
-commit 2af23d30434a677c6416812eea52ccc0af65119c
-
-llama.cpp$ lscpu | egrep "AMD|Flags"
-Vendor ID:                       AuthenticAMD
-Model name:                      AMD Ryzen Threadripper 1950X 16-Core Processor
-Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
-Virtualization:                  AMD-V
-
-llama.cpp$ python3 --version
-Python 3.10.9
-
-llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
-numpy                         1.24.2
-numpydoc                      1.5.0
-sentencepiece                 0.1.97
-torch                         1.13.1
-torchvision                   0.14.1
-
-llama.cpp$ make --version | head -1
-GNU Make 4.3
-
-$ md5sum ./models/65B/ggml-model-q4_0.bin
-dbdd682cce80e2d6e93cefc7449df487  ./models/65B/ggml-model-q4_0.bin
-```
-
-Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
-```
-llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
-main: seed = 1679149377
-llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
-llama_model_load: n_vocab = 32000
-llama_model_load: n_ctx   = 512
-llama_model_load: n_embd  = 8192
-llama_model_load: n_mult  = 256
-llama_model_load: n_head  = 64
-llama_model_load: n_layer = 80
-llama_model_load: n_rot   = 128
-llama_model_load: f16     = 2
-llama_model_load: n_ff    = 22016
-llama_model_load: n_parts = 8
-llama_model_load: ggml ctx size = 41477.73 MB
-llama_model_load: memory_size =  2560.00 MB, n_mem = 40960
-llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
-llama_model_load: .......................................................................................... done
-llama_model_load: model size =  4869.09 MB / num tensors = 723
-
-system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
-
-main: prompt: 'Please close your issue when it has been answered.'
-main: number of tokens in prompt = 11
-     1 -> ''
- 12148 -> 'Please'
-  3802 -> ' close'
-   596 -> ' your'
-  2228 -> ' issue'
-   746 -> ' when'
-   372 -> ' it'
-   756 -> ' has'
-  1063 -> ' been'
-  7699 -> ' answered'
- 29889 -> '.'
-
-sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
-
-
-Please close your issue when it has been answered.
-@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
-I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
-@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
-
-
-main: mem per token = 71159620 bytes
-main:     load time = 19309.95 ms
-main:   sample time =   168.62 ms
-main:  predict time = 223895.61 ms / 888.47 ms per token
-main:    total time = 246406.42 ms
-
- Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
-
-        3636882.89 msec task-clock                #   14.677 CPUs utilized
-             13509      context-switches          #    3.714 /sec
-              2436      cpu-migrations            #    0.670 /sec
-          10476679      page-faults               #    2.881 K/sec
-    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
-       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
-    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
-    23479217109614      instructions              #    1.79  insn per cycle
-                                                  #    0.44  stalled cycles per insn  (16.76%)
-     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
-        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
-
-     247.802177522 seconds time elapsed
-
-    3618.573072000 seconds user
-      18.491698000 seconds sys
-```
+Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -98,5 +98,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" , "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.gitignore
+++ b/.gitignore
@ -48,6 +48,7 @@ models-mnt
 /llama-bench
 /llava-cli
 /lookahead
+/lookup
 /main
 /metal
 /perplexity
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -302,6 +302,8 @@ if (LLAMA_CUBLAS)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # 52 == lowest CUDA 12 standard
        # 60 == f16 CUDA intrinsics
--- a/11
+++ b/11
@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@ -367,17 +367,15 @@ endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	MK_NVCCFLAGS  = -use_fast_math
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
-
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
-endif
-
+endif # LLAMA_DEBUG
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
@ -664,6 +662,9 @@ parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
--- a/README.md
+++ b/README.md
@ -102,6 +102,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
+- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)

 **Multimodal models:**

@ -132,6 +133,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [semperai/amica](https://github.com/semperai/amica)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)

 ---

--- a/common/common.h
+++ b/common/common.h
@ -51,7 +51,7 @@ struct gpt_params {
    int32_t n_ctx                           = 512;   // context size
    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
+    int32_t n_draft                         = 8;     // number of tokens to draft during speculative decoding
    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
    int32_t n_sequences                     = 1;     // number of sequences to decode
@ -240,3 +240,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -149,11 +149,12 @@ static void sampler_queue(
    }
 }

-llama_token llama_sampling_sample(
+static llama_token llama_sampling_sample_impl(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx) {
+                  const int idx,
+                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@ -173,8 +174,17 @@ llama_token llama_sampling_sample(

    llama_token id = 0;

+    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

+    // Declare original_logits at the beginning of the function scope
+    std::vector<float> original_logits;
+
+    if (!is_resampling) {
+        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
+        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
+    }
+
    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
@ -193,12 +203,14 @@ llama_token llama_sampling_sample(
    }

    // apply penalties
-    if (!prev.empty()) {
+    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+    if (penalty_tokens_used_size) {
        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                prev.data() + prev.size() - penalty_last_n,
-                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
+                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
@ -210,7 +222,8 @@ llama_token llama_sampling_sample(
        }
    }

-    if (ctx_sampling->grammar != NULL) {
+    // If we are in the resampling phase, apply grammar checks before sampling logic
+    if (is_resampling && ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

@ -252,9 +265,40 @@ llama_token llama_sampling_sample(
        }
    }

+    if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Create an array with a single token data element for the sampled id
+        llama_token_data single_token_data = {id, logits[id], 0.0f};
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+
+        // Apply grammar constraints to the single token
+        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+
+        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+
+        // If the token is not valid according to the grammar, perform resampling
+        if (!is_valid) {
+            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
+
+            // Restore logits from the copy
+            std::copy(original_logits.begin(), original_logits.end(), logits);
+
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
+        }
+    }
+
    return id;
 }

+llama_token llama_sampling_sample(
+                  struct llama_sampling_context * ctx_sampling,
+                  struct llama_context * ctx_main,
+                  struct llama_context * ctx_cfg,
+                  const int idx) {
+    // Call the implementation function with is_resampling set to false by default
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+}
+
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/sampling.h
+++ b/common/sampling.h
@ -36,6 +36,9 @@ typedef struct llama_sampling_params {
    float       cfg_scale     = 1.f; // how strong is guidance

    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
+    std::vector<llama_token> penalty_prompt_tokens;
+    bool                     use_penalty_prompt_tokens = false;
 } llama_sampling_params;

 // general sampler context
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -186,6 +186,8 @@ class Model:
            return GPT2Model
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
+        if model_architecture == "PlamoForCausalLM":
+            return PlamoModel
        return Model

    def _is_model_safetensors(self) -> bool:
@ -229,6 +231,8 @@ class Model:
            return gguf.MODEL_ARCH.GPT2
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
+        if arch == "PlamoForCausalLM":
+            return gguf.MODEL_ARCH.PLAMO

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@ -1068,11 +1072,91 @@ class Phi2Model(Model):
        self.gguf_writer.add_add_bos_token(False)


+class PlamoModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name("PLaMo")
+        self.gguf_writer.add_context_length(4096)  # not in config.json
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+    def shuffle_attn_q_weight(self, data_torch):
+        assert data_torch.size() == (5120, 5120)
+        data_torch = data_torch.reshape(8, 5, 128, 5120)
+        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
+        data_torch = torch.reshape(data_torch, (5120, 5120))
+        return data_torch
+
+    def shuffle_attn_output_weight(self, data_torch):
+        assert data_torch.size() == (5120, 5120)
+        data_torch = data_torch.reshape(5120, 8, 5, 128)
+        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
+        data_torch = torch.reshape(data_torch, (5120, 5120))
+        return data_torch
+
+    def write_tensors(self):
+        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            if "self_attn.rotary_emb.inv_freq" in name:
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            # shuffle for broadcasting of gqa in ggml_mul_mat
+            if new_name.endswith("attn_q.weight"):
+                data_torch = self.shuffle_attn_q_weight(data_torch)
+            elif new_name.endswith("attn_output.weight"):
+                data_torch = self.shuffle_attn_output_weight(data_torch)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
+    parser = argparse.ArgumentParser(
+        description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -33,6 +33,7 @@ else()
    add_subdirectory(simple)
    add_subdirectory(speculative)
    add_subdirectory(lookahead)
+    add_subdirectory(lookup)
    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@ -0,0 +1,5 @@
+set(TARGET lookup)
+add_executable(${TARGET} lookup.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@ -0,0 +1,13 @@
+# llama.cpp/examples/lookup
+
+Demonstration of Prompt Lookup Decoding
+
+https://github.com/apoorvumang/prompt-lookup-decoding
+
+The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft`. The first two determine the size of the ngrams to search for in the prompt for a match. The latter specifies how many subsequent tokens to draft if a match is found.
+
+More info:
+
+https://github.com/ggerganov/llama.cpp/pull/4484
+https://github.com/ggerganov/llama.cpp/issues/4226
+
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -0,0 +1,230 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv){
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    // max/min n-grams size to search for in prompt
+    const int ngram_max = 4;
+    const int ngram_min = 1;
+
+    // length of the candidate / draft sequence, if match is found
+    const int n_draft = params.n_draft;
+
+    const bool dump_kv_cache = params.dump_kv_cache;
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("lookup", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
+    // init llama.cpp
+    llama_backend_init(params.numa);
+
+    llama_model * model = NULL;
+    llama_context * ctx = NULL;
+
+    // load the model
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+
+    // tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+    LOG("add_bos tgt: %d\n", add_bos);
+
+    std::vector<llama_token> inp;
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
+
+    if ((int) inp.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        return 1;
+    }
+
+    fprintf(stderr, "\n\n");
+
+    for (auto id : inp) {
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+    }
+
+    fflush(stderr);
+
+    const int n_input = inp.size();
+
+    const auto t_enc_start = ggml_time_us();
+
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+
+    const auto t_enc_end = ggml_time_us();
+
+    int n_predict = 0;
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    int n_past = inp.size();
+
+    bool has_eos = false;
+
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+
+    std::vector<llama_token> draft;
+
+    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
+
+    // debug
+    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
+
+    const auto t_dec_start = ggml_time_us();
+
+    while (true) {
+        // debug
+        if (dump_kv_cache) {
+            llama_kv_cache_view_update(ctx, &kvc_view);
+            dump_kv_cache_view_seqs(kvc_view, 40);
+        }
+
+        // print current draft sequence
+        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+
+        int i_dft = 0;
+        while (true) {
+            // sample from the target model
+            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+
+            llama_sampling_accept(ctx_sampling, ctx, id, true);
+
+            const std::string token_str = llama_token_to_piece(ctx, id);
+
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+
+            if (id == llama_token_eos(model)) {
+                has_eos = true;
+            }
+
+            ++n_predict;
+
+            // check if the target token matches the draft
+            if (i_dft < (int) draft.size() && id == draft[i_dft]) {
+                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                ++n_accept;
+                ++n_past;
+                ++i_dft;
+                inp.push_back(id);
+
+                if (params.use_color) {
+                    // color accepted draft token
+                    printf("\033[34m%s\033[0m", token_str.c_str());
+                    fflush(stdout);
+                }
+                continue;
+            }
+
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+            fflush(stdout);
+
+
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+
+            draft.clear();
+            draft.push_back(id);
+            inp.push_back(id);
+            break;
+        }
+
+        if ((params.n_predict > 0 && n_predict > params.n_predict) || has_eos) {
+            break;
+        }
+
+        // KV cache management
+        // clean the cache of draft tokens that weren't accepted
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
+        llama_batch_clear(batch_tgt);
+        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+
+        // generate n_pred tokens through prompt lookup
+        auto prompt_lookup = [&]() -> void {
+            int inp_size = inp.size();
+            for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
+                const llama_token * ngram = &inp[inp_size - ngram_size];
+
+                for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
+                    bool match = true;
+                    for (int j = 0; j < ngram_size; ++j) {
+                        if (inp[i + j] != ngram[j]) {
+                            match = false;
+                            break;
+                        }
+                    }
+
+                    if (match) {
+                        const int startIdx = i + ngram_size;
+                        const int endIdx = startIdx + n_draft;
+                        if (endIdx < inp_size) {
+                            for (int j = startIdx; j < endIdx; ++j) {
+                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
+                                draft.push_back(inp[j]);
+                                llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
+                                ++n_drafted;
+                            }
+                            return;
+                        }
+                    }
+                }
+            }
+            return;
+        };
+
+        prompt_lookup();
+
+        llama_decode(ctx, batch_tgt);
+        ++n_past;
+
+        draft.erase(draft.begin());
+    }
+
+    auto t_dec_end = ggml_time_us();
+
+    LOG_TEE("\n\n");
+
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+
+    LOG_TEE("\n");
+    LOG_TEE("n_draft   = %d\n", n_draft);
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_drafted = %d\n", n_drafted);
+    LOG_TEE("n_accept  = %d\n", n_accept);
+    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    LOG_TEE("\ntarget:\n");
+    llama_print_timings(ctx);
+
+    llama_sampling_free(ctx_sampling);
+    llama_batch_free(batch_tgt);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -148,6 +148,8 @@ node index.js

    `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled);

+    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).
+
    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).

    `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -761,6 +761,42 @@ struct llama_server_context
            slot->prompt = "";
        }

+        slot->sparams.penalty_prompt_tokens.clear();
+        slot->sparams.use_penalty_prompt_tokens = false;
+        const auto &penalty_prompt = data.find("penalty_prompt");
+        if (penalty_prompt != data.end())
+        {
+            if (penalty_prompt->is_string())
+            {
+                const auto penalty_prompt_string = penalty_prompt->get<std::string>();
+                auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
+                slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
+                if (slot->params.n_predict > 0)
+                {
+                    slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
+                }
+                slot->sparams.use_penalty_prompt_tokens = true;
+            }
+            else if (penalty_prompt->is_array())
+            {
+                const auto n_tokens = penalty_prompt->size();
+                slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
+                const int n_vocab = llama_n_vocab(model);
+                for (const auto &penalty_token : *penalty_prompt)
+                {
+                    if (penalty_token.is_number_integer())
+                    {
+                        const auto tok = penalty_token.get<llama_token>();
+                        if (tok >= 0 && tok < n_vocab)
+                        {
+                            slot->sparams.penalty_prompt_tokens.push_back(tok);
+                        }
+                    }
+                }
+                slot->sparams.use_penalty_prompt_tokens = true;
+            }
+        }
+
        slot->sparams.logit_bias.clear();

        if (json_value(data, "ignore_eos", false))
@ -992,6 +1028,12 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;

+        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
+        {
+            // we can change penalty_prompt_tokens because it is always created from scratch each request
+            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
+        }
+
        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
@ -1183,6 +1225,8 @@ struct llama_server_context
            {"repeat_penalty",    slot.sparams.penalty_repeat},
            {"presence_penalty",  slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
+            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
+            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

-    int id = ggml_backend_registry_count;
+    size_t id = ggml_backend_registry_count;

    ggml_backend_registry[id] = (struct ggml_backend_reg) {
        /* .name                = */ {0},
@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
            return i;
        }
    }
+
+    // not found
    return SIZE_MAX;
 }

@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
    const char * params = strchr(backend_str, ':');
    char backend_name[128];
    if (params == NULL) {
-        strcpy(backend_name, backend_str);
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
        params = "";
    } else {
-        strncpy(backend_name, backend_str, params - backend_str);
-        backend_name[params - backend_str] = '\0';
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
        params++;
    }

    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
    if (backend_i == SIZE_MAX) {
        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
        return NULL;
@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }

 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
    memcpy((char *)tensor->data + offset, data, size);

    GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -86,17 +86,29 @@
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap abort
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
 #else
 #include <cuda_runtime.h>
+#include <cuda.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-// CUDA 10.2 does not have these macro definitions.
-#ifndef CUBLAS_TF32_TENSOR_OP_MATH
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
 #define CUBLAS_COMPUTE_16F CUDA_R_16F
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define cublasComputeType_t cudaDataType_t
-#endif
+#endif // CUDART_VERSION < 11020
+
 #endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
@ -200,45 +212,45 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

-#define CUDA_CHECK(err)                                                                 \
-    do {                                                                                \
-        cudaError_t err_ = (err);                                                       \
-        if (err_ != cudaSuccess) {                                                      \
-            int id;                                                                     \
-            cudaGetDevice(&id);                                                         \
-            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
-                cudaGetErrorString(err_));                                              \
-            fprintf(stderr, "current device: %d\n", id);                                \
-            GGML_ASSERT(!"CUDA error");                                                 \
-        }                                                                               \
-    } while (0)
-
 #if CUDART_VERSION >= 12000
-#define CUBLAS_CHECK(err)                                                               \
-    do {                                                                                \
-        cublasStatus_t err_ = (err);                                                    \
-        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
-            int id;                                                                     \
-            cudaGetDevice(&id);                                                         \
-            fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n",                         \
-                    err_, __FILE__, __LINE__, cublasGetStatusString(err_));             \
-            fprintf(stderr, "current device: %d\n", id);                                \
-            GGML_ASSERT(!"cuBLAS error");                                               \
-        }                                                                               \
-    } while (0)
+    static const char * cublas_get_error_str(const cublasStatus_t err) {
+        return cublasGetStatusString(err);
+    }
 #else
-#define CUBLAS_CHECK(err)                                                               \
-    do {                                                                                \
-        cublasStatus_t err_ = (err);                                                    \
-        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
-            int id;                                                                     \
-            cudaGetDevice(&id);                                                         \
-            fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
-            fprintf(stderr, "current device: %d\n", id);                                \
-            GGML_ASSERT(!"cuBLAS error");                                               \
-        }                                                                               \
-    } while (0)
-#endif // CUDART_VERSION >= 11
+    static const char * cublas_get_error_str(const cublasStatus_t err) {
+        switch (err) {
+            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+            default: return "unknown error";
+        }
+    }
+#endif // CUDART_VERSION >= 12000
+
+[[noreturn]]
+static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
+    fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
+    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
+    GGML_ASSERT(!"CUDA error");
+}
+
+#define CUDA_CHECK(err)   do { auto err_ = (err); if (err_ != cudaSuccess)           ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cudaGetErrorString(err_));   } while (0)
+#define CUBLAS_CHECK(err) do { auto err_ = (err); if (err_ != CUBLAS_STATUS_SUCCESS) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cublas_get_error_str(err_)); } while (0)
+
+#if !defined(GGML_USE_HIPBLAS)
+static const char * cu_get_error_str(CUresult err) {
+    const char * err_str;
+    cuGetErrorString(err, &err_str);
+    return err_str;
+}
+#define CU_CHECK(err)     do { auto err_ = (err); if (err_ != CUDA_SUCCESS)          ggml_cuda_error(#err, __func__, __FILE__, __LINE__, cu_get_error_str(err_));     } while (0)
+#endif

 #if CUDART_VERSION >= 11100
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@ -516,9 +528,17 @@ inline cudaError_t ggml_cuda_set_device(const int device) {

 static int g_device_count = -1;
 static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

+struct cuda_device_capabilities {
+    int     cc;                 // compute capability
+    bool    vmm;                // virtual memory support
+    size_t  vmm_granularity;    // granularity of virtual memory
+};
+
+static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
+
+
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
@ -5875,7 +5895,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -5920,7 +5940,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -5965,7 +5985,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6010,7 +6030,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6055,7 +6075,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6100,7 +6120,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6147,7 +6167,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6193,7 +6213,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6238,7 +6258,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6283,7 +6303,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    int mmq_x, mmq_y, nwarps;
    if (compute_capability >= CC_RDNA2) {
@ -6543,21 +6563,24 @@ struct scoped_spin_lock {
    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };

+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// #define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
    void * ptr = nullptr;
    size_t size = 0;
 };

 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};

-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
 #ifdef DEBUG_CUDA_MALLOC
    int nnz = 0;
-    size_t max_size = 0, tot_size = 0;
+    size_t max_size = 0;
 #endif
    size_t best_diff = 1ull << 36;
    int ibest = -1;
@ -6566,7 +6589,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
        if (b.ptr != nullptr) {
 #ifdef DEBUG_CUDA_MALLOC
            ++nnz;
-            tot_size += b.size;
            if (b.size > max_size) max_size = b.size;
 #endif
            if (b.size >= size) {
@ -6593,19 +6615,20 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
        b.size = 0;
        return ptr;
    }
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
-#endif
    void * ptr;
    size_t look_ahead_size = (size_t) (1.05 * size);
    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
    *actual_size = look_ahead_size;
+    g_cuda_pool_size[id] += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
    return ptr;
 }

-static void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));
@ -6620,8 +6643,152 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    }
    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
    CUDA_CHECK(cudaFree(ptr));
+    g_cuda_pool_size[id] -= size;
 }

+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
+static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+
+static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+    const size_t alignment = 128;
+    size = alignment * ((size + alignment - 1) / alignment);
+
+    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
+
+    if (size > avail) {
+        // round up to the next multiple of the granularity
+        size_t reserve_size = size - avail;
+        const size_t granularity = g_device_caps[id].vmm_granularity;
+        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+
+        // allocate more physical memory
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = id;
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+        // reserve virtual address space (if not already reserved)
+        if (g_cuda_pool_addr[id] == 0) {
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+        }
+
+        // map at the end of the pool
+        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, 0, handle, 0));
+
+        // set access
+        CUmemAccessDesc access = {};
+        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access.location.id = id;
+        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1));
+
+        // add to the pool
+        g_cuda_pool_handles[id].push_back(handle);
+        g_cuda_pool_size[id] += reserve_size;
+
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
+    }
+
+    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
+
+    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
+    *actual_size = size;
+    g_cuda_pool_used[id] += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
+#endif
+
+    return ptr;
+}
+
+static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
+#endif
+
+    g_cuda_pool_used[id] -= size;
+
+    // all deallocations must be in reverse order of the allocations
+    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+}
+
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_caps[id].vmm) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
+}
+
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_caps[id].vmm) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
+#endif // !defined(GGML_USE_HIPBLAS)
+
+template<typename T>
+struct cuda_pool_alloc {
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    cuda_pool_alloc(size_t size) {
+        alloc(size);
+    }
+
+    ~cuda_pool_alloc() {
+        if (ptr != nullptr) {
+            ggml_cuda_pool_free(ptr, actual_size);
+        }
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    cuda_pool_alloc() = default;
+    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
+    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
+};
+
 static bool g_cublas_loaded = false;

 bool ggml_cublas_loaded(void) {
@ -6660,16 +6827,33 @@ void ggml_init_cublas() {
 #endif
        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
        for (int id = 0; id < g_device_count; ++id) {
+            int device_vmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+            CUdevice device;
+            CU_CHECK(cuDeviceGet(&device, id));
+            CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+
+            if (device_vmm) {
+                CUmemAllocationProp alloc_prop = {};
+                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                alloc_prop.location.id = id;
+                CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+            }
+#endif // !defined(GGML_USE_HIPBLAS)
+            g_device_caps[id].vmm = !!device_vmm;
+
            cudaDeviceProp prop;
            CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-            fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+            fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

            g_tensor_split[id] = total_vram;
            total_vram += prop.totalGlobalMem;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
-            g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+            g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        }
        for (int id = 0; id < g_device_count; ++id) {
@ -6729,8 +6913,7 @@ void * ggml_cuda_host_malloc(size_t size) {
    void * ptr = nullptr;
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
-        // The allocation error can be bypassed. A null ptr will assigned out of this function.
-        // This can fixed the OOM error in WSL.
+        // clear the error
        cudaGetLastError();
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
            size/1024.0/1024.0, cudaGetErrorString(err));
@ -7179,11 +7362,11 @@ static int64_t get_row_rounding(ggml_type type) {
    int64_t max_compute_capability = INT_MIN;
    for (int64_t id = 0; id < g_device_count; ++id) {
        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_compute_capabilities[id]) {
-                min_compute_capability = g_compute_capabilities[id];
+            if (min_compute_capability > g_device_caps[id].cc) {
+                min_compute_capability = g_device_caps[id].cc;
            }
-            if (max_compute_capability < g_compute_capabilities[id]) {
-                max_compute_capability = g_compute_capabilities[id];
+            if (max_compute_capability < g_device_caps[id].cc) {
+                max_compute_capability = g_device_caps[id].cc;
            }
        }
    }
@ -7298,8 +7481,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(

    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
-    size_t ash;
-    dfloat * src1_dfloat = nullptr; // dfloat == half
+    cuda_pool_alloc<half> src1_dfloat_a;
+    half * src1_dfloat = nullptr; // dfloat == half

    bool src1_convert_f16 =
        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
@ -7307,7 +7490,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;

    if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        src1_dfloat = src1_dfloat_a.alloc(ne00);
        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
                                ne00, 1, sizeof(float), 0, 0,
                                ne00, 1, sizeof(half),  0, 0, stream);
@ -7355,12 +7538,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
            break;
    }

-#ifdef GGML_CUDA_F16
-    if (src1_convert_f16) {
-        ggml_cuda_pool_free(src1_dfloat, ash);
-    }
-#endif // GGML_CUDA_F16
-
    (void) src1;
    (void) dst;
    (void) src1_ddq_i;
@ -7391,33 +7568,30 @@ inline void ggml_cuda_op_mul_mat_cublas(
    // ldc == nrows of the matrix that cuBLAS writes into
    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;

-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;

    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        half * src0_as_f16 = nullptr;
-        size_t src0_as = 0;
+        cuda_pool_alloc<half> src0_as_f16;
        if (src0->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
-            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+            src0_as_f16.alloc(ne);
+            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
        }
-        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
+        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();

-        half * src1_as_f16 = nullptr;
-        size_t src1_as = 0;
+        cuda_pool_alloc<half> src1_as_f16;
        if (src1->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+            src1_as_f16.alloc(ne);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
        }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
-        size_t dst_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
+        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
+        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);

        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;
@ -7426,36 +7600,25 @@ inline void ggml_cuda_op_mul_mat_cublas(
        CUBLAS_CHECK(
            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                    row_diff, src1_ncols, ne10,
-                    &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
-                                src1_ptr, CUDA_R_16F, ne10,
-                    &beta_f16,   dst_f16, CUDA_R_16F, ldc,
+                    &alpha_f16, src0_ptr,       CUDA_R_16F, ne00,
+                                src1_ptr,       CUDA_R_16F, ne10,
+                    &beta_f16,   dst_f16.get(), CUDA_R_16F, ldc,
                    CUBLAS_COMPUTE_16F,
                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));

        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_as_f16, src0_as);
-        }
-
-        if (src1_as != 0) {
-            ggml_cuda_pool_free(src1_as_f16, src1_as);
-        }
+        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
    }
    else {
-        float * src0_ddq_as_f32 = nullptr;
-        size_t src0_as = 0;
+        cuda_pool_alloc<float> src0_ddq_as_f32;

        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+            src0_ddq_as_f32.alloc(row_diff*ne00);
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
        }
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();

        const float alpha = 1.0f;
        const float beta = 0.0f;
@ -7467,10 +7630,6 @@ inline void ggml_cuda_op_mul_mat_cublas(
                    &alpha, src0_ddf_i, ne00,
                            src1_ddf_i, ne10,
                    &beta,  dst_dd_i,   ldc));
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
-        }
    }

    (void) dst;
@ -7762,18 +7921,17 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
    float * src1_ddf = nullptr;
    float *  dst_ddf = nullptr;

-    // as = actual size
-    size_t src0_asf = 0;
-    size_t src1_asf = 0;
-    size_t  dst_asf = 0;
+    cuda_pool_alloc<float> src0_f;
+    cuda_pool_alloc<float> src1_f;
+    cuda_pool_alloc<float>  dst_f;

    ggml_cuda_set_device(g_main_device);
-    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    if (src0_on_device) {
        src0_ddf = (float *) src0_extra->data_device[g_main_device];
    } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        src0_ddf = src0_f.alloc(ggml_nelements(src0));
        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
    }

@ -7781,14 +7939,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
        if (src1_on_device) {
            src1_ddf = (float *) src1_extra->data_device[g_main_device];
        } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            src1_ddf = src1_f.alloc(ggml_nelements(src1));
            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
        }
    }
    if (dst_on_device) {
        dst_ddf = (float *) dst_extra->data_device[g_main_device];
    } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
    }

    // do the computation
@ -7800,16 +7958,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
    }

-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
-    }
-    if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
-    }
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-
    if (dst->backend == GGML_BACKEND_CPU) {
        CUDA_CHECK(cudaDeviceSynchronize());
    }
@ -7937,12 +8085,16 @@ static void ggml_cuda_op_mul_mat(

            if (id != 0) {
                row_low[id]  = ne01*g_tensor_split[id];
-                row_low[id] -= row_low[id] % rounding;
+                if (row_low[id] < ne01) {
+                    row_low[id] -= row_low[id] % rounding;
+                }
            }

            if (id != g_device_count - 1) {
                row_high[id]  = ne01*g_tensor_split[id + 1];
-                row_high[id] -= row_high[id] % rounding;
+                if (row_high[id] < ne01) {
+                    row_high[id] -= row_high[id] % rounding;
+                }
            }
        }
    }
@ -8119,17 +8271,17 @@ static void ggml_cuda_op_mul_mat(
        CUDA_CHECK(ggml_cuda_set_device(id));

        // free buffers again when done
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
        }
        if (src1_asq[id] > 0) {
            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
        }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
        }
    }

@ -8382,14 +8534,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
    GGML_ASSERT(to_fp16_cuda != nullptr);

-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+    cuda_pool_alloc<half> src1_as_f16(ne1);
+    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);

-    size_t dst_as = 0;
-
-    half * dst_f16 = nullptr;
-    char * dst_t   = nullptr;
+    cuda_pool_alloc<half> dst_f16;
+    char * dst_t;

    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
    cudaDataType_t      cu_data_type    = CUDA_R_16F;
@ -8408,8 +8557,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    const void * beta  = &beta_f16;

    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-        dst_t   = (char *) dst_f16;
+        dst_t = (char *) dst_f16.alloc(ne);

        nbd2 /= sizeof(float) / sizeof(half);
        nbd3 /= sizeof(float) / sizeof(half);
@ -8456,9 +8604,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                alpha, (const char *) src0_as_f16, CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
-                       (const char *) src1_as_f16, CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                beta,  (      char *)       dst_t, cu_data_type, ne01,                dst->nb[2]/sizeof(float), // strideC
+                alpha, (const char *) src0_as_f16,       CUDA_R_16F,   nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
+                       (const char *) src1_as_f16.get(), CUDA_R_16F,   nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
+                beta,  (      char *)       dst_t,       cu_data_type, ne01,                dst->nb[2]/sizeof(float), // strideC
                ne12*ne13,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@ -8466,19 +8614,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        // use cublasGemmBatchedEx
        const int ne23 = ne12*ne13;

-        const void ** ptrs_src = nullptr;
-              void ** ptrs_dst = nullptr;
-
-        size_t ptrs_src_s = 0;
-        size_t ptrs_dst_s = 0;
-
-        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
+        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);

        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16, dst_t,
-                ptrs_src, ptrs_dst,
+                src0_as_f16, src1_as_f16.get(), dst_t,
+                ptrs_src.get(), ptrs_dst.get(),
                ne12, ne13,
                ne23,
                nb02, nb03,
@ -8490,30 +8632,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
-                       (const void **) (ptrs_src + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
-                beta,  (      void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/sizeof(half),
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   nb11/sizeof(float),
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                ne23,
                cu_compute_type,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-        }
-        if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-        }
    }
 #endif

    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
    }
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
 }

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -8526,8 +8657,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1

    int64_t min_compute_capability = INT_MAX;
    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            min_compute_capability = g_compute_capabilities[id];
+        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_device_caps[id].cc;
        }
    }

@ -8840,12 +8971,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
        }
    } else {
-        size_t as_src1, as_dst;
-        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
-        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
+        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
+        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));

-        src1_row_extra.data_device[g_main_device] = src1_contiguous;
-        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();

        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
@ -8865,7 +8995,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

                GGML_ASSERT(row_id >= 0 && row_id < n_as);

-                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
                                        nb11, src1_kind, stream));
                num_src1_rows++;
            }
@ -8897,14 +9027,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

                GGML_ASSERT(row_id >= 0 && row_id < n_as);

-                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
                                        nb1, dst_kind, stream));
                num_src1_rows++;
            }
        }
-
-        ggml_cuda_pool_free(src1_contiguous, as_src1);
-        ggml_cuda_pool_free(dst_contiguous,  as_dst);
    }

    if (dst->backend == GGML_BACKEND_CPU) {
@ -9670,12 +9797,16 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 // host buffer type

 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    CUDA_CHECK(cudaFreeHost(buffer->context));
+    ggml_cuda_host_free(buffer->context);
 }

 static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost(&ptr, size));
+    void * ptr = ggml_cuda_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }

    // FIXME: this is a hack to avoid having to implement a new buffer type
    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
--- a/ggml.c
+++ b/ggml.c
@ -17456,9 +17456,9 @@ static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g
 }

 //
-// ADAM
+// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
 //
-//   ref: https://arxiv.org/pdf/1412.6980.pdf
+// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
 //

 static enum ggml_opt_result ggml_opt_adam(
@ -19351,7 +19351,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                        }
                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
-                        free(data);
+                        free((void *)data);
                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
                        GGML_ASSERT(false && "nested arrays not supported");
                    } else {
--- a/ggml.h
+++ b/ggml.h
@ -255,6 +255,8 @@
 #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define GGML_UNREACHABLE() __assume(0)
 #else
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -96,6 +96,7 @@ class MODEL_ARCH(IntEnum):
    STABLELM  = auto()
    QWEN      = auto()
    PHI2      = auto()
+    PLAMO     = auto()


 class MODEL_TENSOR(IntEnum):
@ -142,6 +143,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.STABLELM:       "stablelm",
    MODEL_ARCH.QWEN:           "qwen",
    MODEL_ARCH.PHI2:           "phi2",
+    MODEL_ARCH.PLAMO:          "plamo",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -349,6 +351,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.PLAMO: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.GPT2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -82,6 +82,7 @@ class TensorNameMap:
            "model.layers.{bid}.ln1",                               # yi
            "h.{bid}.ln_1",                                         # gpt2
            "transformer.h.{bid}.ln",                               # phi2
+            "model.layers.layers.{bid}.norm",                       # plamo
        ),

        # Attention norm 2
@ -103,26 +104,29 @@ class TensorNameMap:

        # Attention query
        MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj",       # llama-hf
-            "layers.{bid}.attention.wq",                 # llama-pth
-            "encoder.layer.{bid}.attention.self.query",  # bert
-            "transformer.h.{bid}.attn.q_proj",           # gpt-j
+            "model.layers.{bid}.self_attn.q_proj",         # llama-hf
+            "layers.{bid}.attention.wq",                   # llama-pth
+            "encoder.layer.{bid}.attention.self.query",    # bert
+            "transformer.h.{bid}.attn.q_proj",             # gpt-j
+            "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
        ),

        # Attention key
        MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj",     # llama-hf
-            "layers.{bid}.attention.wk",               # llama-pth
-            "encoder.layer.{bid}.attention.self.key",  # bert
-            "transformer.h.{bid}.attn.k_proj",         # gpt-j
+            "model.layers.{bid}.self_attn.k_proj",         # llama-hf
+            "layers.{bid}.attention.wk",                   # llama-pth
+            "encoder.layer.{bid}.attention.self.key",      # bert
+            "transformer.h.{bid}.attn.k_proj",             # gpt-j
+            "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
        ),

        # Attention value
        MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj",       # llama-hf
-            "layers.{bid}.attention.wv",                 # llama-pth
-            "encoder.layer.{bid}.attention.self.value",  # bert
-            "transformer.h.{bid}.attn.v_proj",           # gpt-j
+            "model.layers.{bid}.self_attn.v_proj",         # llama-hf
+            "layers.{bid}.attention.wv",                   # llama-pth
+            "encoder.layer.{bid}.attention.self.value",    # bert
+            "transformer.h.{bid}.attn.v_proj",             # gpt-j
+            "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
        ),

        # Attention output
@ -139,12 +143,14 @@ class TensorNameMap:
            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
            "h.{bid}.attn.c_proj",                                       # gpt2
            "transformer.h.{bid}.mixer.out_proj",                        # phi2
+            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
        ),

        # Rotary embeddings
        MODEL_TENSOR.ATTN_ROT_EMBD: (
-            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",   # llama-hf
-            "layers.{bid}.attention.inner_attention.rope.freqs",  # llama-pth
+            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",        # llama-hf
+            "layers.{bid}.attention.inner_attention.rope.freqs",       # llama-pth
+            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
        ),

        # Feed-forward norm
@ -181,6 +187,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.w1",                             # qwen
            "h.{bid}.mlp.c_fc",                                       # gpt2
            "transformer.h.{bid}.mlp.fc1",                            # phi2
+            "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
@ -193,6 +200,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact
            "layers.{bid}.feed_forward.w1",               # llama-pth
            "transformer.h.{bid}.mlp.w2",                 # qwen
+            "model.layers.layers.{bid}.mlp.gate_proj",    # plamo
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
@ -214,6 +222,7 @@ class TensorNameMap:
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
            "h.{bid}.mlp.c_proj",                                     # gpt2
            "transformer.h.{bid}.mlp.fc2",                            # phi2
+            "model.layers.layers.{bid}.mlp.down_proj",                # plamo
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
--- a/llama.cpp
+++ b/llama.cpp
@ -198,6 +198,7 @@ enum llm_arch {
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_PHI2,
+    LLM_ARCH_PLAMO,
    LLM_ARCH_UNKNOWN,
 };

@ -216,6 +217,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
    { LLM_ARCH_STABLELM,        "stablelm"  },
    { LLM_ARCH_QWEN,            "qwen"      },
    { LLM_ARCH_PHI2,            "phi2"      },
+    { LLM_ARCH_PLAMO,           "plamo"     },
 };

 enum llm_kv {
@ -576,6 +578,24 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_PLAMO,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },

    {
        LLM_ARCH_UNKNOWN,
@ -1186,21 +1206,27 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 }

 static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
 #ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
-        return ggml_backend_metal_buffer_type();
+        buft = ggml_backend_metal_buffer_type();
    }
 #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
    if (n_gpu_layers > 0) {
-        return ggml_backend_cuda_buffer_type(0);
+        buft = ggml_backend_cuda_buffer_type(0);
    }
 #elif defined(GGML_USE_CUBLAS)
-    return ggml_backend_cuda_host_buffer_type();
+    buft = ggml_backend_cuda_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
-    return ggml_backend_cpu_hbm_buffer_type();
+    buft = ggml_backend_cpu_hbm_buffer_type();
 #endif

-    return ggml_backend_cpu_buffer_type();
+    if (buft == nullptr) {
+        buft = ggml_backend_cpu_buffer_type();
+    }
+
+    return buft;

    GGML_UNUSED(n_gpu_layers);
 }
@ -1288,7 +1314,7 @@ struct llama_hparams {
        if (this->rope_finetuned  != other.rope_finetuned)  return true;
        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

-        const float EPSILON = 1e-9;
+        const float EPSILON = 1e-9f;

        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
@ -2760,6 +2786,15 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_PLAMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+               }
+            } break;
        case LLM_ARCH_GPT2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@ -3652,6 +3687,51 @@ static bool llm_load_tensors(
                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend);
                    }
                } break;
+            case LLM_ARCH_PLAMO:
+                {
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+                    // output
+                    {
+                        ggml_backend_type backend_norm;
+                        ggml_backend_type backend_output;
+
+                        if (n_gpu_layers > int(n_layer)) {
+                            backend_norm   = llama_backend_offload;
+                            backend_output = llama_backend_offload_split;
+                        } else {
+                            backend_norm   = GGML_BACKEND_CPU;
+                            backend_output = GGML_BACKEND_CPU;
+                        }
+
+                        model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                        model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+                    }
+
+                    const uint32_t n_ff = hparams.n_ff;
+
+                    const int i_gpu_start = n_layer - n_gpu_layers;
+
+                    model.layers.resize(n_layer);
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                        layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd},     backend_split);
+                        layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                        layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                    }
+                } break;
            case LLM_ARCH_GPT2:
                {
                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},             GGML_BACKEND_CPU);
@ -5650,6 +5730,109 @@ struct llm_build_context {
        return gf;
    }

+    struct ggml_cgraph * build_plamo() {
+        struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            struct ggml_tensor * attention_norm = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
+                        n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+            struct ggml_tensor * sa_out = cur;
+
+            cur = attention_norm;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
    struct ggml_cgraph * build_gpt2() {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@ -6258,6 +6441,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_phi2();
            } break;
+        case LLM_ARCH_PLAMO:
+            {
+                result = llm.build_plamo();
+            } break;
        case LLM_ARCH_GPT2:
            {
                result = llm.build_gpt2();
@ -10497,7 +10684,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                std::string result = model->vocab.id_to_token[token].text;
                llama_unescape_whitespace(result);
                if (length < (int) result.length()) {
-                    return -result.length();
+                    return -(int) result.length();
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
@ -10527,7 +10714,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                std::string result = model->vocab.id_to_token[token].text;
                result = llama_decode_text(result);
                if (length < (int) result.length()) {
-                    return -result.length();
+                    return -(int) result.length();
                }
                memcpy(buf, result.c_str(), result.length());
                return result.length();
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@ -883,9 +883,6 @@ int main(int argc, const char ** argv) {
            srand(seed);
            const int nargs = 1;

-            int64_t ne2[4];
-            ne2[0] = 1;
-
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);