Merge branch 'ggerganov:master' into master

2023-07-29 10:15:57 +02:00 · 2023-07-29 10:15:57 +02:00 · a4e9c92292
commit a4e9c92292
parent 9f894cca30 8a88e5855c
38 changed files with 4130 additions and 1495 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -197,6 +197,8 @@ jobs:
    strategy:
      matrix:
        include:
          - build: 'noavx'
            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
          - build: 'avx2'
            defines: '-DLLAMA_BUILD_SERVER=ON'
          - build: 'avx'
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -357,6 +357,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
            -Wmissing-prototypes
        )
        set(cxx_flags
            -Wall
--- a/10
+++ b/10
@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
 endif
 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
 			-Wmissing-prototypes
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
 # OS specific
@ -323,6 +324,9 @@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
@ -333,7 +337,7 @@ clean:
 # Examples
 #
-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
@ -357,7 +361,7 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
--- a/README.md
+++ b/README.md
@ -77,6 +77,7 @@ as the main playground for developing new features for the [ggml](https://github
 **Supported models:**
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
@ -650,6 +651,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
 ### Obtaining and using the Facebook LLaMA 2 model
 - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
 - Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
 - Specify `-eps 1e-5` for best generation quality
 - Specify `-gqa 8` for 70B models to work
 ### Verifying the model files
 Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
--- a/convert.py
+++ b/convert.py
@ -133,7 +133,7 @@ TENSORS_SET = set(TENSORS_LIST)
 def find_n_mult(n_ff: int, n_embd: int) -> int:
    # hardcoded magic range
-    for n_mult in range(256, 1, -1):
+    for n_mult in range(8192, 1, -1):
        calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
        if calc_ff == n_ff:
            return n_mult
@ -146,6 +146,7 @@ class Params:
    n_mult:    int
    n_head:    int
    n_layer:   int
    n_kv_head: Optional[int]  # This parameter is only used for Llama 2
    @staticmethod
    def guessed(model: 'LazyModel') -> 'Params':
@ -172,6 +173,7 @@ class Params:
            n_mult    = 256,
            n_head    = n_head,
            n_layer   = n_layer,
            n_kv_head = None,
        )
    @staticmethod
@ -183,6 +185,7 @@ class Params:
        n_head  = config["num_attention_heads"];
        n_layer = config["num_hidden_layers"];
        n_ff    = config["intermediate_size"];
        n_kv_head = config.get("num_key_value_heads")
        n_mult = find_n_mult(n_ff, n_embd);
@ -192,6 +195,7 @@ class Params:
            n_mult    = n_mult,
            n_head    = n_head,
            n_layer   = n_layer,
            n_kv_head = n_kv_head,
        )
    # LLaMA v2 70B params.json
@ -215,6 +219,7 @@ class Params:
            n_mult    = n_mult,
            n_head    = n_head,
            n_layer   = n_layer,
            n_kv_head = None,
        )
    @staticmethod
@ -234,13 +239,20 @@ class Params:
 class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path], vocabtype: Optional[str]) -> None:
        self.vocabtype = vocabtype
        if self.vocabtype == "bpe":
          self.sentencepiece_tokenizer = json.loads(open(str(fname_tokenizer)).read())
        else:
          self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: Dict[str, int]
        if fname_added_tokens is not None:
            added_tokens = json.load(open(fname_added_tokens))
        else:
            added_tokens = {}
        if self.vocabtype == "bpe":
          vocab_size: int = len(self.sentencepiece_tokenizer)
        else:
          vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids = sorted(added_tokens.values())
@ -255,6 +267,16 @@ class SentencePieceVocab:
    def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
        tokenizer = self.sentencepiece_tokenizer
        if self.vocabtype == "bpe":
          from transformers.models.gpt2 import tokenization_gpt2
          byte_encoder = tokenization_gpt2.bytes_to_unicode()
          byte_decoder = {v: k for k, v in byte_encoder.items()}
          for i, item in enumerate(tokenizer):
            text: bytes
            text = b''.join([x.to_bytes(1, byteorder='big') for x in [byte_decoder[y] for y in item]])
            score: float = -i
            yield text, score
        else:
          for i in range(tokenizer.vocab_size()):
              text: bytes
              if tokenizer.is_unknown(i):
@ -300,7 +322,9 @@ class GGMLVocab:
 Vocab = Union[SentencePieceVocab, GGMLVocab]
def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
    """Reorder the rows of an attention projection matrix, one head at a time.

    The rows are viewed as ``(heads, 2, rows_per_head // 2, ...)``, axes 1 and 2
    are swapped, and the result is reshaped back to the input shape, so the two
    halves of every head's rows end up interleaved.

    Args:
        weights:   matrix whose first dimension is ``heads * rows_per_head``;
                   it must be divisible by ``2 * heads``.
        n_head:    number of attention (query) heads.
        n_kv_head: number of key/value heads. Only the key projection is
                   converted with this argument set; when it differs from
                   ``n_head`` the matrix holds ``n_kv_head`` heads.

    Returns:
        An array with the same shape as ``weights`` and permuted rows.
    """
    # With grouped-query attention the key projection has n_kv_head heads, so
    # the per-head reshape must use that count directly. The previous
    # `n_head //= n_kv_head` only produced it when n_head == n_kv_head ** 2
    # (e.g. 64 / 8) and mixed rows across heads for other ratios (e.g. 32 / 8).
    if n_kv_head is not None and n_head != n_kv_head:
        n_head = n_kv_head
    rows_per_half = weights.shape[0] // n_head // 2
    return (weights.reshape(n_head, 2, rows_per_half, *weights.shape[1:])
                .swapaxes(1, 2)
                .reshape(weights.shape))
@ -351,7 +375,7 @@ class Tensor(metaclass=ABCMeta):
    @abstractmethod
    def astype(self, data_type: DataType) -> 'Tensor': ...
    @abstractmethod
-    def permute(self, n_head: int) -> 'Tensor': ...
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
    @abstractmethod
    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
    @abstractmethod
@ -389,8 +413,8 @@ class UnquantizedTensor(Tensor):
        r = self.ndarray.shape[0] // 3
        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-    def permute(self, n_head: int) -> 'UnquantizedTensor':
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
-        return UnquantizedTensor(permute(self.ndarray, n_head))
+        return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
 def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@ -438,26 +462,27 @@ class GGMLQuantizedTensor(Tensor):
    def to_ggml(self) -> 'GGMLQuantizedTensor':
        return self
-    def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
-        return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
+        return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
 GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
 class DeferredPermutedTensor(Tensor):
-    def __init__(self, base: Tensor, n_head: int) -> None:
+    def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
        self.base = base
        self.n_head = n_head
        self.n_kv_head = n_kv_head
        self.data_type = self.base.data_type
    def astype(self, data_type: DataType) -> Tensor:
-        return self.base.astype(data_type).permute(self.n_head)
+        return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
    def to_ggml(self) -> GGMLCompatibleTensor:
-        return self.base.to_ggml().permute(self.n_head)
+        return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
        raise Exception("shouldn't permute twice")
@ -549,8 +574,8 @@ class GPTQForLLaMaQuantizedTensor(Tensor):
        ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
        return ret
-    def permute(self, n_head: int) -> Tensor:
+    def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
-        return DeferredPermutedTensor(self, n_head)
+        return DeferredPermutedTensor(self, n_head, n_kv_head)
    def to_ggml(self) -> GGMLQuantizedTensor:
        # The output format looks like this:
@ -681,10 +706,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
    return ModelPlus(model, paths, format, vocab)
-def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
+def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
    def load() -> Tensor:
-        return lazy_tensor.load().permute(n_head)
+        return lazy_tensor.load().permute(n_head, n_kv_head)
-    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
    def load() -> Tensor:
@ -709,7 +734,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
-            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
@ -1196,14 +1221,18 @@ def filter_and_sort_tensors(model: LazyModel) -> LazyModel:
    return {name: model[name] for name in TENSORS_LIST if name in model}
-def load_vocab(path: Path) -> SentencePieceVocab:
+def load_vocab(path: Path, vocabtype: Optional[str]) -> SentencePieceVocab:
    print(f"vocabtype: {vocabtype}")
    # Be extra-friendly and accept either a file or a directory.  Also, if it's
    # a directory, it might be the model directory, and tokenizer.model might
    # be in the parent of that.
    if path.is_dir():
-        path2 = path / "tokenizer.model"
+        vocab_file = "tokenizer.model"
        if vocabtype == 'bpe':
          vocab_file = "vocab.json"
        path2 = path / vocab_file
        # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / "tokenizer.model"
+        path3 = path.parent / vocab_file
        if path2.exists():
            path = path2
        elif path3.exists():
@ -1214,7 +1243,8 @@ def load_vocab(path: Path) -> SentencePieceVocab:
                "if it's in another directory, pass the directory as --vocab-dir")
    added_tokens_path = path.parent / "added_tokens.json"
    print(f"Loading vocab file {path}")
-    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None,
                              vocabtype)
 def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
@ -1252,6 +1282,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path,
                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--vocabtype", default='spm', choices=["spm", "bpe"], help="vocab format (default: spm)")
    args = parser.parse_args(args_in)
    vocab: Vocab
@ -1259,7 +1290,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
    elif args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
        assert args.outfile, "need --outfile if using --vocab-only"
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, vocab)
@ -1273,7 +1304,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
            vocab = model_plus.vocab
        else:
            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-            vocab = load_vocab(vocab_dir)
+            vocab = load_vocab(vocab_dir, args.vocabtype)
        params = Params.load(model_plus)
        model = model_plus.model
        model = do_necessary_conversions(model, params)
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -13,6 +13,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
    common.h
    common.cpp
    grammar-parser.h
    grammar-parser.cpp
    )
 if (BUILD_SHARED_LIBS)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -8,6 +8,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 #ifdef LLAMA_DEFAULT_RMS_EPS
 static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 #else
 static const float rms_norm_eps = 5e-6f;
 #endif
 float frand() {
    return (float)rand()/(float)RAND_MAX;
 }
@ -562,7 +568,7 @@ struct ggml_tensor * forward(
        // norm
        {
            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
@ -685,7 +691,7 @@ struct ggml_tensor * forward(
            // norm
            {
                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
@ -729,7 +735,7 @@ struct ggml_tensor * forward(
    {
        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
@ -817,7 +823,7 @@ struct ggml_tensor * forward_batch(
        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = attention_norm*cur
@ -981,7 +987,7 @@ struct ggml_tensor * forward_batch(
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);
                // cur = ffn_norm*cur
@ -1034,7 +1040,7 @@ struct ggml_tensor * forward_batch(
    {
        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);
        // inpL = norm*inpL
@ -1104,7 +1110,7 @@ struct ggml_tensor * forward_lora(
        // norm
        {
            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
@ -1251,7 +1257,7 @@ struct ggml_tensor * forward_lora(
            // norm
            {
                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
@ -1295,7 +1301,7 @@ struct ggml_tensor * forward_lora(
    {
        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -177,6 +177,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_gqa = std::stoi(argv[i]);
        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rms_norm_eps = std::stof(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@ -396,8 +402,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
-        } else if (arg == "--perplexity-lines") {
+        } else if (arg == "--hellaswag") {
-            params.perplexity_lines = true;
+            params.hellaswag = true;
        } else if (arg == "--hellaswag-tasks") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.hellaswag_tasks = std::stoi(argv[i]);
        } else if (arg == "--ignore-eos") {
            params.logit_bias[llama_token_eos()] = -INFINITY;
        } else if (arg == "--no-penalize-nl") {
@ -426,6 +438,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
            params.input_prefix_bos = true;
        } else if (arg == "--in-prefix") {
            if (++i >= argc) {
                invalid_param = true;
@ -438,6 +452,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.input_suffix = argv[i];
        } else if (arg == "--grammar") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.grammar = argv[i];
        } else if (arg == "--grammar-file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::ifstream file(argv[i]);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                invalid_param = true;
                break;
            }
            std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
                std::back_inserter(params.grammar)
            );
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, default_params);
@ -489,6 +525,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
    fprintf(stdout, "  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stdout, "  -f FNAME, --file FNAME\n");
@ -497,6 +534,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@ -514,6 +552,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
@ -525,8 +565,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    fprintf(stdout, "  --perplexity-lines    compute perplexity over each line of the prompt\n");
+    fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    fprintf(stdout, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %d)\n", params.hellaswag_tasks);
    fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    if (llama_mlock_supported()) {
        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@ -591,6 +632,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    lparams.n_ctx           = params.n_ctx;
    lparams.n_batch         = params.n_batch;
    lparams.n_gqa           = params.n_gqa;
    lparams.rms_norm_eps    = params.rms_norm_eps;
    lparams.n_gpu_layers    = params.n_gpu_layers;
    lparams.main_gpu        = params.main_gpu;
    lparams.tensor_split    = params.tensor_split;
--- a/examples/common.h
+++ b/examples/common.h
@ -34,6 +34,7 @@ struct gpt_params {
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
@ -63,11 +64,15 @@ struct gpt_params {
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
    std::string input_prefix      = "";  // string to prefix user inputs with
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::string grammar           = "";  // optional BNF-like grammar to constrain sampling
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base    = "";  // base model path for the lora adapter
    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
    bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
@ -80,10 +85,10 @@ struct gpt_params {
    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool perplexity        = false; // compute perplexity over the prompt
    bool perplexity_lines  = false; // compute perplexity over each line of the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
--- a/examples/grammar-parser.cpp
+++ b/examples/grammar-parser.cpp
@ -0,0 +1,423 @@
 #include "grammar-parser.h"
 #include <cstdint>
 #include <cwchar>
 #include <string>
 #include <utility>
 #include <stdexcept>
 #include <exception>
 namespace grammar_parser {
    // Decodes the UTF-8 sequence that starts at `src`.
    // Returns the decoded code point together with a pointer just past the bytes consumed.
    // NOTE: the input is assumed to be valid UTF-8; the only check performed is that
    // a NUL terminator inside a multi-byte sequence stops the decoding early.
    // copied from llama.cpp
    std::pair<uint32_t, const char *> decode_utf8(const char * src) {
        // sequence length, indexed by the upper four bits of the lead byte
        static const int seq_len_by_high_nibble[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
        const uint8_t lead      = static_cast<uint8_t>(*src);
        const int     n_bytes   = seq_len_by_high_nibble[lead >> 4];
        const uint8_t lead_mask = (1 << (8 - n_bytes)) - 1;
        uint32_t      code_point = lead & lead_mask;
        const char *  cur        = src + 1;
        // fold in the continuation bytes (six payload bits each); bail out on NUL
        for (int i = 1; i < n_bytes && *cur; i++, cur++) {
            code_point = (code_point << 6) + (static_cast<uint8_t>(*cur) & 0x3F);
        }
        return std::make_pair(code_point, cur);
    }
    // Looks up the id of the symbol named by the `len` characters at `src`.
    // A name that has not been seen before is registered with the next free id
    // (the current number of known symbols).
    uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
        const std::string name(src, len);
        const auto known = state.symbol_ids.find(name);
        if (known != state.symbol_ids.end()) {
            return known->second;
        }
        const uint32_t fresh_id = static_cast<uint32_t>(state.symbol_ids.size());
        state.symbol_ids.emplace(name, fresh_id);
        return fresh_id;
    }
    // Registers a synthesized symbol named "<base_name>_<id>" and returns its id,
    // which is the number of symbols known before the call.
    uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
        const uint32_t new_id = static_cast<uint32_t>(state.symbol_ids.size());
        std::string synthesized_name = base_name;
        synthesized_name += '_';
        synthesized_name += std::to_string(new_id);
        state.symbol_ids[synthesized_name] = new_id;
        return new_id;
    }
    // Stores `rule` in slot `rule_id` of state.rules, growing the table first when
    // the slot does not exist yet. An existing rule in that slot is overwritten.
    void add_rule(
            parse_state & state,
            uint32_t      rule_id,
            const std::vector<llama_grammar_element> & rule) {
        auto & table = state.rules;
        if (rule_id >= table.size()) {
            table.resize(rule_id + 1);
        }
        table[rule_id] = rule;
    }
    // True for the characters allowed in a rule name: ASCII letters, digits and '-'.
    bool is_word_char(char c) {
        if (c == '-') {
            return true;
        }
        const bool is_lower = c >= 'a' && c <= 'z';
        const bool is_upper = c >= 'A' && c <= 'Z';
        const bool is_digit = c >= '0' && c <= '9';
        return is_lower || is_upper || is_digit;
    }
    // Reads exactly `size` hexadecimal digits starting at `src`.
    // Returns the parsed value and a pointer just past the last digit.
    // Throws std::runtime_error when fewer than `size` hex digits are available.
    std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
        const char * const stop = src + size;
        const char *       cur  = src;
        uint32_t           acc  = 0;
        while (cur < stop && *cur) {
            const char ch = *cur;
            uint32_t   digit;
            if ('0' <= ch && ch <= '9') {
                digit = ch - '0';
            } else if ('a' <= ch && ch <= 'f') {
                digit = ch - 'a' + 10;
            } else if ('A' <= ch && ch <= 'F') {
                digit = ch - 'A' + 10;
            } else {
                break; // not a hex digit: reported as an error below
            }
            acc = (acc << 4) + digit;
            cur++;
        }
        if (cur != stop) {
            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
        }
        return std::make_pair(acc, cur);
    }
    // Skips blanks, tabs and '#' comments starting at `src`; line breaks are skipped
    // too when `newline_ok` is set. Returns a pointer to the first character kept.
    const char * parse_space(const char * src, bool newline_ok) {
        const char * cur = src;
        for (;;) {
            const char ch = *cur;
            if (ch == '#') {
                // a comment runs up to, but not including, the line break (or end of input)
                do {
                    cur++;
                } while (*cur && *cur != '\r' && *cur != '\n');
            } else if (ch == ' ' || ch == '\t' || (newline_ok && (ch == '\r' || ch == '\n'))) {
                cur++;
            } else {
                return cur;
            }
        }
    }
    // Consumes a rule name (one or more word characters) starting at `src` and
    // returns a pointer just past it. Throws std::runtime_error if no name is present.
    const char * parse_name(const char * src) {
        const char * name_end = src;
        for ( ; is_word_char(*name_end); name_end++) {
        }
        if (name_end != src) {
            return name_end;
        }
        throw std::runtime_error(std::string("expecting name at ") + src);
    }
    // Parses a single (possibly escaped) character of a literal or character class.
    // Supported escapes: \xHH, \uHHHH, \UHHHHHHHH, \t, \r, \n, \\, \", \[ and \].
    // Returns the character value and a pointer just past the consumed input.
    // Throws std::runtime_error at end of input or on an unknown escape.
    std::pair<uint32_t, const char *> parse_char(const char * src) {
        if (*src == '\0') {
            throw std::runtime_error("unexpected end of input");
        }
        if (*src != '\\') {
            return decode_utf8(src);
        }
        const char   escape = src[1];
        const char * after  = src + 2;
        // numeric escapes: fixed number of hex digits
        if (escape == 'x') { return parse_hex(after, 2); }
        if (escape == 'u') { return parse_hex(after, 4); }
        if (escape == 'U') { return parse_hex(after, 8); }
        // control characters
        if (escape == 't') { return std::make_pair('\t', after); }
        if (escape == 'r') { return std::make_pair('\r', after); }
        if (escape == 'n') { return std::make_pair('\n', after); }
        // characters that stand for themselves
        if (escape == '\\' || escape == '"' || escape == '[' || escape == ']') {
            return std::make_pair(escape, after);
        }
        throw std::runtime_error(std::string("unknown escape at ") + src);
    }
    // Forward declaration: parse_sequence() calls parse_alternates() for parenthesized
    // groups, and parse_alternates() in turn calls parse_sequence().
    const char * parse_alternates(
            parse_state       & state,
            const char        * src,
            const std::string & rule_name,
            uint32_t            rule_id,
            bool                is_nested);
    // Parses one sequence of grammar items starting at `src` and appends the resulting
    // elements to `out_elements`. Parsing stops at the first character that does not
    // start an item (for example '|' or ')', or a line break when `is_nested` is false).
    //
    //   state        - receives the ids and rules synthesized for groups and repetitions
    //   rule_name    - base name used for the synthesized symbols
    //   out_elements - elements of the enclosing rule; appended to, and rewritten in place
    //                  when a repetition operator replaces the preceding item
    //   is_nested    - forwarded to parse_space() as `newline_ok` after each item
    //
    // Returns a pointer to the first unconsumed character.
    // Throws std::runtime_error on malformed input.
    const char * parse_sequence(
            parse_state                        & state,
            const char                         * src,
            const std::string                  & rule_name,
            std::vector<llama_grammar_element> & out_elements,
            bool                                 is_nested) {
        // index in out_elements where the most recently parsed item begins; a following
        // repetition operator wraps everything from here to the end
        size_t last_sym_start = out_elements.size();
        const char * pos = src;
        while (*pos) {
            if (*pos == '"') { // literal string
                pos++;
                last_sym_start = out_elements.size();
                // one CHAR element per character; parse_char() throws at end of input,
                // so an unterminated literal cannot run past the buffer
                while (*pos != '"') {
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
                }
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '[') { // char range(s)
                pos++;
                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
                if (*pos == '^') {
                    // negated character class
                    pos++;
                    start_type = LLAMA_GRETYPE_CHAR_NOT;
                }
                last_sym_start = out_elements.size();
                while (*pos != ']') {
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    // first member carries the class type, later members are alternatives
                    enum llama_gretype type = last_sym_start < out_elements.size()
                        ? LLAMA_GRETYPE_CHAR_ALT
                        : start_type;
                    out_elements.push_back({type, char_pair.first});
                    // "a-z" style range; a '-' directly before ']' is a literal '-'
                    if (pos[0] == '-' && pos[1] != ']') {
                        auto endchar_pair = parse_char(pos + 1);
                             pos          = endchar_pair.second;
                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
                    }
                }
                pos = parse_space(pos + 1, is_nested);
            } else if (is_word_char(*pos)) { // rule reference
                const char * name_end    = parse_name(pos);
                // the referenced rule may not be defined yet; it only gets an id here
                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
                pos = parse_space(name_end, is_nested);
                last_sym_start = out_elements.size();
                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
            } else if (*pos == '(') { // grouping
                // parse nested alternates into synthesized rule
                pos = parse_space(pos + 1, true);
                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
                last_sym_start = out_elements.size();
                // output reference to synthesized rule
                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
                if (*pos != ')') {
                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
                }
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                // nothing has been parsed since last_sym_start, so there is no item to repeat
                if (last_sym_start == out_elements.size()) {
                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
                }
                // apply transformation to previous symbol (last_sym_start to end) according to
                // rewrite rules:
                // S* --> S' ::= S S' |
                // S+ --> S' ::= S S' | S
                // S? --> S' ::= S |
                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
                std::vector<llama_grammar_element> sub_rule;
                // add preceding symbol to generated rule
                sub_rule.insert(
                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
                if (*pos == '*' || *pos == '+') {
                    // cause generated rule to recurse
                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
                }
                // mark start of alternate def
                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
                if (*pos == '+') {
                    // add preceding symbol as alternate only for '+' (otherwise empty)
                    sub_rule.insert(
                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
                }
                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
                add_rule(state, sub_rule_id, sub_rule);
                // in original rule, replace previous symbol with reference to generated rule
                out_elements.resize(last_sym_start);
                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
                pos = parse_space(pos + 1, is_nested);
            } else {
                // not the start of an item: leave it for the caller to interpret
                break;
            }
        }
        return pos;
    }
    // Parses one or more '|'-separated sequences starting at `src` and stores them,
    // terminated by LLAMA_GRETYPE_END, as rule `rule_id`.
    // Returns a pointer to the first unconsumed character.
    const char * parse_alternates(
            parse_state       & state,
            const char        * src,
            const std::string & rule_name,
            uint32_t            rule_id,
            bool                is_nested) {
        std::vector<llama_grammar_element> elements;
        const char * cur = src;
        for (;;) {
            cur = parse_sequence(state, cur, rule_name, elements, is_nested);
            if (*cur != '|') {
                break;
            }
            // another alternative follows: emit the separator and skip to its start
            elements.push_back({LLAMA_GRETYPE_ALT, 0});
            cur = parse_space(cur + 1, true);
        }
        elements.push_back({LLAMA_GRETYPE_END, 0});
        add_rule(state, rule_id, elements);
        return cur;
    }
    // Parses one rule definition of the form `name ::= alternates` starting at `src`,
    // including its terminating line break, and records it in `state`.
    // Returns a pointer to the start of the next rule (or to the end of input).
    // Throws std::runtime_error when "::=" or the terminating line break is missing.
    const char * parse_rule(parse_state & state, const char * src) {
        const char * const name_end = parse_name(src);
        const char *       cur      = parse_space(name_end, false);
        const std::string  name(src, name_end);
        const uint32_t     rule_id  = get_symbol_id(state, src, name.size());
        const bool has_assign = cur[0] == ':' && cur[1] == ':' && cur[2] == '=';
        if (!has_assign) {
            throw std::runtime_error(std::string("expecting ::= at ") + cur);
        }
        cur = parse_space(cur + 3, true);
        cur = parse_alternates(state, cur, name, rule_id, false);
        // a rule ends with CR, LF, CRLF or the end of the input
        switch (*cur) {
            case '\0':
                break;
            case '\n':
                cur++;
                break;
            case '\r':
                cur += cur[1] == '\n' ? 2 : 1;
                break;
            default:
                throw std::runtime_error(std::string("expecting newline or end at ") + cur);
        }
        return parse_space(cur, true);
    }
    // Parses a complete grammar from the NUL-terminated text `src`.
    //
    // Returns the populated parse_state on success. On any error (syntax error or a
    // reference to a rule that is never defined) a message is written to stderr and
    // a default-constructed, empty parse_state is returned, so callers can detect
    // failure by checking `rules.empty()`.
    parse_state parse(const char * src) {
        try {
            parse_state state;
            const char * pos = parse_space(src, true);
            while (*pos) {
                pos = parse_rule(state, pos);
            }
            // Reject references to rules that were never defined. get_symbol_id() hands
            // out an id the first time a name is seen, but only a definition fills the
            // matching slot of state.rules; an undefined name therefore leaves an empty
            // slot (or no slot at all), which c_rules() would expose as a pointer to no
            // elements instead of an END-terminated array.
            for (const auto & rule : state.rules) {
                for (const auto & elem : rule) {
                    if (elem.type != LLAMA_GRETYPE_RULE_REF) {
                        continue;
                    }
                    if (elem.value < state.rules.size() && !state.rules[elem.value].empty()) {
                        continue;
                    }
                    // recover the symbol name for the error message
                    for (const auto & kv : state.symbol_ids) {
                        if (kv.second == elem.value) {
                            throw std::runtime_error("undefined rule '" + kv.first + "'");
                        }
                    }
                    throw std::runtime_error("undefined rule id " + std::to_string(elem.value));
                }
            }
            return state;
        } catch (const std::exception & err) {
            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
            return parse_state();
        }
    }
    // Writes the character `c` to `file`: printable ASCII is written as-is, anything
    // else as a "<U+XXXX>" placeholder (at least four upper-case hex digits).
    void print_grammar_char(FILE * file, uint32_t c) {
        // printable ASCII is 0x20..0x7E; 0x7F (DEL) is a control character and must
        // not be written raw
        if (0x20 <= c && c <= 0x7e) {
            fprintf(file, "%c", static_cast<char>(c));
        } else {
            // cop out of encoding UTF-8
            // %X expects unsigned int, which uint32_t is not guaranteed to be
            fprintf(file, "<U+%04X>", static_cast<unsigned int>(c));
        }
    }
    // True when `elem` is one of the character-matching element types
    // (CHAR, CHAR_NOT, CHAR_ALT or CHAR_RNG_UPPER).
    bool is_char_element(llama_grammar_element elem) {
        const auto type = elem.type;
        return type == LLAMA_GRETYPE_CHAR
            || type == LLAMA_GRETYPE_CHAR_NOT
            || type == LLAMA_GRETYPE_CHAR_ALT
            || type == LLAMA_GRETYPE_CHAR_RNG_UPPER;
    }
    void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
        for (auto elem : rule) {
            switch (elem.type) {
                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
            }
            switch (elem.type) {
                case LLAMA_GRETYPE_END:
                case LLAMA_GRETYPE_ALT:
                case LLAMA_GRETYPE_RULE_REF:
                    fprintf(file, "(%u) ", elem.value);
                    break;
                case LLAMA_GRETYPE_CHAR:
                case LLAMA_GRETYPE_CHAR_NOT:
                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
                case LLAMA_GRETYPE_CHAR_ALT:
                    fprintf(file, "(\"");
                    print_grammar_char(file, elem.value);
                    fprintf(file, "\") ");
                    break;
            }
        }
        fprintf(file, "\n");
    }
    void print_rule(
            FILE     * file,
            uint32_t   rule_id,
            const std::vector<llama_grammar_element> & rule,
            const std::map<uint32_t, std::string>    & symbol_id_names) {
        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
            throw std::runtime_error(
                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
        }
        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
            llama_grammar_element elem = rule[i];
            switch (elem.type) {
                case LLAMA_GRETYPE_END:
                    throw std::runtime_error(
                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
                        std::to_string(i));
                case LLAMA_GRETYPE_ALT:
                    fprintf(file, "| ");
                    break;
                case LLAMA_GRETYPE_RULE_REF:
                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
                    break;
                case LLAMA_GRETYPE_CHAR:
                    fprintf(file, "[");
                    print_grammar_char(file, elem.value);
                    break;
                case LLAMA_GRETYPE_CHAR_NOT:
                    fprintf(file, "[^");
                    print_grammar_char(file, elem.value);
                    break;
                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
                    if (i == 0 || !is_char_element(rule[i - 1])) {
                        throw std::runtime_error(
                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
                            std::to_string(rule_id) + "," + std::to_string(i));
                    }
                    fprintf(file, "-");
                    print_grammar_char(file, elem.value);
                    break;
                case LLAMA_GRETYPE_CHAR_ALT:
                    if (i == 0 || !is_char_element(rule[i - 1])) {
                        throw std::runtime_error(
                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
                            std::to_string(rule_id) + "," + std::to_string(i));
                    }
                    print_grammar_char(file, elem.value);
                    break;
            }
            if (is_char_element(elem)) {
                switch (rule[i + 1].type) {
                    case LLAMA_GRETYPE_CHAR_ALT:
                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
                        break;
                    default:
                        fprintf(file, "] ");
                }
            }
        }
        fprintf(file, "\n");
    }
    // Writes every rule of `state` to `file`, one per line, in grammar syntax.
    // Errors raised while printing are reported on stderr instead of propagating.
    void print_grammar(FILE * file, const parse_state & state) {
        try {
            // invert symbol_ids (name -> id) so rules can be labelled by name
            std::map<uint32_t, std::string> names_by_id;
            for (auto it = state.symbol_ids.begin(); it != state.symbol_ids.end(); ++it) {
                names_by_id[it->second] = it->first;
            }
            // print_rule_binary() can be called here as well when debugging
            size_t rule_id = 0;
            for (const auto & rule : state.rules) {
                print_rule(file, rule_id, rule, names_by_id);
                rule_id++;
            }
        } catch (const std::exception & err) {
            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
        }
    }
    std::vector<const llama_grammar_element *> parse_state::c_rules() {
        std::vector<const llama_grammar_element *> ret;
        for (const auto & rule : rules) {
            ret.push_back(rule.data());
        }
        return ret;
    }
 }
--- a/examples/grammar-parser.h
+++ b/examples/grammar-parser.h
@ -0,0 +1,29 @@
 // Implements a parser for an extended Backus-Naur form (BNF), producing the
 // binary context-free grammar format specified by llama.h. Supports character
 // ranges, grouping, and repetition operators. As an example, a grammar for
 // arithmetic might look like:
 //
 // root  ::= expr
 // expr  ::= term ([-+*/] term)*
 // term  ::= num | "(" space expr ")" space
 // num   ::= [0-9]+ space
 // space ::= [ \t\n]*
 #pragma once
 #include "llama.h"
 #include <vector>
 #include <map>
 #include <cstdint>
 #include <string>
 namespace grammar_parser {
    // Result of parsing a grammar.
    struct parse_state {
        // rule name -> numeric rule id
        std::map<std::string, uint32_t>                 symbol_ids;
        // element list of each rule, indexed by rule id
        std::vector<std::vector<llama_grammar_element>> rules;
        // one pointer per entry of `rules`, pointing at that rule's elements;
        // the pointers refer to storage owned by `rules`
        std::vector<const llama_grammar_element *> c_rules();
    };
    // Parses the grammar text `src`. On a parse error a message is printed to stderr
    // and an empty (default-constructed) parse_state is returned.
    parse_state parse(const char * src);
    // Prints the rules of `state` to `file` in grammar syntax; printing errors are
    // reported on stderr.
    void print_grammar(FILE * file, const parse_state & state);
 }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -202,9 +202,9 @@ Example usage: `--top-p 0.95`
 -   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
-Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
+Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P, it tries to determine the bulk of the most likely tokens dynamically, but TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS.
-Example usage: `--tfs 2.0`
+Example usage: `--tfs 0.95`
 ### Locally Typical Sampling
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -6,6 +6,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
 #include "grammar-parser.h"
 #include <cassert>
 #include <cinttypes>
@ -324,6 +325,10 @@ int main(int argc, char ** argv) {
            }
        }
        if (params.input_prefix_bos) {
            fprintf(stderr, "Input prefix with BOS\n");
        }
        if (!params.input_prefix.empty()) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
@ -337,6 +342,31 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    fprintf(stderr, "\n\n");
    grammar_parser::parse_state parsed_grammar;
    llama_grammar *             grammar = NULL;
    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }
        fprintf(stderr, "%s: grammar:\n", __func__);
        grammar_parser::print_grammar(stderr, parsed_grammar);
        fprintf(stderr, "\n");
        {
            auto it = params.logit_bias.find(llama_token_eos());
            if (it != params.logit_bias.end() && it->second == -INFINITY) {
                fprintf(stderr,
                    "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }
        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        grammar = llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }
    // TODO: replace with ring-buffer
    std::vector<llama_token> last_n_tokens(n_ctx);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
@ -570,6 +600,10 @@ int main(int argc, char ** argv) {
                    logits[llama_token_nl()] = nl_logit;
                }
                if (grammar != NULL) {
                    llama_sample_grammar(ctx, &candidates_p, grammar);
                }
                if (temp <= 0) {
                    // Greedy sampling
                    id = llama_sample_token_greedy(ctx, &candidates_p);
@ -595,18 +629,12 @@ int main(int argc, char ** argv) {
                }
                // printf("`%d`", candidates_p.size);
-                last_n_tokens.erase(last_n_tokens.begin());
+                if (grammar != NULL) {
-                last_n_tokens.push_back(id);
+                    llama_grammar_accept_token(ctx, grammar, id);
                }
-            // replace end of text token with newline token when in interactive mode
+                last_n_tokens.erase(last_n_tokens.begin());
-            if (id == llama_token_eos() && params.interactive && !params.instruct) {
+                last_n_tokens.push_back(id);
                id = llama_token_newline.front();
                if (params.antiprompt.size() != 0) {
                    // tokenize and inject first reverse prompt
                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                }
            }
            // add it to the context
@ -674,11 +702,34 @@ int main(int argc, char ** argv) {
                }
            }
            // deal with end of text token in interactive mode
            if (last_n_tokens.back() == llama_token_eos()) {
                if (params.interactive) {
                    if (params.antiprompt.size() != 0) {
                        // tokenize and inject first reverse prompt
                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
                        is_antiprompt = true;
                    }
                    is_interacting = true;
                    printf("\n");
                    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
                    fflush(stdout);
                } else if (params.instruct) {
                    is_interacting = true;
                }
            }
            if (n_past > 0 && is_interacting) {
                if (params.instruct) {
                    printf("\n> ");
                }
                if (params.input_prefix_bos) {
                    embd_inp.push_back(llama_token_bos());
                }
                std::string buffer;
                if (!params.input_prefix.empty()) {
                    buffer += params.input_prefix;
@ -725,19 +776,27 @@ int main(int argc, char ** argv) {
            }
            if (n_past > 0) {
                if (is_interacting) {
                    // reset grammar state if we're restarting generation
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);
                        std::vector<const llama_grammar_element *> grammar_rules(
                            parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
                    }
                }
                is_interacting = false;
            }
        }
        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
            if (params.instruct) {
                is_interacting = true;
            } else {
            fprintf(stderr, " [end of text]\n");
            break;
        }
        }
        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
@ -756,6 +815,9 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);
    if (grammar != NULL) {
        llama_grammar_free(grammar);
    }
    llama_backend_free();
    return 0;
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -121,8 +121,23 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    printf("\n");
 }
-void perplexity_lines(llama_context * ctx, const gpt_params & params) {
+void hellaswag_score(llama_context * ctx, const gpt_params & params) {
-    // Calculates perplexity over each line of the prompt
+    // Calculates hellaswag score (acc_norm) from prompt
    //
    // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
    // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
    //
    // All 10042 tasks should be extracted to keep the results standardized like other implementations.
    //
    // Datafile layout:
    // ['??'] denotes json fields
    // 6 lines per task:
    // ['activity_label'] + ": " +['ctx']  - The first part of the query, the context
    // ['label'] - The index the best common sense ending aka gold ending
    // ['endings'][0] - Endings added to the first part of the query
    // ['endings'][1]
    // ['endings'][2]
    // ['endings'][3]
    std::vector<std::string> prompt_lines;
    std::istringstream strstream(params.prompt);
@ -132,63 +147,149 @@ void perplexity_lines(llama_context * ctx, const gpt_params & params) {
        prompt_lines.push_back(line);
    }
-    const int n_vocab = llama_n_vocab(ctx);
+    if( prompt_lines.size() % 6 != 0) {
-
+        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
    int counttotal   = 0;
    size_t n_lines = prompt_lines.size();
    double nll = 0.0;
    fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
    printf("\nLine\tPPL line\tPPL cumulative\n");
    for (size_t i = 0; i < n_lines; ++i) {
        // Tokenize and insert BOS at start
        std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
        size_t batch_size  = batch_embd.size();
        // Stop if line is too long
        if( batch_size > (size_t)params.n_ctx ) {
            fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
        return;
    }
-        if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
+    size_t hs_task_count = prompt_lines.size()/6;
    fprintf(stderr, "%s : loaded %lu tasks from prompt.\n", __func__, hs_task_count);
    // This is needed as usual for LLaMA models
    bool prepend_bos = true;
    // Number of tasks to use when computing the score
    if ( params.hellaswag_tasks < hs_task_count  ) {
        hs_task_count = params.hellaswag_tasks;
    }
    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
    // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
    std::mt19937 rng(1);
    // Dataholder for hellaswag tasks
    struct hs_data_t {
        std::string context;
        size_t gold_ending_idx;
        std::string ending[4];
        size_t ending_logprob_count[4];
        double ending_logprob[4];
    };
    fprintf(stderr, "%s : selecting %lu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
    // Select and read data from prompt lines
    hs_data_t *hs_data = new hs_data_t[hs_task_count];
    for (size_t i=0; i < hs_task_count; i++) {
        size_t idx = i;
        // Select a random example of those left in the prompt
        if (randomize_tasks) {
            std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
            idx = dist(rng);
        }
        hs_data[i].context = prompt_lines[idx*6];
        hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
        for (size_t j=0; j < 4; j++) {
            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
        }
        // Delete the selected random example from the prompt
        if (randomize_tasks) {
            prompt_lines.erase( std::next(prompt_lines.begin(),idx*6)  , std::next(prompt_lines.begin(),idx*6+6) );
        }
    }
    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
    printf("\ntask\tacc_norm\n");
    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);
    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
        // Tokenize the context to count tokens
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
        size_t context_size = context_embd.size();
        for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
            // Tokenize the query
            std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
            size_t query_size = query_embd.size();
            // Stop if query wont fit the ctx window
            if (query_size > (size_t)params.n_ctx) {
                fprintf(stderr, "%s : number of tokens in query %lu > n_ctxl\n", __func__, query_size);
                return;
            }
            // Speedup small evaluations by evaluating atleast 32 tokens
            if (query_size < 32) {
                query_embd.resize(32);
            }
            // Evaluate the query
            if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }
-        const auto batch_logits = llama_get_logits(ctx);
+            const auto query_logits = llama_get_logits(ctx);
            std::vector<float> logits;
-        logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
-        double nllline = 0.0;
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
-        int countline = 0;
+            hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
-        // Perplexity over second half of the line
+            // Calculate the logprobs over the ending
-        for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
+            for (size_t j = context_size-1; j < query_size - 1; j++) {
                // Calculate probability of next token, given the previous ones.
                const std::vector<float> tok_logits(
                    logits.begin() + (j + 0) * n_vocab,
                    logits.begin() + (j + 1) * n_vocab);
-            const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
+                const float prob = softmax(tok_logits)[query_embd[ j + 1]];
-            nllline += -std::log(prob);
+                hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
-            ++countline;
+                hs_data[task_idx].ending_logprob_count[ending_idx]++;
            }
-        nll += nllline;
+            // Calculate the mean token logprob for acc_norm
-        counttotal += countline;
+            hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
-        // perplexity is e^(average negative log-likelihood)
+
-        printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
+//            printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
 //                task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
        }
        // Find the ending with maximum logprob
        size_t ending_logprob_max_idx = -1;
        double ending_logprob_max_val = -INFINITY;
        for (size_t j=0; j < 4; j++) {
            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                ending_logprob_max_idx = j;
                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];
            }
        }
 //        printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
        // If the gold ending got the maximum logprobe add one accuracy point
        if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
            acc += 1.0;
        }
        // Print the accumulated accuracy mean x 100
        printf("%li\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
        fflush(stdout);
    }
    delete [] hs_data;
    printf("\n");
 }
@ -240,8 +341,8 @@ int main(int argc, char ** argv) {
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }
-    if (params.perplexity_lines) {
+    if (params.hellaswag) {
-        perplexity_lines(ctx, params);
+        hellaswag_score(ctx, params);
    } else {
        perplexity(ctx, params);
    }
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -26,6 +26,7 @@ int main(int argc, char ** argv) {
    auto lparams = llama_context_default_params();
    lparams.n_ctx     = params.n_ctx;
    lparams.n_gqa     = params.n_gqa;
    lparams.seed      = params.seed;
    lparams.f16_kv    = params.memory_f16;
    lparams.use_mmap  = params.use_mmap;
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@ -0,0 +1,26 @@
 #!/bin/bash
 # Starts ./server from the directory above this script, using settings that can
 # each be overridden through an environment variable of the same name.
 set -e
 cd "$(dirname "$0")/.." || exit
 # Specify the model you want to use here:
 : "${MODEL:=./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
 # NOTE(review): PROMPT_TEMPLATE receives a default but is not referenced anywhere else in this script.
 : "${PROMPT_TEMPLATE:=./prompts/chat-system.txt}"
 # Adjust to the number of CPU cores you want to use.
 : "${N_THREAD:=12}"
 # Note: you can also override the generation options by specifying them on the command line:
 : "${GEN_OPTIONS:=--ctx_size 4096 --batch-size 1024}"
 # Fixed options come first, then whatever was passed to this script, so the
 # caller's arguments appear last on the server command line.
 server_args=(
  --model "$MODEL"
  --threads "$N_THREAD"
  --rope-freq-scale 1.0
  "$@"
 )
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
 ./server $GEN_OPTIONS "${server_args[@]}"
 # I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line.
 # -ngl 1 \
--- a/examples/server/chat-llama2.sh
+++ b/examples/server/chat-llama2.sh
@ -0,0 +1,109 @@
 #!/bin/bash
 # Interactive terminal chat loop that talks to the HTTP endpoints under API_URL
 # (/tokenize and /completion are the two used below).
 # Base URL of the server; override through the API_URL environment variable.
 API_URL="${API_URL:-http://127.0.0.1:8080}"
 # Conversation history. chat_completion appends each question followed by the
 # trimmed answer, so entries alternate user / assistant. Seeded with one exchange.
 CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
 )
 # Instruction text that gets wrapped in <<SYS>> markers by format_prompt and
 # when computing N_KEEP.
 INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
 # Print "$1" with leading and trailing whitespace removed (no trailing newline is added).
 trim() {
    # extglob enables the +(...) "one or more" pattern used in the expansions below.
    shopt -s extglob
    local without_leading="${1##+([[:space:]])}"
    printf "%s" "${without_leading%%+([[:space:]])}"
 }
 # Print "$1" with trailing whitespace removed; leading whitespace is kept.
 trim_trailing() {
    # extglob enables the +(...) "one or more" pattern used in the expansion below.
    shopt -s extglob
    local text="$1"
    printf "%s" "${text%%+([[:space:]])}"
 }
 # Write the prompt text for the next request to stdout (no trailing newline).
 #   $1 - the user's new message; only used when CHAT is non-empty.
 # With an empty CHAT the output is INSTRUCTION wrapped in <<SYS>> markers;
 # otherwise it is the last CHAT entry, a newline, and "$1" wrapped in
 # [INST] ... [/INST].
 format_prompt() {
    local last_index
    if [[ "${#CHAT[@]}" -eq 0 ]]; then
        # printf expands \n in its format string. bash's builtin `echo -n "...\n..."`
        # printed the two characters backslash + n instead of a real newline.
        printf '[INST] <<SYS>>\n%s\n<</SYS>>' "${INSTRUCTION}"
    else
        last_index=$(( ${#CHAT[@]} - 1 ))
        # History and message are passed as %s arguments so their own
        # backslashes or percent signs are emitted untouched.
        printf '%s\n[INST] %s [/INST]' "${CHAT[$last_index]}" "$1"
    fi
 }
 # POST "$1" as {"content": ...} to ${API_URL}/tokenize and print each element
 # of the response's "tokens" array via jq.
 tokenize() {
    local request_body
    # jq builds the JSON document so the text is escaped correctly.
    request_body="$(jq -ns --arg content "$1" '{content:$content}')"
    curl --silent --request POST \
        --url "${API_URL}/tokenize" \
        --header "Content-Type: application/json" \
        --data-raw "${request_body}" \
    | jq '.tokens[]'
 }
 # Count of lines printed by tokenize for the <<SYS>>-wrapped INSTRUCTION; sent
 # as n_keep with every completion request.
 # $'\n' produces real newlines; a "\n" inside double quotes would reach the
 # server as a literal backslash followed by n.
 N_KEEP=$(tokenize "[INST] <<SYS>>"$'\n'"${INSTRUCTION}"$'\n'"<</SYS>>" | wc -l)
 # Send one user message to ${API_URL}/completion, stream the reply to the
 # terminal as it arrives, and append the question and the trimmed reply to CHAT.
 #   $1 - the user's message
 # Sets the globals PROMPT, DATA, TEMPFILE and ANSWER as a side effect.
 chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    # printf '%s' emits the prompt verbatim even if it looks like an echo option.
    # N_KEEP is quoted so it always reaches jq as exactly one argument.
    DATA="$(printf '%s' "$PROMPT" | jq -Rs --argjson n_keep "$N_KEEP" '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 1024,
        stop: ["[INST]"],
        stream: true
    }')"
    # Create a temporary file to hold the Python output
    TEMPFILE=$(mktemp)
    # Open the streaming HTTP response on file descriptor 3 so Python can read
    # it from stdin while its stdout stays attached to the terminal.
    exec 3< <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --header "Content-Type: application/json" \
        --data-raw "${DATA}")
    python -c "
 import json
 import sys
 answer = ''
 while True:
    line = sys.stdin.readline()
    if not line:
        break
    if line.startswith('data: '):
        json_content = line[6:].strip()
        content = json.loads(json_content)['content']
        sys.stdout.write(content)
        sys.stdout.flush()
        answer += content
 answer = answer.rstrip('\n')
 # Write the answer to the temporary file
 with open('$TEMPFILE', 'w') as f:
    f.write(answer)
    " <&3
    exec 3<&-
    # Read the answer from the temporary file (quoted: the path may contain spaces)
    ANSWER=$(cat "$TEMPFILE")
    # Clean up the temporary file
    rm -f "$TEMPFILE"
    printf "\n"
    CHAT+=("$1" "$(trim "$ANSWER")")
 }
 # Main loop: prompt for a question (shown in green), then hand it to
 # chat_completion. Ends when input is exhausted.
 while true; do
    echo -en "\033[0;32m"  # Green color
    # read returns non-zero at end of input (e.g. Ctrl-D). Without this check
    # the loop would spin forever sending empty questions. A final line that
    # lacks a trailing newline still leaves text in QUESTION and is processed.
    if ! read -r -e -p "> " QUESTION && [[ -z "${QUESTION}" ]]; then
        echo -en "\033[0m"  # Reset color
        printf "\n"
        break
    fi
    echo -en "\033[0m"  # Reset color
    chat_completion "${QUESTION}"
 done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -73,6 +73,37 @@
      margin: 0;
    }
    /* Grid layout whose template row names two cells, with a 1em gap between items. */
    fieldset.two {
      display: grid;
      grid-template: "a a";
      gap: 1em;
    }
    /* Same as above, but the template row names three cells. */
    fieldset.three {
      display: grid;
      grid-template: "a a a";
      gap: 1em;
    }
    /* Bordered, rounded box for <details>; bottom padding is 0 here and is
       overridden by the details[open] rule below. */
    details {
      border: 1px solid #aaa;
      border-radius: 4px;
      padding: 0.5em 0.5em 0;
      margin-top: 0.5em;
    }
    /* Negative margins cancel the parent's 0.5em padding so the summary spans
       the full box; its own padding restores the inner spacing. */
    summary {
      font-weight: bold;
      margin: -0.5em -0.5em 0;
      padding: 0.5em;
      cursor: pointer;
    }
    /* When expanded, pad all four sides. */
    details[open] {
      padding: 0.5em;
    }
    textarea {
      padding: 5px;
      flex-grow: 1;
@ -125,10 +156,17 @@
    const params = signal({
      n_predict: 400,
      temperature: 0.7,
-      repeat_last_n: 256,
+      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
-      repeat_penalty: 1.18,
+      repeat_penalty: 1.18, // 1.0 = disabled
-      top_k: 40,
+      top_k: 40, // <= 0 to use vocab size
-      top_p: 0.5,
+      top_p: 0.5, // 1.0 = disabled
      tfs_z: 1.0, // 1.0 = disabled
      typical_p: 1.0, // 1.0 = disabled
      presence_penalty: 0.0, // 0.0 = disabled
      frequency_penalty: 0.0, // 0.0 = disabled
      mirostat: 0, // 0/1/2
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
    })
    const llamaStats = signal(null)
@ -264,6 +302,27 @@
      // Input handlers: each replaces the signal's value with a copy in which the
      // key named by the input element's `name` attribute is overwritten.
      // Stores the raw input value in `session`.
      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
      // Stores the raw input value in `params`.
      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
      // Stores parseFloat(input value) in `params`.
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      // Stores the input value parsed as a float and rounded down with Math.floor in `params`.
      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
      // Labelled range slider for one entry of `params`: `name` is used as both
      // the input's id and its name, `min`/`max`/`step` bound the slider, and the
      // current `value` is echoed next to it. Changes go through updateParamsFloat.
      const FloatField = ({label, max, min, name, step, value}) => html`
          <div>
            <label for="${name}">${label}</label>
            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
            <span>${value}</span>
          </div>
        `;
      // Labelled range slider for one entry of `params`: `name` is used as both
      // the input's id and its name, `min`/`max` bound the slider (no step
      // attribute is set), and the current `value` is echoed next to it.
      // Changes go through updateParamsInt.
      const IntField = ({label, max, min, name, value}) => html`
          <div>
            <label for="${name}">${label}</label>
            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
            <span>${value}</span>
          </div>
        `;
      return html`
        <form>
@ -272,7 +331,9 @@
              <label for="prompt">Prompt</label>
              <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
            </div>
          </fieldset>
          <fieldset class="two">
            <div>
              <label for="user">User name</label>
              <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
@ -282,7 +343,9 @@
              <label for="bot">Bot name</label>
              <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
            </div>
          </fieldset>
          <fieldset>
            <div>
              <label for="template">Prompt template</label>
              <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
@ -292,38 +355,44 @@
              <label for="template">Chat history template</label>
              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
            </div>
            <div>
              <label for="temperature">Temperature</label>
              <input type="range" id="temperature" min="0.0" max="1.0" step="0.01" name="temperature" value="${params.value.temperature}" oninput=${updateParamsFloat} />
              <span>${params.value.temperature}</span>
            </div>
            <div>
              <label for="nPredict">Predictions</label>
              <input type="range" id="nPredict" min="1" max="2048" step="1" name="n_predict" value="${params.value.n_predict}" oninput=${updateParamsFloat} />
              <span>${params.value.n_predict}</span>
            </div>
            <div>
              <label for="repeat_penalty">Penalize repeat sequence</label>
              <input type="range" id="repeat_penalty" min="0.0" max="2.0" step="0.01" name="repeat_penalty" value="${params.value.repeat_penalty}" oninput=${updateParamsFloat} />
              <span>${params.value.repeat_penalty}</span>
            </div>
            <div>
              <label for="repeat_last_n">Consider N tokens for penalize</label>
              <input type="range" id="repeat_last_n" min="0.0" max="2048" name="repeat_last_n" value="${params.value.repeat_last_n}" oninput=${updateParamsFloat} />
              <span>${params.value.repeat_last_n}</span>
            </div>
          </fieldset>
          <fieldset class="two">
            ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
            ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
            ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
            ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
            ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
            ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
          </fieldset>
          <details>
            <summary>More options</summary>
            <fieldset class="two">
              ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
              ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
              ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
              ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
            </fieldset>
            <hr />
            <fieldset class="three">
              <div>
                <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
              </div>
              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
            </fieldset>
          </details>
        </form>
      `
    }
    // poor mans markdown replacement
    const Markdownish = (params) => {
      const md = params.text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
        .replace(/__(.*?)__/g, '<strong>$1</strong>')
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -601,47 +601,49 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                               const server_params &sparams)
 {
-    fprintf(stderr, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "usage: %s [options]\n", argv0);
-    fprintf(stderr, "\n");
+    fprintf(stdout, "\n");
-    fprintf(stderr, "options:\n");
+    fprintf(stdout, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
    {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
    if (llama_mmap_supported())
    {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
 #endif
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
-    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
-    fprintf(stderr, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stderr, "\n");
+    fprintf(stdout, "\n");
 }
 static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -724,9 +726,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
        else if (arg == "-gqa" || arg == "--gqa")
        {
            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
            params.n_gqa = std::stoi(argv[i]);
        }
        else if (arg == "-eps" || arg == "--rms-norm-eps") {
            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
            params.rms_norm_eps = std::stof(argv[i]);
        }
        else if (arg == "--rope-freq-base")
        {
-            if (++i >= argc) {
+            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
@ -734,7 +754,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        else if (arg == "--rope-freq-scale")
        {
-            if (++i >= argc) {
+            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -16,6 +16,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 struct random_normal_distribution {
    std::mt19937 gen;
    std::normal_distribution<float> rd;
@ -439,7 +441,7 @@ struct ggml_tensor * forward(
        // norm
        {
            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
@ -562,7 +564,7 @@ struct ggml_tensor * forward(
            // norm
            {
                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
@ -606,7 +608,7 @@ struct ggml_tensor * forward(
    {
        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
@ -694,7 +696,7 @@ struct ggml_tensor * forward_batch(
        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = attention_norm*cur
@ -857,7 +859,7 @@ struct ggml_tensor * forward_batch(
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);
                // cur = ffn_norm*cur
@ -910,7 +912,7 @@ struct ggml_tensor * forward_batch(
    {
        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);
        // inpL = norm*inpL
@ -979,7 +981,7 @@ struct ggml_tensor * forward_batch_wo_cache(
        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = attention_norm*cur
@ -1085,7 +1087,7 @@ struct ggml_tensor * forward_batch_wo_cache(
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);
                // cur = ffn_norm*cur
@ -1138,7 +1140,7 @@ struct ggml_tensor * forward_batch_wo_cache(
    {
        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);
        // inpL = norm*inpL
@ -1203,7 +1205,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
        // norm
        {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);
            // cur = attention_norm*cur
@ -1267,7 +1269,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
        {
            // norm
            {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);
                // cur = ffn_norm*cur
@ -1311,7 +1313,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
    // norm
    {
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);
        // inpL = norm*inpL
@ -1603,7 +1605,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        struct my_llama_layer & layer = model->layers[il];
        // tensors with values necessary for backward pass are in persistent buf(-1)
        // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur));                                    assert_shape_2d(t02, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                      assert_shape_2d(t02, n_embd, N*n_batch);
        use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02));              assert_shape_2d(t03, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));                               assert_shape_2d(t04, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));                          assert_shape_2d(t05, n_embd, N*n_batch);
@ -1623,7 +1625,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));                 assert_shape_2d(t19, n_embd, N*n_batch);
        use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));                          assert_shape_2d(t20, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add          (ctx0, t20, cur));                               assert_shape_2d(t21, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21));                                    assert_shape_2d(t22, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21, rms_norm_eps));                      assert_shape_2d(t22, n_embd, N*n_batch);
        use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat       (ctx0, layer.ffn_norm, t22));                    assert_shape_2d(t23, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul          (ctx0, t23, t22));                               assert_shape_2d(t24, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat      (ctx0, layer.w3, t24));                          assert_shape_2d(t25, n_ff, N*n_batch);
@ -1666,7 +1668,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
    }
    clr_buf(0);
    use_buf(0);
-    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur));                       assert_shape_2d(t31, n_embd, N*n_batch);
+    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur, rms_norm_eps));         assert_shape_2d(t31, n_embd, N*n_batch);
    struct ggml_tensor * t32   = expand(gf, ggml_repeat    (ctx0, model->norm, t31));          assert_shape_2d(t32, n_embd, N*n_batch);
    struct ggml_tensor * t33   = expand(gf, ggml_mul       (ctx0, t32, t31));                  assert_shape_2d(t33, n_embd, N*n_batch);
    use_buf(-1);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
    }
 }
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    const int tid = threadIdx.x;
    const float eps = 1e-6f;
    float tmp = 0.0f; // partial sum for thread in warp
    for (int col = tid; col < ncols; col += WARP_SIZE) {
@ -1073,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;
    uint16_t q16[8];
    const uint8_t * q4 = (const uint8_t *)q16;
    for (int i = ix; i < num_blocks_per_row; i += 2) {
        const uint8_t * ql1 = x[i].qs + q_offset;
        const uint8_t * ql2 = ql1 + 64;
        const uint8_t * qh  = x[i].qh + l0;
        const float   * y1  = yy + i*QK_K + y_offset;
        const float   * y2  = y1 + 128;
@ -1092,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
        float4 sum = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
        const uint16_t * q1 = (const uint16_t *)ql1;
        const uint16_t * q2 = q1 + 32;
        q16[0] = q1[0] & 0x0f0f;
        q16[1] = q1[8] & 0x0f0f;
        q16[2] = (q1[0] >> 4) & 0x0f0f;
        q16[3] = (q1[8] >> 4) & 0x0f0f;
        q16[4] = q2[0] & 0x0f0f;
        q16[5] = q2[8] & 0x0f0f;
        q16[6] = (q2[0] >> 4) & 0x0f0f;
        q16[7] = (q2[8] >> 4) & 0x0f0f;
        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * ((ql1[l+ 0] >>  4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * ((ql1[l+16] >>  4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * ((ql2[l+ 0] >>  4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * ((ql2[l+16] >>  4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
        }
@ -1554,15 +1564,25 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
 #ifndef GGML_QKK_64
    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
    const float    d = bq4_K->d;
    const float dmin = bq4_K->dmin;
-    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
    const int v1 = q4[0];
    const int v2 = q4[4];
    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
@ -1580,16 +1600,59 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
        const float d8i = bq8i->d;
        const int * q8 = (const int *)bq8i->qs + (iqs%4);
        const int ui1 = q8[0];
        const int ui2 = q8[4];
-        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+        const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
        const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
-        sumf_d += d8i * (__dp4a(vi,         ui, 0) * sc[i]); // SIMD dot product
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
        sumf_d += d8i * (dot1 * sc[i]);
        sumf_m += d8i * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
    }
    return d*sumf_d - dmin*sumf_m;
 #else
    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;
    const uint16_t * a = (const uint16_t *)bq4_K->scales;
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;
    const float dall = bq4_K->d[0];
    const float dmin = bq4_K->d[1];
    const float d8_1 = bq8_1[0].d;
    const float d8_2 = bq8_1[1].d;
    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
    const int * q4 = (const int *)bq4_K->qs + iqs;
    const int v1 = q4[0];
    const int v2 = q4[4];
    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
    return dall * sumf_d - dmin * sumf_m;
 #endif
 #else
    return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@ -1601,7 +1664,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-    const int bq8_offset = QR5_K * (iqs / QI8_1);
+#ifndef GGML_QKK_64
    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
@ -1609,31 +1676,87 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const float    d = bq5_K->d;
    const float dmin = bq5_K->dmin;
-    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+    const int vl1 = ql[0];
    const int vl2 = ql[4];
-    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+    const int vh1 = qh[0] >> bq8_offset;
    const int vh2 = qh[4] >> bq8_offset;
    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;
    for (int i = 0; i < QR5_K; ++i) {
        const int isc = bq8_offset + i;
        uint8_t sc, m;
        get_scale_min_k4(isc, bq5_K->scales, sc, m);
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
        const float d8i = bq8i->d;
        const int * q8 = (const int *)bq8i->qs + (iqs%4);
        const int ui1 = q8[0];
        const int ui2 = q8[4];
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+        const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
        const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
-        const int vih = ((vh >> i) << 4) & 0x10101010;
+        const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
        const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
-        const int vi = vil | vih;
+        const int vi1 = vil1 | vih1;
        const int vi2 = vil2 | vih2;
        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
        sumf_d += d8i * (dot1 * sc[i]);
        sumf_m += d8i * (dot2 * m[i]);
        sumf_d += d8i * (__dp4a(vi,         ui, 0) * sc); // SIMD dot product
        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m);  // multiply constant part of q5_K with sum of q8_1 values
    }
    return d*sumf_d - dmin*sumf_m;
 #else
    const int8_t * s = bq5_K->scales;
    const float d = bq5_K->d;
    const float d8_1 = bq8_1[0].d;
    const float d8_2 = bq8_1[1].d;
    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
    const int * ql = (const int *)bq5_K->qs + iqs;
    const int vl1 = ql[0];
    const int vl2 = ql[4];
    const int step = 4 * iqs; // 0, 4, 8, 12
    const int im = step/8; // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
    const int in = step%8; // 0, 4, 0, 4
    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
    return d * sumf_d;
 #endif
 #else
    return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@ -2074,10 +2197,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
    norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@ -2306,7 +2429,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
    //       kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
    //       is better amortized.
    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
@ -2315,7 +2441,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(1, block_num_y, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
    //       kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
    //       is better amortized.
    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
@ -2822,8 +2951,11 @@ inline void ggml_cuda_op_rms_norm(
    const int64_t ne00 = src0->ne[0];
    const int64_t i01_diff = i01_high - i01_low;
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));
    // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
    (void) src1;
    (void) dst;
@ -3908,18 +4040,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_mul;
            break;
-        case GGML_OP_GELU:
+        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cuda_gelu;
                    break;
-        case GGML_OP_SILU:
+                case GGML_UNARY_OP_SILU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cuda_silu;
                    break;
                default:
                    return false;
            } break;
        case GGML_OP_NORM:
            if (!any_on_device) {
                return false;
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -61,6 +61,13 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 // get data from the device into host memory
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
 void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 // whether the graph has been optimized for concurrent dispatch
 bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -36,6 +36,9 @@ struct ggml_metal_context {
    int n_buffers;
    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
    int concur_list[GGML_MAX_NODES];
    int concur_list_len;
    // custom kernels
 #define GGML_METAL_DECL_KERNEL(name) \
    id<MTLFunction>             function_##name; \
@ -98,6 +101,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue  = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
    ctx->concur_list_len = 0;
    // determine if we can use MPS
    if (MPSSupportsMTLDevice(ctx->device)) {
@ -217,6 +221,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = n_cb;
 }
 // Reports whether a concurrency schedule is currently recorded in the context,
 // i.e. whether ctx->concur_list_len is non-zero.
 bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
    return ctx->concur_list_len != 0;
 }
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@ -355,11 +366,98 @@ void ggml_metal_get_tensor(
    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
 }
 // Try to find operations in the graph that can be issued concurrently and
 // record the resulting schedule in ctx->concur_list.
 //
 // The schedule is a flat list of node indices grouped into levels; every level
 // is terminated by a -1 entry (a barrier). ctx->concur_list_len is the number
 // of entries written (nodes + barriers).
 //
 // ctx->concur_list only has GGML_MAX_NODES slots. If the schedule does not fit,
 // or if no progress can be made on the remaining nodes, nothing is written out
 // of bounds and ctx->concur_list_len is left at a value > GGML_MAX_NODES, which
 // ggml_metal_graph_compute treats as "no usable schedule" (serial dispatch).
 void ggml_metal_graph_find_concurrency(
        struct ggml_metal_context * ctx,
        struct ggml_cgraph * gf) {
    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
    int nodes_unused[GGML_MAX_NODES];

    for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
    for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
    ctx->concur_list_len = 0;

    int n_left = gf->n_nodes;
    int n_start = 0; // all nodes before n_start in the nodes_unused array have been sorted and stored back to ctx->concur_list
    int level_pos = 0;  // in ctx->concur_list, the last layer (level) ends at level_pos

    bool overflow = false; // the schedule does not fit into ctx->concur_list
    bool stalled  = false; // a full pass could not schedule any node

    while (n_left > 0) {
        // number of nodes at a layer (that can be issued concurrently)
        int concurrency = 0;
        const int i_end = (n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth;
        for (int i = n_start; i < i_end; i++) {
            if (!nodes_unused[i]) {
                continue;
            }
            // if the requirements for gf->nodes[i] are satisfied
            int exe_flag = 1;
            // scan all srcs
            for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
                struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
                if (src_cur) {
                    // a leaf node is always satisfied.
                    if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}

                    // otherwise this src should be the output of a node from a previous level.
                    int is_found = 0;
                    // scan 2*search_depth back because we inserted barriers.
                    for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
                        if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
                    }
                    if (is_found == 0) {exe_flag = 0; break;}
                }
            }
            if (exe_flag) {
                // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                int64_t data_start = (int64_t) gf->nodes[i]->data;
                int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
                for (int j = n_start; j < i; j++) {
                    if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
                                        && gf->nodes[j]->op != GGML_OP_VIEW \
                                        && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
                                        && gf->nodes[j]->op != GGML_OP_PERMUTE) {
                        if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
                            ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
                            continue;
                        }
                        // overlapping memory regions: one conflict is enough, stop scanning
                        exe_flag = 0;
                        break;
                    }
                }
            }
            if (exe_flag) {
                // concur_list_len == level_pos + concurrency is the next free slot
                if (ctx->concur_list_len >= GGML_MAX_NODES) {
                    overflow = true;
                    break;
                }
                ctx->concur_list[level_pos + concurrency] = i;
                nodes_unused[i] = 0;
                concurrency++;
                ctx->concur_list_len++;
            }
        }
        if (overflow) {
            break;
        }
        if (concurrency == 0) {
            // nothing could be scheduled although nodes are left: looping again would never terminate
            stalled = true;
            break;
        }
        n_left -= concurrency;
        // add a barrier between different layers
        if (ctx->concur_list_len >= GGML_MAX_NODES) {
            overflow = true;
            break;
        }
        ctx->concur_list[level_pos + concurrency] = -1;
        ctx->concur_list_len++;
        // skip all nodes that are already scheduled, without running past the end of the graph
        while (n_start < gf->n_nodes && !nodes_unused[n_start]) {n_start++;}
        level_pos += concurrency + 1;
    }

    if (stalled) {
        fprintf(stderr, "%s: could not schedule all nodes, concurrency list is not usable\n", __func__);
    }
    if (overflow) {
        fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
    }
    if (stalled || overflow) {
        // mark the schedule as unusable: a length > GGML_MAX_NODES makes the compute path dispatch serially
        ctx->concur_list_len = GGML_MAX_NODES + 1;
    }
 }
 void ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
    metal_printf("%s: evaluating graph\n", __func__);
    // if there is ctx->concur_list, dispatch concurrently
    // else fallback to serial dispatch
    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
    const int n_nodes  = has_concur ? ctx->concur_list_len      : gf->n_nodes;
    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel
@ -378,7 +476,7 @@ void ggml_metal_graph_compute(
    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
        dispatch_async(queue, ^{
            size_t offs_src0 = 0;
@ -390,9 +488,20 @@ void ggml_metal_graph_compute(
            id<MTLComputeCommandEncoder> encoder = nil;
            const int node_start =                                  (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+            const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
            for (int ind = node_start; ind < node_end; ++ind) {
                const int i = has_concur ? ctx->concur_list[ind] : ind;
                if (i == -1) {
                    if (encoder == nil) {
                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                        continue;
                    }
                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
                    continue;
                }
            for (int i = node_start; i < node_end; ++i) {
                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@ -463,7 +572,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_ADD:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            if (ggml_nelements(src1) == ne10) {
@ -484,7 +593,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_MUL:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            if (ggml_nelements(src1) == ne10) {
@ -505,7 +614,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_SCALE:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            const float scale = *(const float *) src1->data;
@ -519,10 +628,12 @@ void ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
-                    case GGML_OP_SILU:
+                    case GGML_OP_UNARY:
                        switch (ggml_get_unary_op(gf->nodes[i])) {
                            case GGML_UNARY_OP_SILU:
                                {
                                    if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                                    }
                                    [encoder setComputePipelineState:ctx->pipeline_silu];
@ -533,10 +644,10 @@ void ggml_metal_graph_compute(
                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
-                    case GGML_OP_RELU:
+                            case GGML_UNARY_OP_RELU:
                                {
                                    if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                                    }
                                    [encoder setComputePipelineState:ctx->pipeline_relu];
@ -547,10 +658,10 @@ void ggml_metal_graph_compute(
                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
-                    case GGML_OP_GELU:
+                            case GGML_UNARY_OP_GELU:
                                {
                                    if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                                    }
                                    [encoder setComputePipelineState:ctx->pipeline_gelu];
@ -561,10 +672,16 @@ void ggml_metal_graph_compute(
                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            default:
                                {
                                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                                    GGML_ASSERT(false);
                                }
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            const int nth = 32;
@ -582,7 +699,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_DIAG_MASK_INF:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            const int n_past = ((int32_t *)(dst->op_params))[0];
@ -645,7 +762,7 @@ void ggml_metal_graph_compute(
                                }
                            } else {
                                if (encoder == nil) {
-                                    encoder = [command_buffer computeCommandEncoder];
+                                    encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                                }
                                int nth0 = 32;
@ -772,7 +889,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_GET_ROWS:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            switch (src0->type) {
@ -801,10 +918,11 @@ void ggml_metal_graph_compute(
                    case GGML_OP_RMS_NORM:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
-                            const float eps = 1e-6f;
+                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));
                            const int nth = 512;
@ -823,7 +941,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_NORM:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            const float eps = 1e-5f;
@ -845,7 +963,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_ALIBI:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            GGML_ASSERT((src0t == GGML_TYPE_F32));
@ -888,7 +1006,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_ROPE:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            const int n_past = ((int32_t *) dst->op_params)[0];
@ -932,7 +1050,7 @@ void ggml_metal_graph_compute(
                    case GGML_OP_CONT:
                        {
                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
                            }
                            const int nth = 32;
@ -979,10 +1097,12 @@ void ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    default:
                        {
                            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
                            GGML_ASSERT(false);
                        }
                }
            }
            if (encoder != nil) {
                [encoder endEncoding];
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -387,87 +387,90 @@ kernel void kernel_rms_norm(
    }
 }
-// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
+// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
-float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
+// il indicates where the q4 quants begin (0 or QK4_0/4)
 // we assume that the yl's have been multiplied with the appropriate scale factor
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
-    float4 acc = 0.f;
+    float2 acc = 0.f;
-    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
-    for (int i = 0; i < 16; i+=2) {
+    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[2] += yl[i +  1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
    }
-    return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
+    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
-// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
+// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
-float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
+// il indicates where the q4 quants begin (0 or QK4_0/4)
 // we assume that the yl's have been multiplied with the appropriate scale factor
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
    float d = qb_curr->d;
    float m = qb_curr->m;
-    float4 acc = 0.f;
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
-    device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
+    float2 acc = 0.f;
-    for (int i = 0; i < 16; i+=2) {
+    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[2] += yl[i +  1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
    }
-    return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
+    return d * (acc[0] + acc[1]) + sumy * m;
 }
 // putting them in the kernel cause a significant performance penalty
 #define N_DST 4 // each SIMD group works on 4 rows
 #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
-template<typename block_q_type>
+//Note: This is a template, but strictly speaking it only applies to
 //      quantizations where the block size is 32. It also does not
 //      guard against the number of rows not being divisible by
 //      N_DST, so this is another explicit assumption of the implementation.
 template<typename block_q_type, int nr, int nsg, int nw>
 void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
                    int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
                    uint2 tgpig, uint tiisg, uint sgitg) {
    const int nb = ne00/QK4_0;
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
-    device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
+    const int first_row = (r0 * nsg + sgitg) * nr;
    device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
    device const float      * y = (device const float      *) src1 + r1*ne10;
-    float4 y_curr[8];       // src1 vector cache
+    float yl[16];       // src1 vector cache
-    float sumf[N_DST]={0.f}, all_sum;
+    float sumf[nr]={0.f};
    thread float * yl=(thread float *)y_curr;
-    // each thread in a SIMD group deals with 1 block.
+    const int ix = tiisg/2;
-    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
+    const int il = 8*(tiisg%2);
    device const float * yb = y + ix * QK4_0 + il;
    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += nw/2) {
        float sumy = 0;
-        for (int i = 0; i < QK4_0 / 4; i++) {
+        for (int i = 0; i < 8; i += 2) {
-            y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
+            sumy += yb[i] + yb[i+1];
-            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
+            yl[i+0] = yb[i+ 0];
            yl[i+1] = yb[i+ 1]/256.f;
            sumy += yb[i+16] + yb[i+17];
            yl[i+8] = yb[i+16]/16.f;
            yl[i+9] = yb[i+17]/4096.f;
        }
-        for (int row = 0; row < N_DST; row++) {
+        for (int row = 0; row < nr; row++) {
-            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
+            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
        }
        }
-    // from now loads two rows every time and 16 blocks per row
+        yb += QK4_0 * 16;
    int ir = tiisg / (N_SIMDWIDTH / 2);
    int ib = tiisg % (N_SIMDWIDTH / 2);
    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
        float sumy = 0;
        for (int i = 0; i < QK4_0 / 4; i++) {
            y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
    }
-        for (int row = 0; row < N_DST; row+=2) {
+    for (int row = 0; row < nr; ++row) {
-            if (nb_start + ib < nb) {
+        const float tot = simd_sum(sumf[row]);
-                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
+        if (tiisg == 0 && first_row + row < ne01) {
-            }
+            dst[r1*ne0 + first_row + row] = tot;
        }
    }
    for (int row = 0; row < N_DST; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
        }
    }
 }
@ -483,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
        uint2 tgpig[[threadgroup_position_in_grid]],
        uint tiisg[[thread_index_in_simdgroup]],
        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+    mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }
 kernel void kernel_mul_mat_q4_1_f32(
@ -497,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
        uint2 tgpig[[threadgroup_position_in_grid]],
        uint tiisg[[thread_index_in_simdgroup]],
        uint sgitg[[simdgroup_index_in_threadgroup]]) {
-     mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
+     mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
 }
 kernel void kernel_mul_mat_f16_f32(
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -208,6 +208,7 @@
 #define GGML_UNUSED(x) (void)(x)
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 #define GGML_ASSERT(x) \
    do { \
@ -330,16 +331,6 @@ extern "C" {
        GGML_OP_ARGMAX,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
        GGML_OP_ABS,
        GGML_OP_SGN,
        GGML_OP_NEG,
        GGML_OP_STEP,
        GGML_OP_TANH,
        GGML_OP_ELU,
        GGML_OP_RELU,
        GGML_OP_GELU,
        GGML_OP_GELU_QUICK,
        GGML_OP_SILU,
        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
@ -378,6 +369,8 @@ extern "C" {
        GGML_OP_WIN_PART,
        GGML_OP_WIN_UNPART,
        GGML_OP_UNARY,
        GGML_OP_MAP_UNARY,
        GGML_OP_MAP_BINARY,
@ -391,6 +384,24 @@ extern "C" {
        GGML_OP_COUNT,
    };
    // Selects the function applied by ggml_unary() / ggml_unary_inplace();
    // ggml_get_unary_op() reports the value associated with a tensor.
    // No explicit values are assigned, so the enumerators count up from 0 in declaration order.
    enum ggml_unary_op {
        GGML_UNARY_OP_ABS,
        GGML_UNARY_OP_SGN,
        GGML_UNARY_OP_NEG,
        GGML_UNARY_OP_STEP,
        GGML_UNARY_OP_TANH,
        GGML_UNARY_OP_ELU,
        GGML_UNARY_OP_RELU,
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
    };
    // Kind tag recorded in the `type` member of struct ggml_object.
    // NOTE(review): presumably distinguishes what the object's memory holds (tensor, graph,
    // or work buffer) — confirm against the allocation code in ggml.c.
    enum ggml_object_type {
        GGML_OBJECT_TENSOR,
        GGML_OBJECT_GRAPH,
        GGML_OBJECT_WORK_BUFFER
    };
    // ggml object
    struct ggml_object {
@ -399,7 +410,9 @@ extern "C" {
        struct ggml_object * next;
-        char padding[8];
+        enum ggml_object_type type;
        char padding[4];
    };
    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
@ -420,7 +433,7 @@ extern "C" {
        enum ggml_op op;
        // op params - allocated as int32_t for alignment
-        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
+        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
        bool is_param;
@ -438,7 +451,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[8];
+        char padding[4];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -459,6 +472,11 @@ extern "C" {
        void * abort_callback_data;
    };
    // next prime after GGML_MAX_NODES
    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
    // a prime larger than GGML_MAX_NODES * 2 (nodes + leafs)
    #define GGML_GRAPH_HASHTABLE_SIZE 8273
    // computation graph
    struct ggml_cgraph {
        int n_nodes;
@ -468,12 +486,16 @@ extern "C" {
        struct ggml_tensor * grads[GGML_MAX_NODES];
        struct ggml_tensor * leafs[GGML_MAX_NODES];
        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
        // performance
        int     perf_runs;
        int64_t perf_cycles;
        int64_t perf_time_us;
    };
    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
    // scratch buffer
    struct ggml_scratch {
        size_t offs;
@ -535,6 +557,7 @@ extern "C" {
    GGML_API const char * ggml_type_name(enum ggml_type type);
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
@ -558,6 +581,7 @@ extern "C" {
    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
@ -617,6 +641,8 @@ extern "C" {
    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
@ -629,6 +655,11 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_dup_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_add(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -853,14 +884,17 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_rms_norm(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
            float                 eps);
    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
            float                 eps);
    // a - x
    // b - dy
    // TODO: update with configurable eps
    GGML_API struct ggml_tensor * ggml_rms_norm_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -952,11 +986,22 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // a -> b, in-place, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // make contiguous
    GGML_API struct ggml_tensor * ggml_cont(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // make contiguous, in-place
    GGML_API struct ggml_tensor * ggml_cont_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // return view(a), b specifies the new shape
    // TODO: when we start computing gradient, make a copy instead of view
    GGML_API struct ggml_tensor * ggml_reshape(
@ -1268,6 +1313,16 @@ extern "C" {
    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
    // Generic unary entry point: `op` (an enum ggml_unary_op value) names the function to apply to `a`.
    // NOTE(review): the result's shape/type and whether it is a new tensor are defined in ggml.c,
    // which is not visible from this header.
    GGML_API struct ggml_tensor * ggml_unary(
            struct ggml_context * ctx,
             struct ggml_tensor * a,
             enum ggml_unary_op op);
    // NOTE(review): presumably the in-place counterpart of ggml_unary (result aliasing `a`),
    // going by the `_inplace` naming used by other declarations in this header — confirm in ggml.c.
    GGML_API struct ggml_tensor * ggml_unary_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        enum ggml_unary_op op);
    GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
@ -1347,11 +1402,17 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);
    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
    // graph allocation in a context
    GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx);
    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API size_t ggml_graph_overhead(void);
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
--- a/grammars/arithmetic.gbnf
+++ b/grammars/arithmetic.gbnf
@ -0,0 +1,6 @@
 # One or more lines of the form `<expr> = <term>`, each terminated by a newline
 root  ::= (expr "=" ws term "\n")+
 # Terms joined by the binary operators - + * / (a flat chain; no precedence levels are encoded)
 expr  ::= term ([-+*/] term)*
 # An identifier, a number, or a parenthesised expression
 term  ::= ident | num | "(" ws expr ")" ws
 # A lower-case letter followed by any number of lower-case letters, digits or underscores
 ident ::= [a-z] [a-z0-9_]* ws
 # One or more decimal digits (no sign, no fraction)
 num   ::= [0-9]+ ws
 # Optional whitespace; ident, num and the parenthesised form of term consume it after themselves
 ws    ::= [ \t\n]*
--- a/grammars/chess.gbnf
+++ b/grammars/chess.gbnf
@ -0,0 +1,13 @@
 # Specifies chess moves as a list in algebraic notation, using PGN conventions
 # Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
 # Each line is "<number>. <move> <move>\n"; at least one numbered line must follow the first
 root    ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
 # any of the three move forms, optionally followed by a single "+" or "#" suffix
 move    ::= (pawn | nonpawn | castle) [+#]?
 # piece type, optional file/rank, optional capture, dest file & rank
 nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
 # optional file & capture, dest file & rank, optional promotion
 pawn    ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
 # "O-O", or "O-O-O" when the optional "-O" is present
 castle  ::= "O-O" "-O"?
--- a/grammars/japanese.gbnf
+++ b/grammars/japanese.gbnf
@ -0,0 +1,7 @@
 # A probably incorrect grammar for Japanese
 # Runs of one or more jp-char, separated by exactly one space, tab or newline
 root        ::= jp-char+ ([ \t\n] jp-char+)*
 # Any single character from one of the four ranges below
 jp-char     ::= hiragana | katakana | punctuation | cjk
 # U+3041 - U+309F
 hiragana    ::= [ぁ-ゟ]
 # U+30A1 - U+30FF
 katakana    ::= [ァ-ヿ]
 # U+3001 - U+303E
 punctuation ::= [、-〾]
 # U+4E00 - U+9FFF
 cjk         ::= [一-鿿]
--- a/grammars/json.gbnf
+++ b/grammars/json.gbnf
@ -0,0 +1,29 @@
 # Grammar for subset of JSON - doesn't support full string or number syntax
 # The top-level value must be an object; arrays and scalars are only accepted nested inside it
 root  ::= object
 # Note: the "null" literal is the only alternative here that is not followed by ws
 value ::= object | array | string | number | boolean | "null"
 # Zero or more comma-separated `string ":" value` members inside braces.
 # No ws is accepted after the closing "}".
 object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
  )? "}"
 # Zero or more comma-separated values inside brackets.
 # No ws is accepted after the closing "]".
 array  ::=
  "[" ws (
            value
    ("," ws value)*
  )? "]"
 # Double-quoted text: any character except '"' and '\', or a backslash escape
 # (one of " \ / b f n r t, or "u" followed by exactly four hex digits)
 string  ::=
  "\"" (
    [^"\\] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws
 # Only plain integers currently
 # (optional leading "-", then one or more digits; no fraction or exponent)
 number  ::= "-"? [0-9]+ ws
 boolean ::= ("true" | "false") ws
 # Optional space: by convention, applied in this grammar after literal chars when allowed
 # Defined recursively: either empty, or one space/tab/newline followed by more ws
 ws ::= ([ \t\n] ws)?
--- a/grammars/list.gbnf
+++ b/grammars/list.gbnf
@ -0,0 +1,4 @@
 # One or more list items, one per line
 root ::= item+
 # Excludes various line break characters
 # An item is "- ", then at least one character that is not in the excluded set, then "\n"
 item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n"
--- a/k_quants.c
+++ b/k_quants.c
@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 // Builds a 256-bit integer vector from two 128-bit halves: `b` becomes the low 128 bits
 // (via _mm256_castsi128_si256) and `a` is inserted as the high 128 bits (lane index 1).
 // The argument order is therefore (high, low), the same as _mm256_set_m128i.
 // NOTE(review): presumably provided for compilers that lack _mm256_set_m128i — confirm.
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 //
 // 2-6 bit quantization in super-blocks
 //
@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
        __m256i sumi = _mm256_setzero_si256();
@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
        // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        }
        // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
    }
@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        summs += dmin * smin;
        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@ -1666,6 +1668,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc) + summs;
 #elif defined __AVX__
    // AVX-without-AVX2 path: all integer arithmetic is done on 128-bit vectors; 256-bit
    // registers are used only to convert the int32 partial sums to float and accumulate them.
    // byte mask selecting the low 2 bits of every byte
    const __m128i m3 = _mm_set1_epi8(3);
    __m256 acc = _mm256_setzero_ps();
    // ud / um receive the low / high nibbles of four scale bytes; db / mb read them one byte at a time
    uint32_t ud, um;
    const uint8_t * restrict db = (const uint8_t *)&ud;
    const uint8_t * restrict mb = (const uint8_t *)&um;
    // scalar accumulator for the dmin-weighted term
    float summs = 0;
    // TODO: optimize this
    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
        // the minus sign is folded into dmin, so summs is simply added to the result at the end
        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
        const uint8_t * restrict q2 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        // read the four scale bytes as one 32-bit word and split each byte into its two nibbles
        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
        ud = (sc[0] >> 0) & 0x0f0f0f0f;
        um = (sc[0] >> 4) & 0x0f0f0f0f;
        // high nibbles (mb) weight y[i].bsums[0..3]
        int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
        summs += dmin * smin;
        // 16 bytes, each holding four 2-bit values; shifts by 0/2/4/6 plus the mask extract them.
        // The shifts act on 16-bit lanes, but the per-byte mask discards any bits that crossed a byte boundary.
        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
        const __m128i q2_0 = _mm_and_si128(q2bits, m3);
        const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
        const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
        const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
        // 64 signed bytes of y, consumed below as four 16-byte quarters
        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
        // unsigned(q2) * signed(q8), adjacent products summed into 8 x int16 per quarter
        const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
        const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
        const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
        const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
        // widen the 8 x int16 to 8 x int32: lower four go to the low lane, upper four to the high lane
        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
        // quarter k is weighted by d times the low nibble of scale byte k
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
    }
    // horizontal sum of the 8 float lanes plus the (already negated) mins term
    *s = hsum_float_8(acc) + summs;
 #else
    float sumf = 0;
@ -1861,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
        // high bit
        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@ -2072,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        }
        // multiply with block scale and accumulate
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
    }
@ -2247,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        aux16[0] = a & 0x0f0f;
        aux16[1] = (a >> 4) & 0x0f0f;
-        const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
        memcpy(&aux64, x[i].hmask, 8);
        const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
        __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
        q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
        q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@ -2262,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
        // prepare low and high bits
-        const __m256i q3aux  = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+        const __m256i q3aux  = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
        const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
        const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
@ -2295,6 +2353,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __AVX__
    // AVX-without-AVX2 path: integer arithmetic is done on 128-bit vectors; a 256-bit register
    // is used only for the final int32 -> float conversion and accumulation.
    // byte masks: low 2 bits / low 1 bit of every byte
    const __m128i m3 = _mm_set1_epi8(3);
    const __m128i m1 = _mm_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();
    uint64_t aux64;
    // aux16 holds the unpacked 4-bit scales; aux8 reads them as four individual bytes
    uint16_t aux16[2];
    const int8_t * aux8 = (const int8_t *)aux16;
    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
        const uint8_t * restrict q3 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        // two scale bytes -> four nibbles: aux16[0] keeps the low nibbles, aux16[1] the high nibbles
        const uint16_t a = *(const uint16_t *)x[i].scales;
        aux16[0] = a & 0x0f0f;
        aux16[1] = (a >> 4) & 0x0f0f;
        // each nibble has 8 subtracted before use.
        // On little-endian x86 aux8 = {byte0.lo, byte1.lo, byte0.hi, byte1.hi}; the quarters 0..3
        // processed below are paired with aux8[0], aux8[2], aux8[1], aux8[3] respectively.
        const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
        const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
        const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
        const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
        // 8 bytes of high-bit mask; low half = mask, high half = mask >> 1, then further
        // shifts by 2/4/6 so that bit 0 of every byte carries the high bit for one quarter
        memcpy(&aux64, x[i].hmask, 8);
        __m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
        __m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
        __m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
        __m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
        // (~bit0 & 1) << 2: each byte becomes 4 where the mask bit is clear and 0 where it is set
        q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
        q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
        q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
        q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
        // load low 2 bits
        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
        // split every byte into its four 2-bit fields (shift by 0/2/4/6, then mask per byte)
        const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
        const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
        const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
        const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
        // load Q8 quants
        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
        // Dot product: the 2-bit low part and the high-bit part are multiplied with q8 separately so that
        // _mm_maddubs_epi16 (unsigned x signed) can be used, and the high-bit product is then subtracted.
        // q3h_k is 4 where the hmask bit is clear and 0 where it is set, so the subtraction lowers the
        // 2-bit value by 4 exactly when the high bit is absent.
        const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
        const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
        const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
        const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
        __m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
        __m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
        __m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
        __m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
        // multiply with scales
        // (_mm_madd_epi16 also sums adjacent products, so p16_k now hold 4 x int32 despite the name)
        p16_0 = _mm_madd_epi16(scale_0, p16_0);
        p16_1 = _mm_madd_epi16(scale_1, p16_1);
        p16_2 = _mm_madd_epi16(scale_2, p16_2);
        p16_3 = _mm_madd_epi16(scale_3, p16_3);
        // fold four quarters into two so they fit one 256-bit vector (p16_0 low lane, p16_1 high lane)
        p16_0 = _mm_add_epi32(p16_0, p16_2);
        p16_1 = _mm_add_epi32(p16_1, p16_3);
        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
        // multiply with block scale and accumulate
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
    }
    // horizontal sum of the 8 float lanes
    *s = hsum_float_8(acc);
 #else
    int8_t  aux8[QK_K];
@ -2477,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
        __m256i sumi = _mm256_setzero_si256();
@ -2584,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        }
        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
    }
@ -2781,6 +2926,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc) - summs;
 #elif defined __AVX__
    // AVX-without-AVX2 path: 256-bit registers are used for loads and float accumulation only;
    // the integer multiplies run on 128-bit halves obtained with _mm256_extractf128_si256.
    // byte mask selecting the low nibble of every byte
    const __m128i m4 = _mm_set1_epi8(0xF);
    __m256 acc = _mm256_setzero_ps();
    // scalar accumulator for the term that is subtracted from the result at the end
    float summs = 0;
    // aux16 holds the unpacked nibbles of two scale bytes; `scales` reads them as four bytes
    uint16_t aux16[2];
    const uint8_t * scales = (const uint8_t *)aux16;
    for (int i = 0; i < nb; ++i) {
        // d[0] scales the quant dot products, d[1] scales the bsums term
        const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
        const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
        const __m256 vd = _mm256_set1_ps(d);
        // low nibbles of the first two scale bytes -> scales[0], scales[1];
        // high nibbles -> scales[2], scales[3] (byte order as on little-endian x86)
        const uint16_t * a = (const uint16_t *)x[i].scales;
        aux16[0] = a[0] & 0x0f0f;
        aux16[1] = (a[0] >> 4) & 0x0f0f;
        // scales[2] / scales[3] weight the first / second pair of y[i].bsums entries
        summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
        const uint8_t * restrict q4 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        // 32 bytes of packed nibbles, split into two 16-byte halves
        const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
        const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
        const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
        // q4_0 / q4_1: low nibbles of each half; q4_2 / q4_3: high nibbles of each half
        const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
        const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
        const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
        const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
        // 64 signed bytes of y: low nibbles pair with q8[0..31], high nibbles with q8[32..63]
        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
        // unsigned(q4) * signed(q8), adjacent products summed into 8 x int16
        const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
        const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
        const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
        const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
        // low-nibble products are weighted by scales[0]; madd also sums pairs into 4 x int32
        const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
        const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
        // high-nibble products are weighted by scales[1]
        const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
        const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
    }
    // horizontal sum of the 8 float lanes minus the accumulated bsums term
    *s = hsum_float_8(acc) - summs;
 #else
    uint8_t aux8[QK_K];
@ -2963,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        summs += dmin * _mm_extract_epi32(hsum, 0);
        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
        __m256i hmask = mone;
@ -3102,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        }
        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
    }
@ -3265,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
-        const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
        int64_t aux64;
        memcpy(&aux64, x[i].qh, 8);
        const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
        const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
        const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@ -3295,10 +3494,66 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __AVX__
    // AVX-without-AVX2 path: 256-bit registers are used for loads and float accumulation only;
    // the integer multiplies run on 128-bit halves obtained with _mm256_extractf128_si256.
    // byte masks: low nibble / low bit of every byte
    const __m128i m4 = _mm_set1_epi8(0xF);
    const __m128i mone  = _mm_set1_epi8(1);
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const uint8_t * restrict q5 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
        // 32 bytes of packed nibbles (the low 4 bits of the quants)
        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
        // one scale per 16-element quarter, broadcast to 16-bit lanes for _mm_madd_epi16
        const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
        const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
        const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
        const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
        // 8 bytes of high-bit data; low half = qh, high half = qh >> 1, then shifts by 2/4/6
        // bring the bit for each quarter to bit 0 of every byte
        int64_t aux64;
        memcpy(&aux64, x[i].qh, 8);
        const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
        const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
        // (~bit0 & 1) << 4: each byte becomes 16 where the qh bit is clear and 0 where it is set
        const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
        const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
        const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
        const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
        // q5l_0 / q5l_1: low nibbles of each 16-byte half; q5l_2 / q5l_3: high nibbles of each half
        const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
        const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
        const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
        const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
        // 64 signed bytes of y, consumed as four 16-byte quarters
        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
        // low-nibble part: maddubs gives 8 x int16 pair sums, madd applies the scale and yields 4 x int32
        const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
        const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
        const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
        const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
        // high-bit part, computed the same way from the 16-or-0 bytes
        const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
        const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
        const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
        const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
        // low part minus high-bit part (i.e. 16 is subtracted from a quant when its qh bit is clear);
        // quarters 0+2 and 1+3 are folded together so the result fits one 256-bit vector
        const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
        const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
        // convert to float, apply the block scale and accumulate
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
    }
    // horizontal sum of the 8 float lanes
    *s = hsum_float_8(acc);
 #else
-
+    int8_t aux8[QK_K];
    uint8_t aux8[QK_K];
    int16_t aux16[16];
    float   sums [8];
    memset(sums, 0, 8*sizeof(float));
@ -3308,7 +3563,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        const uint8_t * restrict q4 = x[i].qs;
        const uint8_t * restrict hm = x[i].qh;
        const  int8_t * restrict q8 = y[i].qs;
-        uint8_t * restrict a = aux8;
+        int8_t * restrict a = aux8;
        for (int l = 0; l < 32; ++l) {
            a[l+ 0] = q4[l] & 0xF;
            a[l+32] = q4[l]  >> 4;
@ -3672,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
        }
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
    }
@ -3830,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
        const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
        const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@ -3858,6 +4113,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __AVX__
    const __m128i m4 = _mm_set1_epi8(0xF);
    const __m128i m2 = _mm_set1_epi8(3);
    const __m128i m32s = _mm_set1_epi8(32);
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
        const uint8_t * restrict q4 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
        const int8_t  * restrict q8 = y[i].qs;
        const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
        const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
        const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
        const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
        __m128i sumi_0 = _mm_setzero_si128();
        __m128i sumi_1 = _mm_setzero_si128();
        const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
        const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
        const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
        const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
        const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
        const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
        const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
        const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
        const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
        const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
        __m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
        __m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
        __m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
        __m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
        __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
        __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
        __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
        __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
        p16_0 = _mm_sub_epi16(p16_0, q8s_0);
        p16_1 = _mm_sub_epi16(p16_1, q8s_1);
        p16_2 = _mm_sub_epi16(p16_2, q8s_2);
        p16_3 = _mm_sub_epi16(p16_3, q8s_3);
        p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
        p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
        p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
        p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
        sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
        sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
    }
    *s = hsum_float_8(acc);
 #else
    int8_t  aux8[QK_K];
--- a/llama.cpp
+++ b/llama.cpp
@ -186,6 +186,7 @@ struct llama_hparams {
    // LLaMAv2
    // TODO: load from model data hparams
    float f_ffn_mult = 1.0f;
    float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
    float rope_freq_base  = 10000.0f;
    float rope_freq_scale = 1.0f;
@ -869,6 +870,7 @@ struct llama_context_params llama_context_default_params() {
        /*.n_ctx                       =*/ 512,
        /*.n_batch                     =*/ 512,
        /*.n_gqa                       =*/ 1,
        /*.rms_norm_eps                =*/ LLAMA_DEFAULT_RMS_EPS,
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
@ -1000,6 +1002,7 @@ static void llama_model_load_internal(
        int n_ctx,
        int n_batch,
        int n_gqa,
        float rms_norm_eps,
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
@ -1024,6 +1027,9 @@ static void llama_model_load_internal(
    auto & hparams = model.hparams;
    // TODO: read from file
    hparams.f_rms_norm_eps = rms_norm_eps;
    {
        switch (hparams.n_layer) {
            case 26: model.type = e_model::MODEL_3B; break;
@ -1072,6 +1078,7 @@ static void llama_model_load_internal(
        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
        fprintf(stderr, "%s: n_gqa      = %u\n",   __func__, hparams.n_gqa());
        fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
@ -1330,6 +1337,7 @@ static bool llama_model_load(
        int n_ctx,
        int n_batch,
        int n_gqa,
        float rms_norm_eps,
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
@ -1343,7 +1351,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
@ -1396,10 +1404,12 @@ static bool llama_eval_internal(
    const int64_t n_vocab     = hparams.n_vocab;
    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
    LLAMA_ASSERT(n_embd_head == hparams.n_rot);
    const float freq_base  = hparams.rope_freq_base;
    const float freq_scale = hparams.rope_freq_scale;
    const float rms_norm_eps = hparams.f_rms_norm_eps;
    const int n_gpu_layers = model.n_gpu_layers;
@ -1414,7 +1424,7 @@ static bool llama_eval_internal(
    struct ggml_context * ctx0 = ggml_init(params);
-    ggml_cgraph gf = {};
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@ -1479,7 +1489,7 @@ static bool llama_eval_internal(
        // norm
        {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            offload_func(cur);
            ggml_set_name(cur, "rms_norm_0");
@ -1531,8 +1541,8 @@ static bool llama_eval_internal(
                ggml_set_name(v, "v");
                // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
            }
            struct ggml_tensor * Q =
@ -1627,7 +1637,7 @@ static bool llama_eval_internal(
        {
            // norm
            {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                offload_func(cur);
                ggml_set_name(cur, "rms_norm_1");
@ -1680,7 +1690,7 @@ static bool llama_eval_internal(
    // norm
    {
-        cur = ggml_rms_norm(ctx0, inpL);
+        cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        offload_func_nr(cur);
        ggml_set_name(cur, "rms_norm_2");
@ -1702,16 +1712,22 @@ static bool llama_eval_internal(
    //cur = ggml_soft_max_inplace(ctx0, cur);
    // run the computation
-    ggml_build_forward_expand(&gf, cur);
+    ggml_build_forward_expand(gf, cur);
    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 #ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        // TODO: disabled until #2413 is resolved
        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
        //}
        ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
-        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+        ggml_metal_graph_compute(lctx.ctx_metal, gf);
        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
    } else {
        // IMPORTANT:
@ -1730,34 +1746,34 @@ static bool llama_eval_internal(
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
        }
-        ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
    }
 #else
-    ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+    ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
 #endif
 #if GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
    // update kv token count
    lctx.kv_self.n = n_past + N;
-    struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
    if (cgraph_fname) {
-        ggml_graph_export(&gf, cgraph_fname);
+        ggml_graph_export(gf, cgraph_fname);
    }
 #ifdef GGML_PERF
    // print timing information per ggml operation (for debugging purposes)
    // requires GGML_PERF to be defined
-    ggml_graph_print(&gf);
+    ggml_graph_print(gf);
 #endif
    // plot the computation graph in dot format (for debugging purposes)
    //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+    //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
    //}
    // extract logits
@ -1908,7 +1924,9 @@ struct llama_tokenizer {
            if (token == vocab_.token_to_id.end()) {
                // output any symbols that did not form tokens as bytes.
                for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                    output.push_back(token_id);
                }
            } else {
@ -1965,6 +1983,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
    return output;
 }
 //
 // grammar - internal
 //
 // Internal state of a grammar-constrained sampler.
 // NOTE: instances are aggregate-initialized as { rules, stacks }; keep the member order.
 struct llama_grammar {
    // rule definitions, indexed by rule id; each rule is a sequence of elements
    // terminated by a LLAMA_GRETYPE_END element
    const std::vector<std::vector<llama_grammar_element>>   rules;
    // the set of pushdown stacks that are currently possible; each stack holds pointers
    // to the grammar elements still to be matched (top of stack is back()).
    // an empty stack means the grammar has been fully matched along that path
    std::vector<std::vector<const llama_grammar_element *>> stacks;
 };
 // A sampling candidate as seen by the grammar: the token's position in the candidates
 // array plus the part of its text that has not been matched yet.
 // NOTE: instances are brace-initialized as { index, code_points }; keep the member order.
 struct llama_grammar_candidate {
    // index of the token in the llama_token_data_array being filtered
    size_t           index;
    // remaining code points of the token, terminated by a 0 code point (not owned)
    const uint32_t * code_points;
 };
 // Decodes a NUL-terminated UTF-8 string into a list of code points.
 // NOTE: the input is assumed to be valid UTF-8; the only safety net is that a multi-byte
 //       sequence cut short by the terminating NUL is not read past that NUL
 // A 0 code point is appended so that the result's data() can be walked like a C string
 std::vector<uint32_t> decode_utf8(const char * src) {
    // sequence length in bytes, indexed by the high nibble of the lead byte
    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    std::vector<uint32_t> result;
    const char          * cur      = src;
    while (*cur != 0) {
        const uint8_t lead    = static_cast<uint8_t>(*cur);
        const int     n_bytes = lookup[lead >> 4];
        // keep only the payload bits of the lead byte
        const uint8_t mask    = (1 << (8 - n_bytes)) - 1;
        uint32_t      cp      = lead & mask;
        // nominal end of this sequence; may lie beyond the NUL, hence the extra check below
        const char * const stop = cur + n_bytes;
        for (++cur; cur < stop && *cur != 0; ++cur) {
            // each continuation byte contributes its low 6 bits
            cp = (cp << 6) + (static_cast<uint8_t>(*cur) & 0x3F);
        }
        result.push_back(cp);
    }
    result.push_back(0);
    return result;
 }
 // tells whether pos sits on a terminator of one of a rule's definitions: either the end
 // of the whole rule (END) or the marker that starts the next alternate (ALT)
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
    return pos->type == LLAMA_GRETYPE_END || pos->type == LLAMA_GRETYPE_ALT;
 }
 // tests chr against the char range that starts at pos (regular [...] or inverse [^...])
 // pos must point to a LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_NOT element (asserted)
 // returns { true iff chr satisfies the range, pointer to the element just past the range }
 static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
        const llama_grammar_element * pos,
        const uint32_t                chr) {
    const bool positive = pos->type == LLAMA_GRETYPE_CHAR;
    LLAMA_ASSERT(positive || pos->type == LLAMA_GRETYPE_CHAR_NOT);

    bool hit = false;
    for (;;) {
        const bool is_range = pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER;
        if (is_range) {
            // inclusive range, e.g. [a-z]: this element is the lower bound, the next the upper
            if (!hit) {
                hit = pos->value <= chr && chr <= pos[1].value;
            }
            pos += 2;
        } else {
            // exact char match, e.g. [a] or "a"
            if (!hit) {
                hit = pos->value == chr;
            }
            pos += 1;
        }
        // the range continues only while further alternates follow
        if (pos->type != LLAMA_GRETYPE_CHAR_ALT) {
            break;
        }
    }

    // for an inverse range the result is flipped: chr matches iff none of the items hit
    return std::make_pair(hit == positive, pos);
 }
 // expands a grammar pushdown stack into every stack reachable from it whose top is a
 // character range (terminal element), appending the results to new_stacks
 // an empty stack is passed through unchanged; rule references on top are replaced,
 // recursively, by each alternate of the referenced rule
 static void llama_grammar_advance_stack(
        const std::vector<std::vector<llama_grammar_element>>   & rules,
        const std::vector<const llama_grammar_element *>        & stack,
        std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
    if (stack.empty()) {
        new_stacks.push_back(stack);
        return;
    }

    const llama_grammar_element * top = stack.back();

    if (top->type == LLAMA_GRETYPE_CHAR || top->type == LLAMA_GRETYPE_CHAR_NOT) {
        // already positioned at a terminal
        new_stacks.push_back(stack);
        return;
    }

    if (top->type != LLAMA_GRETYPE_RULE_REF) {
        // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
        // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
        // those
        LLAMA_ASSERT(false);
        return;
    }

    const size_t                  rule_id = static_cast<size_t>(top->value);
    const llama_grammar_element * alt     = rules[rule_id].data();

    // whether something follows the rule ref in its own sequence; same for every alternate
    const bool has_follow = !llama_grammar_is_end_of_sequence(top + 1);

    for (;;) {
        // start from the stack without the rule ref on top
        std::vector<const llama_grammar_element *> expanded(stack.begin(), stack.end() - 1);
        if (has_follow) {
            // resume after the rule ref once the referenced rule is done
            expanded.push_back(top + 1);
        }
        if (!llama_grammar_is_end_of_sequence(alt)) {
            // nonempty alternate: its first element becomes the new top
            expanded.push_back(alt);
        }
        llama_grammar_advance_stack(rules, expanded, new_stacks);

        // skip over the rest of this alternate
        while (!llama_grammar_is_end_of_sequence(alt)) {
            ++alt;
        }
        if (alt->type != LLAMA_GRETYPE_ALT) {
            // reached the end of the rule
            break;
        }
        // step past the ALT marker onto the next alternate
        ++alt;
    }
 }
 // feeds one character to a set of pushdown stacks and returns the stacks that remain
 // possible afterwards
 // every input stack must be empty or positioned at a character range (see
 // `llama_grammar_advance_stack`); stacks that are empty or do not accept chr are dropped
 static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const uint32_t                                                  chr) {
    std::vector<std::vector<const llama_grammar_element *>> result;

    for (const auto & stack : stacks) {
        if (stack.empty()) {
            continue;
        }

        const auto match = llama_grammar_match_char(stack.back(), chr);
        if (!match.first) {
            continue;
        }

        // pop the matched range and, if the sequence goes on, push what follows it
        std::vector<const llama_grammar_element *> advanced(stack.begin(), stack.end() - 1);
        const llama_grammar_element * next = match.second;
        if (!llama_grammar_is_end_of_sequence(next)) {
            advanced.push_back(next);
        }
        llama_grammar_advance_stack(rules, advanced, result);
    }

    return result;
 }
 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const std::vector<llama_grammar_candidate>                    & candidates);
 // returns the candidates that cannot be produced starting from the given stack
 // the first code point of each candidate is tested against the top of the stack; those
 // that pass and have more code points left are re-tested, one code point further, against
 // the stacks reachable after the top element
 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
        const std::vector<std::vector<llama_grammar_element>> & rules,
        const std::vector<const llama_grammar_element *>      & stack,
        const std::vector<llama_grammar_candidate>            & candidates) {
    if (stack.empty()) {
        // accept nothing; EOS is handled elsewhere
        return std::vector<llama_grammar_candidate>(candidates.begin(), candidates.end());
    }

    const llama_grammar_element * top = stack.back();

    std::vector<llama_grammar_candidate> rejected;
    std::vector<llama_grammar_candidate> pending; // matched so far, with code points left

    for (const auto & cand : candidates) {
        const bool matched = llama_grammar_match_char(top, cand.code_points[0]).first;
        if (!matched) {
            rejected.push_back(cand);
            continue;
        }
        if (cand.code_points[1] != 0) {
            pending.push_back({ cand.index, cand.code_points + 1 });
        }
    }

    // only the returned position matters here; the character passed is irrelevant
    const llama_grammar_element * after = llama_grammar_match_char(top, 0).second;

    // replace the top of the stack by the element following the char range, if any
    std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
    if (!llama_grammar_is_end_of_sequence(after)) {
        stack_after.push_back(after);
    }

    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
    llama_grammar_advance_stack(rules, stack_after, next_stacks);

    for (const auto & cand : llama_grammar_reject_candidates(rules, next_stacks, pending)) {
        // undo the one-code-point advance so the caller sees its own position again
        rejected.push_back({ cand.index, cand.code_points - 1 });
    }

    return rejected;
 }
 // returns the candidates that are rejected by every one of the given stacks
 // the reject list of the first stack is narrowed by each further stack, so a candidate
 // accepted by any stack drops out of the result
 static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const std::vector<llama_grammar_candidate>                    & candidates) {
    LLAMA_ASSERT(!stacks.empty()); // REVIEW

    if (candidates.empty()) {
        return {};
    }

    std::vector<llama_grammar_candidate> rejects =
        llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);

    for (auto it = stacks.begin() + 1; it != stacks.end(); ++it) {
        rejects = llama_grammar_reject_candidates_for_stack(rules, *it, rejects);
    }

    return rejects;
 }
 //
 // grammar - external
 //
 // Builds a grammar object from an array of rule definitions.
 //   rules            - n_rules pointers, each to the elements of one rule, terminated by
 //                      a LLAMA_GRETYPE_END element
 //   n_rules          - number of entries in rules
 //   start_rule_index - index of the rule that matching starts from
 // The rule definitions are copied and all pointers held by the returned grammar refer to
 // that copy, so the grammar does not depend on the lifetime of the caller's arrays.
 // The result is heap-allocated; release it with llama_grammar_free().
 struct llama_grammar * llama_grammar_init(
            const llama_grammar_element ** rules,
                                 size_t    n_rules,
                                 size_t    start_rule_index) {
    const llama_grammar_element * pos;

    // copy rule definitions into vectors
    std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
    for (size_t i = 0; i < n_rules; i++) {
        for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
            vec_rules[i].push_back(*pos);
        }
        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
    }

    // loop over alternates of start rule to build initial stacks
    // NOTE: walk the copied start rule rather than the caller's array - the stacks store
    //       these pointers, and they must stay valid for as long as the grammar lives.
    //       moving vec_rules into the grammar below keeps the element buffers in place
    std::vector<std::vector<const llama_grammar_element *>> stacks;
    pos = vec_rules[start_rule_index].data();
    do {
        std::vector<const llama_grammar_element *> stack;
        if (!llama_grammar_is_end_of_sequence(pos)) {
            // if alternate is nonempty, add to stack
            stack.push_back(pos);
        }
        llama_grammar_advance_stack(vec_rules, stack, stacks);
        while (!llama_grammar_is_end_of_sequence(pos)) {
            // scan to end of alternate def
            pos++;
        }
        if (pos->type == LLAMA_GRETYPE_ALT) {
            // there's another alternate def of this rule to process
            pos++;
        } else {
            break;
        }
    } while (true);

    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
 }
 // Destroys a grammar created by llama_grammar_init (which allocates it with `new`).
 void llama_grammar_free(struct llama_grammar * grammar) {
    delete grammar;
 }
 //
 // sampling
 //
@ -2250,6 +2541,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
    }
 }
 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
    assert(ctx);
    const int64_t t_start_sample_us = ggml_time_us();
    bool allow_eos = false;
    for (const auto & stack : grammar->stacks) {
        if (stack.empty()) {
            allow_eos = true;
            break;
        }
    }
    const llama_token eos = llama_token_eos();
    std::vector<std::vector<uint32_t>>   candidates_decoded;
    std::vector<llama_grammar_candidate> candidates_grammar;
    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token id  = candidates->data[i].id;
        const char *      str = llama_token_to_str(ctx, id);
        if (id == eos) {
            if (!allow_eos) {
                candidates->data[i].logit = -INFINITY;
            }
        } else if (*str == 0) {
            candidates->data[i].logit = -INFINITY;
        } else {
            candidates_decoded.push_back(decode_utf8(str));
            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
        }
    }
    const auto rejects =
        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
    for (auto & reject : rejects) {
        candidates->data[reject.index].logit = -INFINITY;
    }
    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 static void llama_log_softmax(float * array, size_t size) {
    float max_l = *std::max_element(array, array + size);
    float sum = 0.f;
@ -2425,6 +2757,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
    return result;
 }
 // Advances the grammar state by the text of a sampled token.
 // EOS is accepted only if one of the stacks is empty, and leaves the stacks as they are;
 // any other token is decoded and fed to the grammar one code point at a time.
 // Asserts if the token is not accepted by the grammar.
 // NOTE: the EOS path returns before ctx->t_sample_us is updated
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
    const int64_t t_start_sample_us = ggml_time_us();

    if (token == llama_token_eos()) {
        bool can_end = false;
        for (const auto & stack : grammar->stacks) {
            if (stack.empty()) {
                can_end = true;
                break;
            }
        }
        if (can_end) {
            return;
        }
        LLAMA_ASSERT(false);
    }

    const std::vector<uint32_t> code_points = decode_utf8(llama_token_to_str(ctx, token));

    // the last entry is the terminating 0 added by decode_utf8 - do not feed it to the grammar
    for (size_t i = 0; i + 1 < code_points.size(); ++i) {
        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, code_points[i]);
    }
    LLAMA_ASSERT(!grammar->stacks.empty());

    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 //
 // quantization
 //
@ -2747,7 +3102,7 @@ struct llama_model * llama_load_model_from_file(
    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                params.progress_callback_user_data)) {
@ -2825,7 +3180,7 @@ struct llama_context * llama_new_context_with_model(
            ctx->embedding.resize(hparams.n_embd);
        }
-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
@ -3310,7 +3665,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
        const auto & kv_self = ctx->kv_self;
        const auto & hparams = ctx->model.hparams;
        const int    n_layer = hparams.n_layer;
-        const int    n_embd  = hparams.n_embd;
+        const int    n_embd  = hparams.n_embd_gqa();
        const int    n_ctx   = hparams.n_ctx;
        const size_t kv_size = kv_self.buf.size;
@ -3413,7 +3768,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        const auto & kv_self = ctx->kv_self;
        const auto & hparams = ctx->model.hparams;
        const int    n_layer = hparams.n_layer;
-        const int    n_embd  = hparams.n_embd;
+        const int    n_embd  = hparams.n_embd_gqa();
        const int    n_ctx   = hparams.n_ctx;
        size_t kv_size;
--- a/llama.h
+++ b/llama.h
@ -53,6 +53,10 @@
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 #ifndef LLAMA_DEFAULT_RMS_EPS
 #define LLAMA_DEFAULT_RMS_EPS 5e-6f
 #endif
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -87,6 +91,7 @@ extern "C" {
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
@ -141,6 +146,40 @@ extern "C" {
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;
    // grammar types
    struct llama_grammar;
    // grammar element type
    // a rule is a flat sequence of elements: alternates are separated by ALT elements and
    // the whole rule is terminated by an END element
    enum llama_gretype {
        // end of rule definition
        LLAMA_GRETYPE_END            = 0,
        // start of alternate definition for rule
        LLAMA_GRETYPE_ALT            = 1,
        // non-terminal element: reference to rule (value is the rule ID)
        LLAMA_GRETYPE_RULE_REF       = 2,
        // terminal element: character (code point)
        LLAMA_GRETYPE_CHAR           = 3,
        // inverse char(s) ([^a], [^a-b] [^abc])
        LLAMA_GRETYPE_CHAR_NOT       = 4,
        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
        // be an inclusive range ([a-z]); value is the upper bound of the range
        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
        // modifies a preceding LLAMA_GRETYPE_CHAR or
        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
        LLAMA_GRETYPE_CHAR_ALT       = 6,
    };
    // one element of a rule definition; how value is interpreted depends on type
    typedef struct llama_grammar_element {
        enum llama_gretype type;
        uint32_t           value; // Unicode code point or rule ID
    } llama_grammar_element;
    // performance timing information
    struct llama_timings {
        double t_start_ms;
@ -333,6 +372,15 @@ extern "C" {
    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
    LLAMA_API llama_token llama_token_nl();   // next-line
    // Grammar
    //
    LLAMA_API struct llama_grammar * llama_grammar_init(
            const llama_grammar_element ** rules,
                                 size_t    n_rules,
                                 size_t    start_rule_index);
    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
    // Sampling functions
    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@ -367,6 +415,9 @@ extern "C" {
    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
    /// @details Apply constraints from grammar
    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -388,6 +439,9 @@ extern "C" {
    /// @details Randomly selects a token from the candidates based on their probabilities.
    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
    /// @details Accepts the sampled token into the grammar
    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
--- a/scripts/build-info.sh
+++ b/scripts/build-info.sh
@ -16,7 +16,8 @@ fi
 echo "#ifndef BUILD_INFO_H"
 echo "#define BUILD_INFO_H"
 echo ""
-echo "#define BUILD_NUMBER $BUILD_NUMBER"
+echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n'
-echo "#define BUILD_COMMIT \"$BUILD_COMMIT\""
+echo ""
 echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n'
 echo ""
 echo "#endif // BUILD_INFO_H"
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
    }
 }
-struct ggml_tensor * get_random_tensor(
+struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
@ -112,7 +112,55 @@ struct ggml_tensor * get_random_tensor(
    return result;
 }
-struct ggml_tensor * get_random_tensor_int(
// Creates a GGML_TYPE_F16 tensor in ctx0 with ndims dimensions whose extents are
// ne[0..ndims-1], and fills every element with frand()*(fmax - fmin) + fmin
// converted to half precision via ggml_fp32_to_fp16.
//
//   ctx0       - context passed to ggml_new_tensor
//   ndims      - number of dimensions; only 1..4 are handled
//   ne         - extents per dimension, ne[0] is the fastest-varying one in the indexing below
//   fmin, fmax - offset and scale bounds for the generated values
//                (presumably frand() is in [0, 1], giving values in [fmin, fmax];
//                frand is defined outside this view - TODO confirm)
//
// Returns the newly created tensor.
+struct ggml_tensor * get_random_tensor_f16(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
    // Each case walks the dimensions with the outermost loop on the highest dimension
    // and writes through a flat index into result->data.
    // NOTE(review): the flat indexing assumes the new tensor's data is densely packed - confirm.
    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
                        }
                    }
                }
            }
            break;
        default:
            // ndims outside 1..4 is a caller error; if assertions are compiled out,
            // execution falls through and the tensor is returned without being filled here.
            assert(false);
    };
    return result;
 }
 struct ggml_tensor * get_random_tensor_i32(
        struct ggml_context * ctx0,
        int ndims,
        int64_t ne[],
@ -160,23 +208,6 @@ struct ggml_tensor * get_random_tensor_int(
    return result;
 }
 // Returns element idx of t as a float, using idx as a flat index into t->data.
 // Only GGML_TYPE_F32 and GGML_TYPE_I32 tensors are handled; any other type
 // trips the assert, and INFINITY is returned if the assert is compiled out.
 float get_element(const struct ggml_tensor * t, int idx) {
    if (t->type == GGML_TYPE_F32) {
        return ((float *)t->data)[idx];
    }
    if (t->type == GGML_TYPE_I32) {
        // the int32 value is converted to float by the return
        return ((int32_t *)t->data)[idx];
    }
    assert(false);
    return INFINITY;
 }
 // Stores value at flat index idx of t, treating t->data as an array of float.
 // NOTE(review): unlike get_element, there is no check of t->type here, so this
 // looks correct only for GGML_TYPE_F32 tensors - confirm callers never pass other types.
 void set_element(struct ggml_tensor * t, int idx, float value) {
    ((float *)t->data)[idx] = value;
 }
 void print_elements(const char* label, const struct ggml_tensor * t) {
    if (!t) {
        printf("%s: %s = null\n", __func__, label);
@ -186,7 +217,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
    printf("%s: %s = [", __func__, label);
    for (int k = 0; k < nelements; ++k) {
        if (k > 0) { printf(", "); }
-        printf("%.5f", get_element(t, k));
+        printf("%.5f", ggml_get_f32_1d(t, k));
    }
    printf("] shape: [");
    for (int k = 0; k < t->n_dims; ++k) {
@ -237,23 +268,23 @@ bool check_gradient(
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // compute gradient using finite differences
-            const float x0 = get_element(x[i], k);
+            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
-            set_element(x[i], k, xp);
+            ggml_set_f32_1d(x[i], k, xp);
            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
            const float f0 = ggml_get_f32_1d(f, 0);
-            set_element(x[i], k, xm);
+            ggml_set_f32_1d(x[i], k, xm);
            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
            const float f1 = ggml_get_f32_1d(f, 0);
            const float g0 = (f0 - f1)/(2.0f*eps);
-            set_element(x[i], k, x0);
+            ggml_set_f32_1d(x[i], k, x0);
            // compute gradient using backward graph
            ggml_graph_reset  (&gf);
@ -261,7 +292,7 @@ bool check_gradient(
            ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
-            const float g1 = get_element(x[i]->grad, k);
+            const float g1 = ggml_get_f32_1d(x[i]->grad, k);
            const float error_abs = fabsf(g0 - g1);
            const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
@ -392,19 +423,35 @@ int main(int argc, const char ** argv) {
        struct ggml_tensor * x[MAX_NARGS];
-        // add
+        // add f32
        {
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
-                check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
+                check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
            }
        }
        // add f16
        {
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
                check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
            }
        }
@ -414,7 +461,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -430,7 +477,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -446,7 +493,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -462,7 +509,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -478,7 +525,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -494,7 +541,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -510,7 +557,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -527,7 +574,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -537,6 +584,40 @@ int main(int argc, const char ** argv) {
            }
        }
        // mean, not yet fully implemented
        if(0)
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
                check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // argmax
        if (0)
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
                check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // repeat
        {
            int64_t ne2[4];
@ -549,15 +630,36 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
                check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }
        // repeat back
        {
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] = ne[0] * ne2[0];
            ne2[1] = ne[1] * ne2[1];
            ne2[2] = 1;
            ne2[3] = 1;
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
                check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
            }
        }
        // abs (finite differences do not work)
@ -566,7 +668,7 @@ int main(int argc, const char ** argv) {
        //    for (int ndims = 1; ndims <= 2; ++ndims) {
        //        for (int i = 0; i < nargs; ++i) {
-        //            x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+        //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
        //            ggml_set_param(ctx0, x[i]);
        //        }
@ -576,17 +678,82 @@ int main(int argc, const char ** argv) {
        //    }
        //}
        // sgn
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
                check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // neg
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
                check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // step
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
                check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // tanh, not yet fully implemented
        if(0)
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
                check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // mul_mat
        {
            const int nargs = 2;
            for (int ndims = 2; ndims <= 2; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                {
                    int64_t ne2[4];
                    get_random_dims(ne2, 4);
                    ne2[0] = ne[0];
-                    x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                }
                ggml_set_param(ctx0, x[0]);
@ -602,13 +769,63 @@ int main(int argc, const char ** argv) {
            }
        }
        // elu, not yet fully implemented
        if(0)
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
                check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // relu
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
                check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
        // gelu, not yet fully implemented
        if(0)
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
                check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
            }
        }
        // silu
        {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
@ -629,11 +846,11 @@ int main(int argc, const char ** argv) {
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
-                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0]));
+                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
            }
@ -647,8 +864,8 @@ int main(int argc, const char ** argv) {
            ne2[0] = 1;
            for (int ndims = 1; ndims <= 2; ++ndims) {
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                ggml_set_param(ctx0, x[1]);
@ -659,20 +876,37 @@ int main(int argc, const char ** argv) {
            }
        }
-        // cpy
+        // cpy f32
        {
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
-                    x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
-                check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
        // cpy f16
        {
            const int nargs = 2;
            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
            }
        }
@ -689,8 +923,8 @@ int main(int argc, const char ** argv) {
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
-                x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
@ -712,8 +946,8 @@ int main(int argc, const char ** argv) {
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
@ -729,7 +963,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 1);
@ -737,7 +971,7 @@ int main(int argc, const char ** argv) {
                    get_random_dims(ne2, 1);
                }
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@ -758,7 +992,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 2);
@ -766,7 +1000,7 @@ int main(int argc, const char ** argv) {
                    get_random_dims(ne2, 2);
                }
-                x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -790,7 +1024,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 3);
@ -798,7 +1032,7 @@ int main(int argc, const char ** argv) {
                    get_random_dims(ne2, 3);
                }
-                x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -824,7 +1058,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 4);
@ -832,7 +1066,7 @@ int main(int argc, const char ** argv) {
                    get_random_dims(ne2, 4);
                }
-                x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -858,7 +1092,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 1);
@ -866,7 +1100,7 @@ int main(int argc, const char ** argv) {
                    get_random_dims(ne2, 1);
                }
-                x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@ -887,7 +1121,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                get_random_dims(ne2, 2);
@ -895,7 +1129,7 @@ int main(int argc, const char ** argv) {
                    get_random_dims(ne2, 2);
                }
-                x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -915,7 +1149,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
@ -941,7 +1175,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
@ -971,7 +1205,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
@ -1010,7 +1244,7 @@ int main(int argc, const char ** argv) {
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
-                x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
@ -1043,7 +1277,7 @@ int main(int argc, const char ** argv) {
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
-                x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
@ -1060,8 +1294,8 @@ int main(int argc, const char ** argv) {
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;
-            x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-            x[1] = get_random_tensor_int(ctx0, 1, ne3, 0, ne2[1]);
+            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
            ggml_set_param(ctx0, x[0]);
@ -1075,7 +1309,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            const int ndims = 2;
-            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);
            int n_past = irand(ne[0]);
@ -1090,7 +1324,7 @@ int main(int argc, const char ** argv) {
            const int nargs = 1;
            const int ndims = 2;
-            x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f);
+            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);
            int n_past = irand(ne[0]);
@ -1108,7 +1342,7 @@ int main(int argc, const char ** argv) {
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
@ -1125,8 +1359,8 @@ int main(int argc, const char ** argv) {
            get_random_dims(ne2, 4);
            for (int ndims = 1; ndims <= 3; ++ndims) {
-                x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
+                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
@ -1136,7 +1370,7 @@ int main(int argc, const char ** argv) {
            }
        }
-        // rope
+        // rope f32
        {
            const int nargs = 1;
@ -1148,7 +1382,7 @@ int main(int argc, const char ** argv) {
            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
-                        x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
+                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                        ggml_set_param(ctx0, x[0]);
@ -1163,14 +1397,48 @@ int main(int argc, const char ** argv) {
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
-                        GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
+                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
-                        check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
+                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
                    }
                }
            }
        }
-        // flash_attn
+        // rope f16
        {
            const int nargs = 1;
            int64_t ne2[4];
            get_random_dims(ne2, 4);
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];
            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
                        ggml_set_param(ctx0, x[0]);
                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }
                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                    }
                }
            }
        }
        // flash_attn f32
        {
            const int nargs = 3;
@ -1196,16 +1464,57 @@ int main(int argc, const char ** argv) {
                        nek[3] = 1;
                        nev[3] = 1;
                    }
-                    x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
+                    x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
+                    x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
+                    x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);
                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-                    check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
+                    check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
                }
            }
        }
        // flash_attn f16, not yet fully implemented
        // The whole section is compiled but never executed because of the constant-false guard below.
        if(0)
        {
            // three parameter tensors: x[0], x[1], x[2] (built from neq, nek, nev respectively)
            const int nargs = 3;
            int64_t ne2[4];
            // presumably fills ne2 with 4 random dimension sizes -- helper not visible here, TODO confirm
            get_random_dims(ne2, 4);
            int64_t D = ne2[0];
            int64_t N = ne2[1];
            // M is N plus ne2[2]
            int64_t M = ne2[2] + N;
            int64_t B = ne2[3];
            for (int masked = 0; masked <= 1; ++masked) {
                for (int ndims = 2; ndims <= 4; ++ndims) {
                    // NOTE(review): the 4th dimension is taken from `ne`, an array declared outside
                    // this section, not from the local `ne2` -- confirm this is intended.
                    int64_t neq[4] = { D, N, B, ne[3] };
                    int64_t nek[4] = { D, M, B, ne[3] };
                    int64_t nev[4] = { M, D, B, ne[3] };
                    // collapse the dimensions beyond ndims to size 1
                    if (ndims == 2) {
                        neq[2] = 1; neq[3] = 1;
                        nek[2] = 1; nek[3] = 1;
                        nev[2] = 1; nev[3] = 1;
                    } else if (ndims == 3) {
                        neq[3] = 1;
                        nek[3] = 1;
                        nev[3] = 1;
                    }
                    // the last two arguments are presumably the random value range -- TODO confirm
                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
                    ggml_set_param(ctx0, x[0]);
                    ggml_set_param(ctx0, x[1]);
                    ggml_set_param(ctx0, x[2]);
                    // scalar objective: sum over all elements of the flash attention output.
                    // NOTE(review): the last argument is true when masked == 0 and false when masked == 1,
                    // i.e. inverted relative to the loop variable's name -- confirm against ggml_flash_attn.
                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
                    // same numeric arguments as the "flash_attn f32" check
                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
                }
            }
        }
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@ -125,9 +125,9 @@ int main(void) {
    };
    struct ggml_context * ctx = ggml_init(params);
-    int64_t ne1[4] = {4, 1024, 1, 1};
+    int64_t ne1[4] = {4, 128, 1, 1};
-    int64_t ne2[4] = {4, 2048, 1, 1};;
+    int64_t ne2[4] = {4, 256, 1, 1};;
-    int64_t ne3[4] = {1024, 2048, 1, 1};
+    int64_t ne3[4] = {128, 256, 1, 1};
    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);